X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fzdf.py;h=9a3331a693b446203603acc73fb64800d77bbd3b;hb=9f0ee2a3883ec6f6fdccba90085cb925aaa2f617;hp=829f002cf02f9c908a5057ab3c6b20f520e2f2ea;hpb=61224dbcdd46052b264e422cc45da907fb06fd42;p=youtube-dl diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 829f002cf..9a3331a69 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -1,111 +1,186 @@ # coding: utf-8 +from __future__ import unicode_literals +import functools import re from .common import InfoExtractor from ..utils import ( int_or_none, unified_strdate, + OnDemandPagedList, + xpath_text, ) +def extract_from_xml_url(ie, video_id, xml_url): + doc = ie._download_xml( + xml_url, video_id, + note='Downloading video info', + errnote='Failed to download video info') + + title = doc.find('.//information/title').text + description = xpath_text(doc, './/information/detail', 'description') + duration = int_or_none(xpath_text(doc, './/details/lengthSec', 'duration')) + uploader = xpath_text(doc, './/details/originChannelTitle', 'uploader') + uploader_id = xpath_text(doc, './/details/originChannelId', 'uploader id') + upload_date = unified_strdate(xpath_text(doc, './/details/airtime', 'upload date')) + + def xml_to_format(fnode): + video_url = fnode.find('url').text + is_available = 'http://www.metafilegenerator' not in video_url + + format_id = fnode.attrib['basetype'] + format_m = re.match(r'''(?x) + (?P[^_]+)_(?P[^_]+)_(?P[^_]+)_ + (?P[^_]+)_(?P[^_]+)_(?P[^_]+) + ''', format_id) + + ext = format_m.group('container') + proto = format_m.group('proto').lower() + + quality = xpath_text(fnode, './quality', 'quality') + abr = int_or_none(xpath_text(fnode, './audioBitrate', 'abr'), 1000) + vbr = int_or_none(xpath_text(fnode, './videoBitrate', 'vbr'), 1000) + + width = int_or_none(xpath_text(fnode, './width', 'width')) + height = int_or_none(xpath_text(fnode, './height', 'height')) + + filesize = int_or_none(xpath_text(fnode, './filesize', 'filesize')) + + format_note = '' + if not format_note: + format_note = None + + return { + 'format_id': format_id + '-' + quality, + 'url': video_url, + 'ext': ext, + 'acodec': format_m.group('acodec'), + 'vcodec': format_m.group('vcodec'), + 'abr': abr, + 'vbr': vbr, + 'width': width, + 'height': height, + 'filesize': filesize, + 'format_note': format_note, + 'protocol': proto, + '_available': is_available, + } + + def xml_to_thumbnails(fnode): + thumbnails = [] + for node in fnode: + thumbnail_url = node.text + if not thumbnail_url: + continue + thumbnail = { + 'url': thumbnail_url, + } + if 'key' in node.attrib: + m = re.match('^([0-9]+)x([0-9]+)$', node.attrib['key']) + if m: + thumbnail['width'] = int(m.group(1)) + thumbnail['height'] = int(m.group(2)) + thumbnails.append(thumbnail) + return thumbnails + + thumbnails = xml_to_thumbnails(doc.findall('.//teaserimages/teaserimage')) + + format_nodes = doc.findall('.//formitaeten/formitaet') + formats = list(filter( + lambda f: f['_available'], + map(xml_to_format, format_nodes))) + ie._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'thumbnails': thumbnails, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'upload_date': upload_date, + 'formats': formats, + } + + class ZDFIE(InfoExtractor): - _VALID_URL = r'^https?://www\.zdf\.de/ZDFmediathek(?P#)?/(.*beitrag/(?:video/)?)(?P[0-9]+)(?:/[^/?]+)?(?:\?.*)?' + _VALID_URL = r'(?:zdf:|zdf:video:|https?://www\.zdf\.de/ZDFmediathek(?:#)?/(.*beitrag/(?:video/)?))(?P[0-9]+)(?:/[^/?]+)?(?:\?.*)?' _TEST = { - u"url": u"http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt", - u"file": u"2037704.webm", - u"info_dict": { - u"upload_date": u"20131127", - u"description": u"Union und SPD haben sich auf einen Koalitionsvertrag geeinigt. Aber was bedeutet das für die Bürger? Sehen Sie hierzu das ZDFspezial \"Ende des Machtpokers - Große Koalition für Deutschland\".", - u"uploader": u"spezial", - u"title": u"ZDFspezial - Ende des Machtpokers" + 'url': 'http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt', + 'info_dict': { + 'id': '2037704', + 'ext': 'webm', + 'title': 'ZDFspezial - Ende des Machtpokers', + 'description': 'Union und SPD haben sich auf einen Koalitionsvertrag geeinigt. Aber was bedeutet das für die Bürger? Sehen Sie hierzu das ZDFspezial "Ende des Machtpokers - Große Koalition für Deutschland".', + 'duration': 1022, + 'uploader': 'spezial', + 'uploader_id': '225948', + 'upload_date': '20131127', }, - u"skip": u"Videos on ZDF.de are depublicised in short order", + 'skip': 'Videos on ZDF.de are depublicised in short order', } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('video_id') + video_id = self._match_id(url) + xml_url = 'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id + return extract_from_xml_url(self, video_id, xml_url) + - xml_url = u'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id +class ZDFChannelIE(InfoExtractor): + _VALID_URL = r'(?:zdf:topic:|https?://www\.zdf\.de/ZDFmediathek(?:#)?/.*kanaluebersicht/(?:[^/]+/)?)(?P[0-9]+)' + _TESTS = [{ + 'url': 'http://www.zdf.de/ZDFmediathek#/kanaluebersicht/1586442/sendung/Titanic', + 'info_dict': { + 'id': '1586442', + }, + 'playlist_count': 3, + }, { + 'url': 'http://www.zdf.de/ZDFmediathek/kanaluebersicht/aktuellste/332', + 'only_matching': True, + }, { + 'url': 'http://www.zdf.de/ZDFmediathek/kanaluebersicht/meist-gesehen/332', + 'only_matching': True, + }, { + 'url': 'http://www.zdf.de/ZDFmediathek/kanaluebersicht/_/1798716?bc=nrt;nrm?flash=off', + 'only_matching': True, + }] + _PAGE_SIZE = 50 + + def _fetch_page(self, channel_id, page): + offset = page * self._PAGE_SIZE + xml_url = ( + 'http://www.zdf.de/ZDFmediathek/xmlservice/web/aktuellste?ak=web&offset=%d&maxLength=%d&id=%s' + % (offset, self._PAGE_SIZE, channel_id)) doc = self._download_xml( - xml_url, video_id, - note=u'Downloading video info', - errnote=u'Failed to download video info') + xml_url, channel_id, + note='Downloading channel info', + errnote='Failed to download channel info') title = doc.find('.//information/title').text description = doc.find('.//information/detail').text - uploader_node = doc.find('.//details/originChannelTitle') - uploader = None if uploader_node is None else uploader_node.text - duration_str = doc.find('.//details/length').text - duration_m = re.match(r'''(?x)^ - (?P[0-9]{2}) - :(?P[0-9]{2}) - :(?P[0-9]{2}) - (?:\.(?P[0-9]+)?) - ''', duration_str) - duration = ( - ( - (int(duration_m.group('hours')) * 60 * 60) + - (int(duration_m.group('minutes')) * 60) + - int(duration_m.group('seconds')) - ) - if duration_m - else None - ) - upload_date = unified_strdate(doc.find('.//details/airtime').text) - - def xml_to_format(fnode): - video_url = fnode.find('url').text - is_available = u'http://www.metafilegenerator' not in video_url - - format_id = fnode.attrib['basetype'] - format_m = re.match(r'''(?x) - (?P[^_]+)_(?P[^_]+)_(?P[^_]+)_ - (?P[^_]+)_(?P[^_]+)_(?P[^_]+) - ''', format_id) - - ext = format_m.group('container') - proto = format_m.group('proto').lower() - - quality = fnode.find('./quality').text - abr = int(fnode.find('./audioBitrate').text) // 1000 - vbr = int(fnode.find('./videoBitrate').text) // 1000 - - format_note = u'' - if not format_note: - format_note = None - - return { - 'format_id': format_id + u'-' + quality, - 'url': video_url, - 'ext': ext, - 'acodec': format_m.group('acodec'), - 'vcodec': format_m.group('vcodec'), - 'abr': abr, - 'vbr': vbr, - 'width': int_or_none(fnode.find('./width').text), - 'height': int_or_none(fnode.find('./height').text), - 'filesize': int_or_none(fnode.find('./filesize').text), - 'format_note': format_note, - 'protocol': proto, - '_available': is_available, + for asset in doc.findall('.//teasers/teaser'): + a_type = asset.find('./type').text + a_id = asset.find('./details/assetId').text + if a_type not in ('video', 'topic'): + continue + yield { + '_type': 'url', + 'playlist_title': title, + 'playlist_description': description, + 'url': 'zdf:%s:%s' % (a_type, a_id), } - format_nodes = doc.findall('.//formitaeten/formitaet') - formats = list(filter( - lambda f: f['_available'], - map(xml_to_format, format_nodes))) - - self._sort_formats(formats) + def _real_extract(self, url): + channel_id = self._match_id(url) + entries = OnDemandPagedList( + functools.partial(self._fetch_page, channel_id), self._PAGE_SIZE) return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'description': description, - 'uploader': uploader, - 'duration': duration, - 'upload_date': upload_date, + '_type': 'playlist', + 'id': channel_id, + 'entries': entries, }