X-Git-Url: http://git.bitcoin.ninja/index.cgi?p=youtube-dl;a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fwdr.py;h=f7e6360a33e8b6d2cc3096232bfa1d2c458ab3c7;hp=73a343c69bc0fe94cdf767e85a5444d2d91f0bd6;hb=dcdb292fddc82ae11f4c0b647815a45c88a6b6d5;hpb=37f972954da0d0f1f0c5e97da8357c4baf687ee6 diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 73a343c69..f7e6360a3 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -1,31 +1,109 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urlparse, -) from ..utils import ( + determine_ext, + ExtractorError, + js_to_json, strip_jsonp, unified_strdate, - ExtractorError, + update_url_query, + urlhandle_detect_ext, ) -class WDRIE(InfoExtractor): - _CURRENT_MAUS_URL = r'https?://www.wdrmaus.de/aktuelle-sendung/(wdr|index).php5' - _PAGE_REGEX = r'/mediathek/(?P[^/]+)/(?P[^/]+)/(?P.+)\.html' - _VALID_URL = r'(?Phttps?://(?:www\d\.)?wdr\d?\.de)' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL +class WDRBaseIE(InfoExtractor): + def _extract_wdr_video(self, webpage, display_id): + # for wdr.de the data-extension is in a tag with the class "mediaLink" + # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn" + # for wdrmaus its in a link to the page in a multiline "videoLink"-tag + json_metadata = self._html_search_regex( + r'class=(?:"(?:mediaLink|wdrrPlayerPlayBtn)\b[^"]*"[^>]+|"videoLink\b[^"]*"[\s]*>\n[^\n]*)data-extension="([^"]+)"', + webpage, 'media link', default=None, flags=re.MULTILINE) + + if not json_metadata: + return + + media_link_obj = self._parse_json(json_metadata, display_id, + transform_source=js_to_json) + jsonp_url = media_link_obj['mediaObj']['url'] + + metadata = self._download_json( + jsonp_url, 'metadata', transform_source=strip_jsonp) + + metadata_tracker_data = metadata['trackerData'] + metadata_media_resource = metadata['mediaResource'] + + formats = [] + + # check if the metadata contains a direct URL to a file + for kind, media_resource in metadata_media_resource.items(): + if kind not in ('dflt', 'alt'): + continue + + for tag_name, medium_url in media_resource.items(): + if tag_name not in ('videoURL', 'audioURL'): + continue + + ext = determine_ext(medium_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + medium_url, display_id, 'mp4', 'm3u8_native', + m3u8_id='hls')) + elif ext == 'f4m': + manifest_url = update_url_query( + medium_url, {'hdcore': '3.2.0', 'plugin': 'aasp-3.2.0.77.18'}) + formats.extend(self._extract_f4m_formats( + manifest_url, display_id, f4m_id='hds', fatal=False)) + elif ext == 'smil': + formats.extend(self._extract_smil_formats( + medium_url, 'stream', fatal=False)) + else: + a_format = { + 'url': medium_url + } + if ext == 'unknown_video': + urlh = self._request_webpage( + medium_url, display_id, note='Determining extension') + ext = urlhandle_detect_ext(urlh) + a_format['ext'] = ext + formats.append(a_format) + + self._sort_formats(formats) + + subtitles = {} + caption_url = metadata_media_resource.get('captionURL') + if caption_url: + subtitles['de'] = [{ + 'url': caption_url, + 'ext': 'ttml', + }] + + title = metadata_tracker_data['trackerClipTitle'] + + return { + 'id': metadata_tracker_data.get('trackerClipId', display_id), + 'display_id': display_id, + 'title': title, + 'alt_title': metadata_tracker_data.get('trackerClipSubcategory'), + 'formats': formats, + 'subtitles': subtitles, + 'upload_date': unified_strdate(metadata_tracker_data.get('trackerClipAirTime')), + } - _JS_URL_REGEX = r'(https?://deviceids-medp.wdr.de/ondemand/\d+/\d+\.js)' + +class WDRIE(WDRBaseIE): + _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5' + _PAGE_REGEX = r'/(?:mediathek/)?[^/]+/(?P[^/]+)/(?P.+)\.html' + _VALID_URL = r'(?Phttps?://(?:www\d\.)?wdr\d?\.de)' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL _TESTS = [ { 'url': 'http://www1.wdr.de/mediathek/video/sendungen/doku-am-freitag/video-geheimnis-aachener-dom-100.html', - 'md5': 'e58c39c3e30077141d258bf588700a7b', + # HDS download, MD5 is unstable 'info_dict': { 'id': 'mdb-1058683', 'ext': 'flv', @@ -36,10 +114,10 @@ class WDRIE(InfoExtractor): 'description': 'md5:87be8ff14d8dfd7a7ee46f0299b52318', 'is_live': False, 'subtitles': {'de': [{ - 'url': 'http://ondemand-ww.wdr.de/medp/fsk0/105/1058683/1058683_12220974.xml' + 'url': 'http://ondemand-ww.wdr.de/medp/fsk0/105/1058683/1058683_12220974.xml', + 'ext': 'ttml', }]}, }, - 'skip': 'Page Not Found', }, { 'url': 'http://www1.wdr.de/mediathek/audio/wdr3/wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100.html', @@ -55,13 +133,12 @@ class WDRIE(InfoExtractor): 'is_live': False, 'subtitles': {} }, - 'skip': 'Page Not Found', }, { 'url': 'http://www1.wdr.de/mediathek/video/live/index.html', 'info_dict': { 'id': 'mdb-103364', - 'ext': 'flv', + 'ext': 'mp4', 'display_id': 'index', 'title': r're:^WDR Fernsehen im Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'alt_title': 'WDR Fernsehen Live', @@ -69,11 +146,14 @@ class WDRIE(InfoExtractor): 'description': 'md5:ae2ff888510623bf8d4b115f95a9b7c9', 'is_live': True, 'subtitles': {} - } + }, + 'params': { + 'skip_download': True, # m3u8 download + }, }, { 'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html', - 'playlist_mincount': 10, + 'playlist_mincount': 8, 'info_dict': { 'id': 'aktuelle-stunde/aktuelle-stunde-120', }, @@ -89,6 +169,28 @@ class WDRIE(InfoExtractor): }, 'skip': 'The id changes from week to week because of the new episode' }, + { + 'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/achterbahn.php5', + 'md5': '803138901f6368ee497b4d195bb164f2', + 'info_dict': { + 'id': 'mdb-186083', + 'ext': 'mp4', + 'upload_date': '20130919', + 'title': 'Sachgeschichte - Achterbahn ', + 'description': '- Die Sendung mit der Maus -', + }, + }, + { + 'url': 'http://www1.wdr.de/radio/player/radioplayer116~_layout-popupVersion.html', + # Live stream, MD5 unstable + 'info_dict': { + 'id': 'mdb-869971', + 'ext': 'flv', + 'title': 'Funkhaus Europa Livestream', + 'description': 'md5:2309992a6716c347891c045be50992e4', + 'upload_date': '20160101', + }, + } ] def _real_extract(self, url): @@ -98,13 +200,13 @@ class WDRIE(InfoExtractor): display_id = mobj.group('display_id') webpage = self._download_webpage(url, display_id) - js_url = self._search_regex(self._JS_URL_REGEX, webpage, 'js_url', default=None) + info_dict = self._extract_wdr_video(webpage, display_id) - if not js_url: + if not info_dict: entries = [ self.url_result(page_url + href[0], 'WDR') for href in re.findall( - r']+data-extension=' % self._PAGE_REGEX, webpage) ] @@ -113,66 +215,22 @@ class WDRIE(InfoExtractor): raise ExtractorError('No downloadable streams found', expected=True) - metadata = self._download_json( - js_url, 'metadata', transform_source=strip_jsonp) - - metadata_tracker_data = metadata['trackerData'] - metadata_media_resource = metadata['mediaResource'] - - formats = [] - - # check if the metadata contains a direct URL to a file - metadata_media_alt = metadata_media_resource.get('alt') - if metadata_media_alt: - for tag_name in ['videoURL', 'audioURL']: - if tag_name in metadata_media_alt: - formats.append({ - 'url': metadata_media_alt[tag_name] - }) - - # check if there are flash-streams for this video - if 'dflt' in metadata_media_resource and 'videoURL' in metadata_media_resource['dflt']: - video_url = metadata_media_resource['dflt']['videoURL'] - if video_url.endswith('.f4m'): - full_video_url = video_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18' - formats.extend(self._extract_f4m_formats(full_video_url, display_id, f4m_id='hds', fatal=False)) - elif video_url.endswith('.smil'): - formats.extend(self._extract_smil_formats(video_url, 'stream', fatal=False)) - - subtitles = {} - caption_url = metadata_media_resource.get('captionURL') - if caption_url: - subtitles['de'] = [{ - 'url': caption_url - }] - - title = metadata_tracker_data.get('trackerClipTitle') is_live = url_type == 'live' if is_live: - title = self._live_title(title) - upload_date = None - elif 'trackerClipAirTime' in metadata_tracker_data: - upload_date = metadata_tracker_data['trackerClipAirTime'] - else: - upload_date = self._html_search_meta('DC.Date', webpage, 'upload date') - - if upload_date: - upload_date = unified_strdate(upload_date) - - self._sort_formats(formats) + info_dict.update({ + 'title': self._live_title(info_dict['title']), + 'upload_date': None, + }) + elif 'upload_date' not in info_dict: + info_dict['upload_date'] = unified_strdate(self._html_search_meta('DC.Date', webpage, 'upload date')) - return { - 'id': metadata_tracker_data.get('trackerClipId', display_id), - 'display_id': display_id, - 'title': title, - 'alt_title': metadata_tracker_data.get('trackerClipSubcategory'), - 'formats': formats, - 'upload_date': upload_date, + info_dict.update({ 'description': self._html_search_meta('Description', webpage), 'is_live': is_live, - 'subtitles': subtitles, - } + }) + + return info_dict class WDRMobileIE(InfoExtractor): @@ -204,72 +262,3 @@ class WDRMobileIE(InfoExtractor): 'User-Agent': 'mobile', }, } - - -class WDRMausIE(InfoExtractor): - _VALID_URL = 'https?://(?:www\.)?wdrmaus\.de/(?:[^/]+/){,2}(?P[^/?#]+)((?

Sendedatum:\s*([0-9\.]+)

', - webpage, 'air date') - title_str = self._html_search_regex( - r'

(.*?)

', webpage, 'title') - title = '%s - %s' % (title_date, title_str) - upload_date = unified_strdate( - self._html_search_meta('dc.date', webpage)) - - fields = compat_parse_qs(param_code) - video_url = fields['firstVideo'][0] - thumbnail = compat_urlparse.urljoin(url, fields['startPicture'][0]) - - formats = [{ - 'format_id': 'rtmp', - 'url': video_url, - }] - - jscode = self._download_webpage( - 'http://www.wdrmaus.de/codebase/js/extended-medien.min.js', - video_id, fatal=False, - note='Downloading URL translation table', - errnote='Could not download URL translation table') - if jscode: - for m in re.finditer( - r"stream:\s*'dslSrc=(?P[^']+)',\s*download:\s*'(?P
[^']+)'\s*\}", - jscode): - if video_url.startswith(m.group('stream')): - http_url = video_url.replace( - m.group('stream'), m.group('dl')) - formats.append({ - 'format_id': 'http', - 'url': http_url, - }) - break - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - }