X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fwdr.py;h=6b83a2a04264e14883c09e98ef59895a520e792b;hb=e1e0a10c567e8457bf83f6b54e65963447e17a8f;hp=1af1e996d8de0e4d9a62bf61f45a13e2a95b7365;hpb=949fc42e009aed5414caad280d0dc551ffcd9c14;p=youtube-dl diff --git a/youtube_dl/extractor/wdr.py b/youtube_dl/extractor/wdr.py index 1af1e996d..6b83a2a04 100644 --- a/youtube_dl/extractor/wdr.py +++ b/youtube_dl/extractor/wdr.py @@ -6,22 +6,24 @@ import re from .common import InfoExtractor from ..utils import ( determine_ext, + ExtractorError, js_to_json, strip_jsonp, unified_strdate, - ExtractorError, + update_url_query, + urlhandle_detect_ext, ) class WDRIE(InfoExtractor): _CURRENT_MAUS_URL = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/){1,2}[^/?#]+\.php5' - _PAGE_REGEX = r'/mediathek/(?P[^/]+)/(?P[^/]+)/(?P.+)\.html' + _PAGE_REGEX = r'/(?:mediathek/)?[^/]+/(?P[^/]+)/(?P.+)\.html' _VALID_URL = r'(?Phttps?://(?:www\d\.)?wdr\d?\.de)' + _PAGE_REGEX + '|' + _CURRENT_MAUS_URL _TESTS = [ { 'url': 'http://www1.wdr.de/mediathek/video/sendungen/doku-am-freitag/video-geheimnis-aachener-dom-100.html', - 'md5': 'e58c39c3e30077141d258bf588700a7b', + # HDS download, MD5 is unstable 'info_dict': { 'id': 'mdb-1058683', 'ext': 'flv', @@ -32,10 +34,10 @@ class WDRIE(InfoExtractor): 'description': 'md5:87be8ff14d8dfd7a7ee46f0299b52318', 'is_live': False, 'subtitles': {'de': [{ - 'url': 'http://ondemand-ww.wdr.de/medp/fsk0/105/1058683/1058683_12220974.xml' + 'url': 'http://ondemand-ww.wdr.de/medp/fsk0/105/1058683/1058683_12220974.xml', + 'ext': 'ttml', }]}, }, - 'skip': 'Page Not Found', }, { 'url': 'http://www1.wdr.de/mediathek/audio/wdr3/wdr3-gespraech-am-samstag/audio-schriftstellerin-juli-zeh-100.html', @@ -51,7 +53,6 @@ class WDRIE(InfoExtractor): 'is_live': False, 'subtitles': {} }, - 'skip': 'Page Not Found', }, { 'url': 'http://www1.wdr.de/mediathek/video/live/index.html', @@ -72,7 +73,7 @@ class WDRIE(InfoExtractor): }, { 'url': 'http://www1.wdr.de/mediathek/video/sendungen/aktuelle-stunde/aktuelle-stunde-120.html', - 'playlist_mincount': 10, + 'playlist_mincount': 8, 'info_dict': { 'id': 'aktuelle-stunde/aktuelle-stunde-120', }, @@ -90,7 +91,7 @@ class WDRIE(InfoExtractor): }, { 'url': 'http://www.wdrmaus.de/sachgeschichten/sachgeschichten/achterbahn.php5', - 'md5': 'ca365705551e4bd5217490f3b0591290', + # HDS download, MD5 is unstable 'info_dict': { 'id': 'mdb-186083', 'ext': 'flv', @@ -98,10 +99,18 @@ class WDRIE(InfoExtractor): 'title': 'Sachgeschichte - Achterbahn ', 'description': '- Die Sendung mit der Maus -', }, - 'params': { - 'skip_download': True, # the file has different versions :( - }, }, + { + 'url': 'http://www1.wdr.de/radio/player/radioplayer116~_layout-popupVersion.html', + # Live stream, MD5 unstable + 'info_dict': { + 'id': 'mdb-869971', + 'ext': 'flv', + 'title': 'Funkhaus Europa Livestream', + 'description': 'md5:2309992a6716c347891c045be50992e4', + 'upload_date': '20160101', + }, + } ] def _real_extract(self, url): @@ -112,16 +121,17 @@ class WDRIE(InfoExtractor): webpage = self._download_webpage(url, display_id) # for wdr.de the data-extension is in a tag with the class "mediaLink" + # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn" # for wdrmaus its in a link to the page in a multiline "videoLink"-tag json_metadata = self._html_search_regex( - r'class=(?:"mediaLink\b[^"]*"[^>]+|"videoLink\b[^"]*"[\s]*>\n[^\n]*)data-extension="([^"]+)"', + r'class=(?:"(?:mediaLink|wdrrPlayerPlayBtn)\b[^"]*"[^>]+|"videoLink\b[^"]*"[\s]*>\n[^\n]*)data-extension="([^"]+)"', webpage, 'media link', default=None, flags=re.MULTILINE) if not json_metadata: entries = [ self.url_result(page_url + href[0], 'WDR') for href in re.findall( - r']+data-extension=' % self._PAGE_REGEX, webpage) ] @@ -143,35 +153,46 @@ class WDRIE(InfoExtractor): formats = [] # check if the metadata contains a direct URL to a file - metadata_media_alt = metadata_media_resource.get('alt') - if metadata_media_alt: - for tag_name in ['videoURL', 'audioURL']: - if tag_name in metadata_media_alt: - alt_url = metadata_media_alt[tag_name] - if determine_ext(alt_url) == 'm3u8': - m3u_fmt = self._extract_m3u8_formats( - alt_url, display_id, 'mp4', 'm3u8_native', - m3u8_id='hls') - formats.extend(m3u_fmt) - else: - formats.append({ - 'url': alt_url - }) - - # check if there are flash-streams for this video - if 'dflt' in metadata_media_resource and 'videoURL' in metadata_media_resource['dflt']: - video_url = metadata_media_resource['dflt']['videoURL'] - if video_url.endswith('.f4m'): - full_video_url = video_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18' - formats.extend(self._extract_f4m_formats(full_video_url, display_id, f4m_id='hds', fatal=False)) - elif video_url.endswith('.smil'): - formats.extend(self._extract_smil_formats(video_url, 'stream', fatal=False)) + for kind, media_resource in metadata_media_resource.items(): + if kind not in ('dflt', 'alt'): + continue + + for tag_name, medium_url in media_resource.items(): + if tag_name not in ('videoURL', 'audioURL'): + continue + + ext = determine_ext(medium_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + medium_url, display_id, 'mp4', 'm3u8_native', + m3u8_id='hls')) + elif ext == 'f4m': + manifest_url = update_url_query( + medium_url, {'hdcore': '3.2.0', 'plugin': 'aasp-3.2.0.77.18'}) + formats.extend(self._extract_f4m_formats( + manifest_url, display_id, f4m_id='hds', fatal=False)) + elif ext == 'smil': + formats.extend(self._extract_smil_formats( + medium_url, 'stream', fatal=False)) + else: + a_format = { + 'url': medium_url + } + if ext == 'unknown_video': + urlh = self._request_webpage( + medium_url, display_id, note='Determining extension') + ext = urlhandle_detect_ext(urlh) + a_format['ext'] = ext + formats.append(a_format) + + self._sort_formats(formats) subtitles = {} caption_url = metadata_media_resource.get('captionURL') if caption_url: subtitles['de'] = [{ - 'url': caption_url + 'url': caption_url, + 'ext': 'ttml', }] title = metadata_tracker_data.get('trackerClipTitle') @@ -188,8 +209,6 @@ class WDRIE(InfoExtractor): if upload_date: upload_date = unified_strdate(upload_date) - self._sort_formats(formats) - return { 'id': metadata_tracker_data.get('trackerClipId', display_id), 'display_id': display_id,