From: remitamine Date: Mon, 28 Dec 2015 17:17:12 +0000 (+0100) Subject: Merge pull request #8023 from remitamine/extract-formats X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=commitdiff_plain;h=54537cdfb3da7a64967f07df86042ffca761d937;hp=8d29e47f543152bf91db0167a313e56ea2f132e3;p=youtube-dl Merge pull request #8023 from remitamine/extract-formats [common] simplify the use of _extract_m3u8_formats and _extract_f4m_formats --- diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 165835f63..971047ad4 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -135,7 +135,12 @@ from .dailymotion import ( ) from .daum import DaumIE from .dbtv import DBTVIE -from .dcn import DCNIE +from .dcn import ( + DCNIE, + DCNVideoIE, + DCNLiveIE, + DCNSeasonIE, +) from .dctp import DctpTvIE from .deezer import DeezerPlaylistIE from .democracynow import DemocracynowIE @@ -703,7 +708,13 @@ from .tube8 import Tube8IE from .tubitv import TubiTvIE from .tudou import TudouIE from .tumblr import TumblrIE -from .tunein import TuneInIE +from .tunein import ( + TuneInClipIE, + TuneInStationIE, + TuneInProgramIE, + TuneInTopicIE, + TuneInShortenerIE, +) from .turbo import TurboIE from .tutv import TutvIE from .tv2 import ( diff --git a/youtube_dl/extractor/abc7news.py b/youtube_dl/extractor/abc7news.py index c04949c21..122dc9099 100644 --- a/youtube_dl/extractor/abc7news.py +++ b/youtube_dl/extractor/abc7news.py @@ -44,7 +44,6 @@ class Abc7NewsIE(InfoExtractor): 'contentURL', webpage, 'm3u8 url', fatal=True) formats = self._extract_m3u8_formats(m3u8, display_id, 'mp4') - self._sort_formats(formats) title = self._og_search_title(webpage).strip() description = self._og_search_description(webpage).strip() diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index 7b685d157..b3ee67018 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -58,18 +58,23 @@ class CSpanIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + video_type = None webpage = self._download_webpage(url, video_id) - matches = re.search(r'data-(prog|clip)id=\'([0-9]+)\'', webpage) - if matches: + # We first look for clipid, because clipprog always appears before + patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')] + results = list(filter(None, (re.search(p, webpage) for p in patterns))) + if results: + matches = results[0] video_type, video_id = matches.groups() - if video_type == 'prog': - video_type = 'program' + video_type = 'clip' if video_type == 'id' else 'program' else: senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) if senate_isvp_url: title = self._og_search_title(webpage) surl = smuggle_url(senate_isvp_url, {'force_title': title}) return self.url_result(surl, 'SenateISVP', video_id, title) + if video_type is None or video_id is None: + raise ExtractorError('unable to find video id and type') def get_text_attr(d, attr): return d.get(attr, {}).get('#text') diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index 9737cff14..0d140f12f 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -1,26 +1,89 @@ # coding: utf-8 from __future__ import unicode_literals +import re +import base64 + from .common import InfoExtractor from ..compat import compat_urllib_parse from ..utils import ( int_or_none, parse_iso8601, sanitized_Request, + smuggle_url, + unsmuggle_url, ) class DCNIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/.+|show/\d+/.+?)/(?P\d+)' + _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?show/(?P\d+)/[^/]+(?:/(?P\d+)/(?P\d+))?' + + def _real_extract(self, url): + show_id, video_id, season_id = re.match(self._VALID_URL, url).groups() + if video_id and int(video_id) > 0: + return self.url_result( + 'http://www.dcndigital.ae/media/%s' % video_id, 'DCNVideo') + elif season_id and int(season_id) > 0: + return self.url_result(smuggle_url( + 'http://www.dcndigital.ae/program/season/%s' % season_id, + {'show_id': show_id}), 'DCNSeason') + else: + return self.url_result( + 'http://www.dcndigital.ae/program/%s' % show_id, 'DCNSeason') + + +class DCNBaseIE(InfoExtractor): + def _extract_video_info(self, video_data, video_id, is_live): + title = video_data.get('title_en') or video_data['title_ar'] + img = video_data.get('img') + thumbnail = 'http://admin.mangomolo.com/analytics/%s' % img if img else None + duration = int_or_none(video_data.get('duration')) + description = video_data.get('description_en') or video_data.get('description_ar') + timestamp = parse_iso8601(video_data.get('create_time'), ' ') + + return { + 'id': video_id, + 'title': self._live_title(title) if is_live else title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'is_live': is_live, + } + + def _extract_video_formats(self, webpage, video_id, entry_protocol): + formats = [] + m3u8_url = self._html_search_regex( + r'file\s*:\s*"([^"]+)', webpage, 'm3u8 url', fatal=False) + if m3u8_url: + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol, m3u8_id='hls', fatal=None) + if m3u8_formats: + formats.extend(m3u8_formats) + + rtsp_url = self._search_regex( + r']+href="(rtsp://[^"]+)"', webpage, 'rtsp url', fatal=False) + if rtsp_url: + formats.append({ + 'url': rtsp_url, + 'format_id': 'rtsp', + }) + + self._sort_formats(formats) + return formats + + +class DCNVideoIE(DCNBaseIE): + IE_NAME = 'dcn:video' + _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?(?:video/[^/]+|media|catchup/[^/]+/[^/]+)/(?P\d+)' _TEST = { - 'url': 'http://www.dcndigital.ae/#/show/199074/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375/6887', + 'url': 'http://www.dcndigital.ae/#/video/%D8%B1%D8%AD%D9%84%D8%A9-%D8%A7%D9%84%D8%B9%D9%85%D8%B1-%D8%A7%D9%84%D8%AD%D9%84%D9%82%D8%A9-1/17375', 'info_dict': { 'id': '17375', 'ext': 'mp4', 'title': 'رحلة العمر : الحلقة 1', 'description': 'md5:0156e935d870acb8ef0a66d24070c6d6', - 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 2041, 'timestamp': 1227504126, 'upload_date': '20081124', @@ -37,46 +100,95 @@ class DCNIE(InfoExtractor): request = sanitized_Request( 'http://admin.mangomolo.com/analytics/index.php/plus/video?id=%s' % video_id, headers={'Origin': 'http://www.dcndigital.ae'}) - - video = self._download_json(request, video_id) - title = video.get('title_en') or video['title_ar'] + video_data = self._download_json(request, video_id) + info = self._extract_video_info(video_data, video_id, False) webpage = self._download_webpage( 'http://admin.mangomolo.com/analytics/index.php/customers/embed/video?' + compat_urllib_parse.urlencode({ - 'id': video['id'], - 'user_id': video['user_id'], - 'signature': video['signature'], + 'id': video_data['id'], + 'user_id': video_data['user_id'], + 'signature': video_data['signature'], 'countries': 'Q0M=', 'filter': 'DENY', }), video_id) + info['formats'] = self._extract_video_formats(webpage, video_id, 'm3u8_native') + return info - m3u8_url = self._html_search_regex(r'file:\s*"([^"]+)', webpage, 'm3u8 url') - formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') - rtsp_url = self._search_regex( - r']+href="(rtsp://[^"]+)"', webpage, 'rtsp url', fatal=False) - if rtsp_url: - formats.append({ - 'url': rtsp_url, - 'format_id': 'rtsp', +class DCNLiveIE(DCNBaseIE): + IE_NAME = 'dcn:live' + _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?live/(?P\d+)' + + def _real_extract(self, url): + channel_id = self._match_id(url) + + request = sanitized_Request( + 'http://admin.mangomolo.com/analytics/index.php/plus/getchanneldetails?channel_id=%s' % channel_id, + headers={'Origin': 'http://www.dcndigital.ae'}) + + channel_data = self._download_json(request, channel_id) + info = self._extract_video_info(channel_data, channel_id, True) + + webpage = self._download_webpage( + 'http://admin.mangomolo.com/analytics/index.php/customers/embed/index?' + + compat_urllib_parse.urlencode({ + 'id': base64.b64encode(channel_data['user_id'].encode()).decode(), + 'channelid': base64.b64encode(channel_data['id'].encode()).decode(), + 'signature': channel_data['signature'], + 'countries': 'Q0M=', + 'filter': 'DENY', + }), channel_id) + info['formats'] = self._extract_video_formats(webpage, channel_id, 'm3u8') + return info + + +class DCNSeasonIE(InfoExtractor): + IE_NAME = 'dcn:season' + _VALID_URL = r'https?://(?:www\.)?dcndigital\.ae/(?:#/)?program/(?:(?P\d+)|season/(?P\d+))' + _TEST = { + 'url': 'http://dcndigital.ae/#/program/205024/%D9%85%D8%AD%D8%A7%D8%B6%D8%B1%D8%A7%D8%AA-%D8%A7%D9%84%D8%B4%D9%8A%D8%AE-%D8%A7%D9%84%D8%B4%D8%B9%D8%B1%D8%A7%D9%88%D9%8A', + 'info_dict': + { + 'id': '7910', + 'title': 'محاضرات الشيخ الشعراوي', + }, + 'playlist_mincount': 27, + } + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + show_id, season_id = re.match(self._VALID_URL, url).groups() + + data = {} + if season_id: + data['season'] = season_id + show_id = smuggled_data.get('show_id') + if show_id is None: + request = sanitized_Request( + 'http://admin.mangomolo.com/analytics/index.php/plus/season_info?id=%s' % season_id, + headers={'Origin': 'http://www.dcndigital.ae'}) + season = self._download_json(request, season_id) + show_id = season['id'] + data['show_id'] = show_id + request = sanitized_Request( + 'http://admin.mangomolo.com/analytics/index.php/plus/show', + compat_urllib_parse.urlencode(data), + { + 'Origin': 'http://www.dcndigital.ae', + 'Content-Type': 'application/x-www-form-urlencoded' }) - self._sort_formats(formats) + show = self._download_json(request, show_id) + if not season_id: + season_id = show['default_season'] + for season in show['seasons']: + if season['id'] == season_id: + title = season.get('title_en') or season['title_ar'] - img = video.get('img') - thumbnail = 'http://admin.mangomolo.com/analytics/%s' % img if img else None - duration = int_or_none(video.get('duration')) - description = video.get('description_en') or video.get('description_ar') - timestamp = parse_iso8601(video.get('create_time') or video.get('update_time'), ' ') + entries = [] + for video in show['videos']: + entries.append(self.url_result( + 'http://www.dcndigital.ae/media/%s' % video['id'], 'DCNVideo')) - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - } + return self.playlist_result(entries, season_id, title) diff --git a/youtube_dl/extractor/esri.py b/youtube_dl/extractor/esri.py index bf5d2019f..d4205d7fb 100644 --- a/youtube_dl/extractor/esri.py +++ b/youtube_dl/extractor/esri.py @@ -61,7 +61,7 @@ class EsriVideoIE(InfoExtractor): webpage, 'duration', fatal=False)) upload_date = unified_strdate(self._html_search_meta( - 'last-modified', webpage, 'upload date', fatal=None)) + 'last-modified', webpage, 'upload date', fatal=False)) return { 'id': video_id, diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 39c481068..5e43f2359 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -74,7 +74,7 @@ class FacebookIE(InfoExtractor): return login_page_req = sanitized_Request(self._LOGIN_URL) - login_page_req.add_header('Cookie', 'locale=en_US') + self._set_cookie('facebook.com', 'locale', 'en_US') login_page = self._download_webpage(login_page_req, None, note='Downloading login page', errnote='Unable to download login page') @@ -100,13 +100,25 @@ class FacebookIE(InfoExtractor): login_results = self._download_webpage(request, None, note='Logging in', errnote='unable to fetch login page') if re.search(r'', login_results) is not None: + error = self._html_search_regex( + r'(?s)]+class=(["\']).*?login_error_box.*?\1[^>]*>]*>.*?]*>(?P.+?)', + login_results, 'login error', default=None, group='error') + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) self._downloader.report_warning('unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.') return + fb_dtsg = self._search_regex( + r'name="fb_dtsg" value="(.+?)"', login_results, 'fb_dtsg', default=None) + h = self._search_regex( + r'name="h"\s+(?:\w+="[^"]+"\s+)*?value="([^"]+)"', login_results, 'h', default=None) + + if not fb_dtsg or not h: + return + check_form = { - 'fb_dtsg': self._search_regex(r'name="fb_dtsg" value="(.+?)"', login_results, 'fb_dtsg'), - 'h': self._search_regex( - r'name="h"\s+(?:\w+="[^"]+"\s+)*?value="([^"]+)"', login_results, 'h'), + 'fb_dtsg': fb_dtsg, + 'h': h, 'name_action_selected': 'dont_save', } check_req = sanitized_Request(self._CHECKPOINT_URL, urlencode_postdata(check_form)) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index c3731a110..66a70a181 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -16,7 +16,7 @@ class IqiyiIE(InfoExtractor): IE_NAME = 'iqiyi' IE_DESC = '爱奇艺' - _VALID_URL = r'http://(?:www\.)iqiyi.com/v_.+?\.html' + _VALID_URL = r'http://(?:[^.]+\.)?iqiyi\.com/.+\.html' _TESTS = [{ 'url': 'http://www.iqiyi.com/v_19rrojlavg.html', @@ -84,6 +84,15 @@ class IqiyiIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'http://www.iqiyi.com/w_19rt6o8t9p.html', + 'only_matching': True, + }, { + 'url': 'http://www.iqiyi.com/a_19rrhbc6kt.html', + 'only_matching': True, + }, { + 'url': 'http://yule.iqiyi.com/pcb.html', + 'only_matching': True, }] _FORMATS_MAP = [ diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index cdc095a79..a92adf2b3 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -44,7 +44,8 @@ class JWPlatformIE(InfoExtractor): source_url = self._proto_relative_url(source['file']) source_type = source.get('type') or '' if source_type == 'application/vnd.apple.mpegurl': - m3u8_formats = self._extract_m3u8_formats(source_url, video_id, 'mp4', 'm3u8_native', fatal=None) + m3u8_formats = self._extract_m3u8_formats( + source_url, video_id, 'mp4', 'm3u8_native', fatal=False) if m3u8_formats: formats.extend(m3u8_formats) elif source_type.startswith('audio'): diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index 9c8d826c4..688eb2308 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -254,7 +254,7 @@ class LivestreamOriginalIE(InfoExtractor): 'playlist_mincount': 4, }, { # live stream - 'url': 'http://www.livestream.com/znsbahamas', + 'url': 'http://original.livestream.com/znsbahamas', 'only_matching': True, }] diff --git a/youtube_dl/extractor/phoenix.py b/youtube_dl/extractor/phoenix.py index 46cebc0d7..6ce2ec19d 100644 --- a/youtube_dl/extractor/phoenix.py +++ b/youtube_dl/extractor/phoenix.py @@ -1,10 +1,9 @@ from __future__ import unicode_literals -from .common import InfoExtractor -from .zdf import extract_from_xml_url +from .zdf import ZDFIE -class PhoenixIE(InfoExtractor): +class PhoenixIE(ZDFIE): _VALID_URL = r'''(?x)https?://(?:www\.)?phoenix\.de/content/ (?: phoenix/die_sendungen/(?:[^/]+/)? @@ -41,5 +40,5 @@ class PhoenixIE(InfoExtractor): r'
[0-9]+) - |tun\.in/(?P[A-Za-z0-9]+) - ) - ''' - _API_URL_TEMPLATE = 'http://tunein.com/tuner/tune/?stationId={0:}&tuneType=Station' - - _INFO_DICT = { - 'id': '34682', - 'title': 'Jazz 24 on 88.5 Jazz24 - KPLU-HD2', - 'ext': 'aac', - 'thumbnail': 're:^https?://.*\.png$', - 'location': 'Tacoma, WA', - } - _TESTS = [ - { - 'url': 'http://tunein.com/radio/Jazz24-885-s34682/', - 'info_dict': _INFO_DICT, - 'params': { - 'skip_download': True, # live stream - }, - }, - { # test redirection - 'url': 'http://tun.in/ser7s', - 'info_dict': _INFO_DICT, - 'params': { - 'skip_download': True, # live stream - }, - }, - ] +class TuneInBaseIE(InfoExtractor): + _API_BASE_URL = 'http://tunein.com/tuner/tune/' def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - redirect_id = mobj.group('redirect_id') - if redirect_id: - # The server doesn't support HEAD requests - urlh = self._request_webpage( - url, redirect_id, note='Downloading redirect page') - url = urlh.geturl() - self.to_screen('Following redirect: %s' % url) - mobj = re.match(self._VALID_URL, url) - station_id = mobj.group('id') - - station_info = self._download_json( - self._API_URL_TEMPLATE.format(station_id), - station_id, note='Downloading station JSON') - - title = station_info['Title'] - thumbnail = station_info.get('Logo') - location = station_info.get('Location') - streams_url = station_info.get('StreamUrl') + content_id = self._match_id(url) + + content_info = self._download_json( + self._API_BASE_URL + self._API_URL_QUERY % content_id, + content_id, note='Downloading JSON metadata') + + title = content_info['Title'] + thumbnail = content_info.get('Logo') + location = content_info.get('Location') + streams_url = content_info.get('StreamUrl') if not streams_url: - raise ExtractorError('No downloadable streams found', - expected=True) + raise ExtractorError('No downloadable streams found', expected=True) + if not streams_url.startswith('http://'): + streams_url = compat_urlparse.urljoin(url, streams_url) + stream_data = self._download_webpage( - streams_url, station_id, note='Downloading stream data') + streams_url, content_id, note='Downloading stream data') streams = json.loads(self._search_regex( r'\((.*)\);', stream_data, 'stream info'))['Streams'] @@ -97,10 +56,122 @@ class TuneInIE(InfoExtractor): self._sort_formats(formats) return { - 'id': station_id, + 'id': content_id, 'title': title, 'formats': formats, 'thumbnail': thumbnail, 'location': location, 'is_live': is_live, } + + +class TuneInClipIE(TuneInBaseIE): + IE_NAME = 'tunein:clip' + _VALID_URL = r'https?://(?:www\.)?tunein\.com/station/.*?audioClipId\=(?P\d+)' + _API_URL_QUERY = '?tuneType=AudioClip&audioclipId=%s' + + _TESTS = [ + { + 'url': 'http://tunein.com/station/?stationId=246119&audioClipId=816', + 'md5': '99f00d772db70efc804385c6b47f4e77', + 'info_dict': { + 'id': '816', + 'title': '32m', + 'ext': 'mp3', + }, + }, + ] + + +class TuneInStationIE(TuneInBaseIE): + IE_NAME = 'tunein:station' + _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:radio/.*?-s|station/.*?StationId\=)(?P\d+)' + _API_URL_QUERY = '?tuneType=Station&stationId=%s' + + @classmethod + def suitable(cls, url): + return False if TuneInClipIE.suitable(url) else super(TuneInStationIE, cls).suitable(url) + + _TESTS = [ + { + 'url': 'http://tunein.com/radio/Jazz24-885-s34682/', + 'info_dict': { + 'id': '34682', + 'title': 'Jazz 24 on 88.5 Jazz24 - KPLU-HD2', + 'ext': 'mp3', + 'location': 'Tacoma, WA', + }, + 'params': { + 'skip_download': True, # live stream + }, + }, + ] + + +class TuneInProgramIE(TuneInBaseIE): + IE_NAME = 'tunein:program' + _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:radio/.*?-p|program/.*?ProgramId\=)(?P\d+)' + _API_URL_QUERY = '?tuneType=Program&programId=%s' + + _TESTS = [ + { + 'url': 'http://tunein.com/radio/Jazz-24-p2506/', + 'info_dict': { + 'id': '2506', + 'title': 'Jazz 24 on 91.3 WUKY-HD3', + 'ext': 'mp3', + 'location': 'Lexington, KY', + }, + 'params': { + 'skip_download': True, # live stream + }, + }, + ] + + +class TuneInTopicIE(TuneInBaseIE): + IE_NAME = 'tunein:topic' + _VALID_URL = r'https?://(?:www\.)?tunein\.com/topic/.*?TopicId\=(?P\d+)' + _API_URL_QUERY = '?tuneType=Topic&topicId=%s' + + _TESTS = [ + { + 'url': 'http://tunein.com/topic/?TopicId=101830576', + 'md5': 'c31a39e6f988d188252eae7af0ef09c9', + 'info_dict': { + 'id': '101830576', + 'title': 'Votez pour moi du 29 octobre 2015 (29/10/15)', + 'ext': 'mp3', + 'location': 'Belgium', + }, + }, + ] + + +class TuneInShortenerIE(InfoExtractor): + IE_NAME = 'tunein:shortener' + IE_DESC = False # Do not list + _VALID_URL = r'https?://tun\.in/(?P[A-Za-z0-9]+)' + + _TEST = { + # test redirection + 'url': 'http://tun.in/ser7s', + 'info_dict': { + 'id': '34682', + 'title': 'Jazz 24 on 88.5 Jazz24 - KPLU-HD2', + 'ext': 'mp3', + 'location': 'Tacoma, WA', + }, + 'params': { + 'skip_download': True, # live stream + }, + } + + def _real_extract(self, url): + redirect_id = self._match_id(url) + # The server doesn't support HEAD requests + urlh = self._request_webpage( + url, redirect_id, note='Downloading redirect page') + url = urlh.geturl() + self.to_screen('Following redirect: %s' % url) + return self.url_result(url) diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index 811ee197d..129668a99 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -86,9 +86,10 @@ class VGTVIE(XstreamIE): { # streamType: wasLive 'url': 'http://www.vgtv.no/#!/live/113063/direkte-v75-fra-solvalla', + 'md5': '458f4841239dab414343b50e5af8869c', 'info_dict': { 'id': '113063', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'V75 fra Solvalla 30.05.15', 'description': 'md5:b3743425765355855f88e096acc93231', 'thumbnail': 're:^https?://.*\.jpg', @@ -97,10 +98,6 @@ class VGTVIE(XstreamIE): 'upload_date': '20150530', 'view_count': int, }, - 'params': { - # m3u8 download - 'skip_download': True, - }, }, { 'url': 'http://www.aftenposten.no/webtv/#!/video/21039/trailer-sweatshop-i-can-t-take-any-more', @@ -160,12 +157,15 @@ class VGTVIE(XstreamIE): formats.extend(m3u8_formats) hds_url = streams.get('hds') - # wasLive hds are always 404 - if hds_url and stream_type != 'wasLive': + if hds_url: + hdcore_sign = 'hdcore=3.7.0' f4m_formats = self._extract_f4m_formats( - hds_url + '?hdcore=3.2.0&plugin=aasp-3.2.0.77.18', video_id, f4m_id='hds', fatal=False) + hds_url + '?%s' % hdcore_sign, video_id, f4m_id='hds', fatal=False) if f4m_formats: - formats.extend(f4m_formats) + for entry in f4m_formats: + # URLs without the extra param induce an 404 error + entry.update({'extra_param_to_segment_url': hdcore_sign}) + formats.append(entry) mp4_urls = streams.get('pseudostreaming') or [] mp4_url = streams.get('mp4') diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index ca3f20a3d..9a1c377a4 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -279,7 +279,7 @@ class VikiIE(VikiBaseIE): if format_id == 'm3u8': m3u8_formats = self._extract_m3u8_formats( format_dict['url'], video_id, 'mp4', 'm3u8_native', - m3u8_id='m3u8-%s' % protocol, fatal=None) + m3u8_id='m3u8-%s' % protocol, fatal=False) if m3u8_formats: formats.extend(m3u8_formats) else: diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 9a3331a69..92c12bac6 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -10,106 +10,16 @@ from ..utils import ( unified_strdate, OnDemandPagedList, xpath_text, + determine_ext, + qualities, + float_or_none, ) -def extract_from_xml_url(ie, video_id, xml_url): - doc = ie._download_xml( - xml_url, video_id, - note='Downloading video info', - errnote='Failed to download video info') - - title = doc.find('.//information/title').text - description = xpath_text(doc, './/information/detail', 'description') - duration = int_or_none(xpath_text(doc, './/details/lengthSec', 'duration')) - uploader = xpath_text(doc, './/details/originChannelTitle', 'uploader') - uploader_id = xpath_text(doc, './/details/originChannelId', 'uploader id') - upload_date = unified_strdate(xpath_text(doc, './/details/airtime', 'upload date')) - - def xml_to_format(fnode): - video_url = fnode.find('url').text - is_available = 'http://www.metafilegenerator' not in video_url - - format_id = fnode.attrib['basetype'] - format_m = re.match(r'''(?x) - (?P[^_]+)_(?P[^_]+)_(?P[^_]+)_ - (?P[^_]+)_(?P[^_]+)_(?P[^_]+) - ''', format_id) - - ext = format_m.group('container') - proto = format_m.group('proto').lower() - - quality = xpath_text(fnode, './quality', 'quality') - abr = int_or_none(xpath_text(fnode, './audioBitrate', 'abr'), 1000) - vbr = int_or_none(xpath_text(fnode, './videoBitrate', 'vbr'), 1000) - - width = int_or_none(xpath_text(fnode, './width', 'width')) - height = int_or_none(xpath_text(fnode, './height', 'height')) - - filesize = int_or_none(xpath_text(fnode, './filesize', 'filesize')) - - format_note = '' - if not format_note: - format_note = None - - return { - 'format_id': format_id + '-' + quality, - 'url': video_url, - 'ext': ext, - 'acodec': format_m.group('acodec'), - 'vcodec': format_m.group('vcodec'), - 'abr': abr, - 'vbr': vbr, - 'width': width, - 'height': height, - 'filesize': filesize, - 'format_note': format_note, - 'protocol': proto, - '_available': is_available, - } - - def xml_to_thumbnails(fnode): - thumbnails = [] - for node in fnode: - thumbnail_url = node.text - if not thumbnail_url: - continue - thumbnail = { - 'url': thumbnail_url, - } - if 'key' in node.attrib: - m = re.match('^([0-9]+)x([0-9]+)$', node.attrib['key']) - if m: - thumbnail['width'] = int(m.group(1)) - thumbnail['height'] = int(m.group(2)) - thumbnails.append(thumbnail) - return thumbnails - - thumbnails = xml_to_thumbnails(doc.findall('.//teaserimages/teaserimage')) - - format_nodes = doc.findall('.//formitaeten/formitaet') - formats = list(filter( - lambda f: f['_available'], - map(xml_to_format, format_nodes))) - ie._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'thumbnails': thumbnails, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'upload_date': upload_date, - 'formats': formats, - } - - class ZDFIE(InfoExtractor): _VALID_URL = r'(?:zdf:|zdf:video:|https?://www\.zdf\.de/ZDFmediathek(?:#)?/(.*beitrag/(?:video/)?))(?P[0-9]+)(?:/[^/?]+)?(?:\?.*)?' - _TEST = { + _TESTS = [{ 'url': 'http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt', 'info_dict': { 'id': '2037704', @@ -122,12 +32,163 @@ class ZDFIE(InfoExtractor): 'upload_date': '20131127', }, 'skip': 'Videos on ZDF.de are depublicised in short order', - } + }] + + def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): + param_groups = {} + for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)): + group_id = param_group.attrib.get(self._xpath_ns('id', 'http://www.w3.org/XML/1998/namespace')) + params = {} + for param in param_group: + params[param.get('name')] = param.get('value') + param_groups[group_id] = params + + formats = [] + for video in smil.findall(self._xpath_ns('.//video', namespace)): + src = video.get('src') + if not src: + continue + bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) + group_id = video.get('paramGroup') + param_group = param_groups[group_id] + for proto in param_group['protocols'].split(','): + formats.append({ + 'url': '%s://%s' % (proto, param_group['host']), + 'app': param_group['app'], + 'play_path': src, + 'ext': 'flv', + 'format_id': '%s-%d' % (proto, bitrate), + 'tbr': bitrate, + 'protocol': proto, + }) + self._sort_formats(formats) + return formats + + def extract_from_xml_url(self, video_id, xml_url): + doc = self._download_xml( + xml_url, video_id, + note='Downloading video info', + errnote='Failed to download video info') + + title = doc.find('.//information/title').text + description = xpath_text(doc, './/information/detail', 'description') + duration = int_or_none(xpath_text(doc, './/details/lengthSec', 'duration')) + uploader = xpath_text(doc, './/details/originChannelTitle', 'uploader') + uploader_id = xpath_text(doc, './/details/originChannelId', 'uploader id') + upload_date = unified_strdate(xpath_text(doc, './/details/airtime', 'upload date')) + + def xml_to_thumbnails(fnode): + thumbnails = [] + for node in fnode: + thumbnail_url = node.text + if not thumbnail_url: + continue + thumbnail = { + 'url': thumbnail_url, + } + if 'key' in node.attrib: + m = re.match('^([0-9]+)x([0-9]+)$', node.attrib['key']) + if m: + thumbnail['width'] = int(m.group(1)) + thumbnail['height'] = int(m.group(2)) + thumbnails.append(thumbnail) + return thumbnails + + thumbnails = xml_to_thumbnails(doc.findall('.//teaserimages/teaserimage')) + + format_nodes = doc.findall('.//formitaeten/formitaet') + quality = qualities(['veryhigh', 'high', 'med', 'low']) + + def get_quality(elem): + return quality(xpath_text(elem, 'quality')) + format_nodes.sort(key=get_quality) + format_ids = [] + formats = [] + for fnode in format_nodes: + video_url = fnode.find('url').text + is_available = 'http://www.metafilegenerator' not in video_url + if not is_available: + continue + format_id = fnode.attrib['basetype'] + quality = xpath_text(fnode, './quality', 'quality') + format_m = re.match(r'''(?x) + (?P[^_]+)_(?P[^_]+)_(?P[^_]+)_ + (?P[^_]+)_(?P[^_]+)_(?P[^_]+) + ''', format_id) + + ext = determine_ext(video_url, None) or format_m.group('container') + if ext not in ('smil', 'f4m', 'm3u8'): + format_id = format_id + '-' + quality + if format_id in format_ids: + continue + + if ext == 'meta': + continue + elif ext == 'smil': + smil_formats = self._extract_smil_formats( + video_url, video_id, fatal=False) + if smil_formats: + formats.extend(smil_formats) + elif ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + video_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + elif ext == 'f4m': + f4m_formats = self._extract_f4m_formats( + video_url, video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + else: + proto = format_m.group('proto').lower() + + abr = int_or_none(xpath_text(fnode, './audioBitrate', 'abr'), 1000) + vbr = int_or_none(xpath_text(fnode, './videoBitrate', 'vbr'), 1000) + + width = int_or_none(xpath_text(fnode, './width', 'width')) + height = int_or_none(xpath_text(fnode, './height', 'height')) + + filesize = int_or_none(xpath_text(fnode, './filesize', 'filesize')) + + format_note = '' + if not format_note: + format_note = None + + formats.append({ + 'format_id': format_id, + 'url': video_url, + 'ext': ext, + 'acodec': format_m.group('acodec'), + 'vcodec': format_m.group('vcodec'), + 'abr': abr, + 'vbr': vbr, + 'width': width, + 'height': height, + 'filesize': filesize, + 'format_note': format_note, + 'protocol': proto, + '_available': is_available, + }) + format_ids.append(format_id) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'thumbnails': thumbnails, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'upload_date': upload_date, + 'formats': formats, + } def _real_extract(self, url): video_id = self._match_id(url) xml_url = 'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id - return extract_from_xml_url(self, video_id, xml_url) + return self.extract_from_xml_url(video_id, xml_url) class ZDFChannelIE(InfoExtractor): diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 1737ac5f6..0ed6c45c8 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -773,11 +773,13 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): raise original_ioerror resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg + del resp.headers['Content-encoding'] # deflate if resp.headers.get('Content-encoding', '') == 'deflate': gz = io.BytesIO(self.deflate(resp.read())) resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg + del resp.headers['Content-encoding'] # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see # https://github.com/rg3/youtube-dl/issues/6457). if 300 <= resp.code < 400: