From: Yen Chi Hsuan <yan12125@gmail.com>
Date: Sun, 21 Aug 2016 18:11:43 +0000 (+0800)
Subject: Merge branch 'TRox1972-charlierose'
X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=commitdiff_plain;h=f97f9f71e53cfc4d355be6e989632f2bd67b350d;hp=526656726b13f47a33c36e56821136b90d6decf1;p=youtube-dl

Merge branch 'TRox1972-charlierose'
---

diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index d4d90c1f8..b0644be11 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -1115,7 +1115,4 @@ from .youtube import (
 )
 from .zapiks import ZapiksIE
 from .zdf import ZDFIE, ZDFChannelIE
-from .zingmp3 import (
-    ZingMp3SongIE,
-    ZingMp3AlbumIE,
-)
+from .zingmp3 import ZingMp3IE
diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py
index 88bca1007..af7de10b7 100644
--- a/youtube_dl/extractor/firsttv.py
+++ b/youtube_dl/extractor/firsttv.py
@@ -2,130 +2,75 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..compat import compat_xpath
+from ..compat import compat_urlparse
 from ..utils import (
     int_or_none,
     qualities,
     unified_strdate,
-    xpath_attr,
-    xpath_element,
-    xpath_text,
-    xpath_with_ns,
 )
 
 
 class FirstTVIE(InfoExtractor):
     IE_NAME = '1tv'
     IE_DESC = 'Первый канал'
-    _VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+p?(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?1tv\.ru/(?:[^/]+/)+(?P<id>[^/?#]+)'
 
     _TESTS = [{
-        # single format via video_materials.json API
-        'url': 'http://www.1tv.ru/prj/inprivate/vypusk/35930',
-        'md5': '82a2777648acae812d58b3f5bd42882b',
+        'url': 'http://www.1tv.ru/shows/naedine-so-vsemi/vypuski/gost-lyudmila-senchina-naedine-so-vsemi-vypusk-ot-12-02-2015',
+        'md5': 'a1b6b60d530ebcf8daacf4565762bbaf',
         'info_dict': {
-            'id': '35930',
+            'id': '40049',
             'ext': 'mp4',
             'title': 'Гость Людмила Сенчина. Наедине со всеми. Выпуск от 12.02.2015',
-            'description': 'md5:357933adeede13b202c7c21f91b871b2',
+            'description': 'md5:36a39c1d19618fec57d12efe212a8370',
             'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$',
             'upload_date': '20150212',
             'duration': 2694,
         },
     }, {
-        # multiple formats via video_materials.json API
-        'url': 'http://www.1tv.ru/video_archive/projects/dobroeutro/p113641',
-        'info_dict': {
-            'id': '113641',
-            'ext': 'mp4',
-            'title': 'Весенняя аллергия. Доброе утро. Фрагмент выпуска от 07.04.2016',
-            'description': 'md5:8dcebb3dded0ff20fade39087fd1fee2',
-            'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$',
-            'upload_date': '20160407',
-            'duration': 179,
-            'formats': 'mincount:3',
-        },
-        'params': {
-            'skip_download': True,
-        },
-    }, {
-        # single format only available via ONE_ONLINE_VIDEOS.archive_single_xml API
-        'url': 'http://www.1tv.ru/video_archive/series/f7552/p47038',
-        'md5': '519d306c5b5669761fd8906c39dbee23',
-        'info_dict': {
-            'id': '47038',
-            'ext': 'mp4',
-            'title': '"Побег". Второй сезон. 3 серия',
-            'description': 'md5:3abf8f6b9bce88201c33e9a3d794a00b',
-            'thumbnail': 're:^https?://.*\.(?:jpg|JPG)$',
-            'upload_date': '20120516',
-            'duration': 3080,
-        },
-    }, {
-        'url': 'http://www.1tv.ru/videoarchive/9967',
-        'only_matching': True,
+        'url': 'http://www.1tv.ru/shows/dobroe-utro/pro-zdorove/vesennyaya-allergiya-dobroe-utro-fragment-vypuska-ot-07042016',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        # Videos with multiple formats only available via this API
-        video = self._download_json(
-            'http://www.1tv.ru/video_materials.json?legacy_id=%s' % video_id,
-            video_id, fatal=False)
-
-        description, thumbnail, upload_date, duration = [None] * 4
-
-        if video:
-            item = video[0]
-            title = item['title']
-            quality = qualities(('ld', 'sd', 'hd', ))
-            formats = [{
-                'url': f['src'],
-                'format_id': f.get('name'),
-                'quality': quality(f.get('name')),
-            } for f in item['mbr'] if f.get('src')]
-            thumbnail = item.get('poster')
-        else:
-            # Some videos are not available via video_materials.json
-            video = self._download_xml(
-                'http://www.1tv.ru/owa/win/ONE_ONLINE_VIDEOS.archive_single_xml?pid=%s' % video_id,
-                video_id)
-
-            NS_MAP = {
-                'media': 'http://search.yahoo.com/mrss/',
-            }
+        display_id = self._match_id(url)
 
-            item = xpath_element(video, './channel/item', fatal=True)
-            title = xpath_text(item, './title', fatal=True)
-            formats = [{
-                'url': content.attrib['url'],
-            } for content in item.findall(
-                compat_xpath(xpath_with_ns('./media:content', NS_MAP))) if content.attrib.get('url')]
-            thumbnail = xpath_attr(
-                item, xpath_with_ns('./media:thumbnail', NS_MAP), 'url')
+        webpage = self._download_webpage(url, display_id)
+        playlist_url = compat_urlparse.urljoin(url, self._search_regex(
+            r'data-playlist-url="([^"]+)', webpage, 'playlist url'))
 
+        item = self._download_json(playlist_url, display_id)[0]
+        video_id = item['id']
+        quality = qualities(('ld', 'sd', 'hd', ))
+        formats = []
+        for f in item.get('mbr', []):
+            src = f.get('src')
+            if not src:
+                continue
+            fname = f.get('name')
+            formats.append({
+                'url': src,
+                'format_id': fname,
+                'quality': quality(fname),
+            })
         self._sort_formats(formats)
 
-        webpage = self._download_webpage(url, video_id, 'Downloading page', fatal=False)
-        if webpage:
-            title = self._html_search_regex(
-                (r'<div class="tv_translation">\s*<h1><a href="[^"]+">([^<]*)</a>',
-                 r"'title'\s*:\s*'([^']+)'"),
-                webpage, 'title', default=None) or title
-            description = self._html_search_regex(
-                r'<div class="descr">\s*<div>&nbsp;</div>\s*<p>([^<]*)</p></div>',
-                webpage, 'description', default=None) or self._html_search_meta(
-                'description', webpage, 'description')
-            thumbnail = thumbnail or self._og_search_thumbnail(webpage)
-            duration = int_or_none(self._html_search_meta(
-                'video:duration', webpage, 'video duration', fatal=False))
-            upload_date = unified_strdate(self._html_search_meta(
-                'ya:ovs:upload_date', webpage, 'upload date', fatal=False))
+        title = self._html_search_regex(
+            (r'<div class="tv_translation">\s*<h1><a href="[^"]+">([^<]*)</a>',
+             r"'title'\s*:\s*'([^']+)'"),
+            webpage, 'title', default=None) or item['title']
+        description = self._html_search_regex(
+            r'<div class="descr">\s*<div>&nbsp;</div>\s*<p>([^<]*)</p></div>',
+            webpage, 'description', default=None) or self._html_search_meta(
+            'description', webpage, 'description')
 
+        duration = int_or_none(self._html_search_meta(
+            'video:duration', webpage, 'video duration', fatal=False))
+        upload_date = unified_strdate(self._html_search_meta(
+            'ya:ovs:upload_date', webpage, 'upload date', fatal=False))
 
         return {
             'id': video_id,
-            'thumbnail': thumbnail,
+            'thumbnail': item.get('poster') or self._og_search_thumbnail(webpage),
             'title': title,
             'description': description,
             'upload_date': upload_date,
diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py
index e0f7366c2..6a8464998 100644
--- a/youtube_dl/extractor/kaltura.py
+++ b/youtube_dl/extractor/kaltura.py
@@ -36,6 +36,12 @@ class KalturaIE(InfoExtractor):
     '''
     _SERVICE_URL = 'http://cdnapi.kaltura.com'
     _SERVICE_BASE = '/api_v3/index.php'
+    # See https://github.com/kaltura/server/blob/master/plugins/content/caption/base/lib/model/enums/CaptionType.php
+    _CAPTION_TYPES = {
+        1: 'srt',
+        2: 'ttml',
+        3: 'vtt',
+    }
     _TESTS = [
         {
             'url': 'kaltura:269692:1_1jc2y3e4',
@@ -285,9 +291,12 @@ class KalturaIE(InfoExtractor):
             # Continue if caption is not ready
             if f.get('status') != 2:
                 continue
+            if not caption.get('id'):
+                continue
+            caption_format = int_or_none(caption.get('format'))
             subtitles.setdefault(caption.get('languageCode') or caption.get('language'), []).append({
                 'url': '%s/api_v3/service/caption_captionasset/action/serve/captionAssetId/%s' % (self._SERVICE_URL, caption['id']),
-                'ext': caption.get('fileExt', 'ttml'),
+                'ext': caption.get('fileExt') or self._CAPTION_TYPES.get(caption_format) or 'ttml',
             })
 
         return {
diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py
index 890f55180..359a8859c 100644
--- a/youtube_dl/extractor/twitch.py
+++ b/youtube_dl/extractor/twitch.py
@@ -7,6 +7,7 @@ import random
 
 from .common import InfoExtractor
 from ..compat import (
+    compat_HTTPError,
     compat_parse_qs,
     compat_str,
     compat_urllib_parse_urlencode,
@@ -14,13 +15,13 @@ from ..compat import (
     compat_urlparse,
 )
 from ..utils import (
+    clean_html,
     ExtractorError,
     int_or_none,
     js_to_json,
     orderedSet,
     parse_duration,
     parse_iso8601,
-    sanitized_Request,
     urlencode_postdata,
 )
 
@@ -42,7 +43,7 @@ class TwitchBaseIE(InfoExtractor):
                 '%s returned error: %s - %s' % (self.IE_NAME, error, response.get('message')),
                 expected=True)
 
-    def _download_json(self, url, video_id, note='Downloading JSON metadata'):
+    def _call_api(self, path, item_id, note):
         headers = {
             'Referer': 'http://api.twitch.tv/crossdomain/receiver.html?v=2',
             'X-Requested-With': 'XMLHttpRequest',
@@ -50,8 +51,8 @@ class TwitchBaseIE(InfoExtractor):
         for cookie in self._downloader.cookiejar:
             if cookie.name == 'api_token':
                 headers['Twitch-Api-Token'] = cookie.value
-        request = sanitized_Request(url, headers=headers)
-        response = super(TwitchBaseIE, self)._download_json(request, video_id, note)
+        response = self._download_json(
+            '%s/%s' % (self._API_BASE, path), item_id, note)
         self._handle_error(response)
         return response
 
@@ -63,9 +64,17 @@ class TwitchBaseIE(InfoExtractor):
         if username is None:
             return
 
+        def fail(message):
+            raise ExtractorError(
+                'Unable to login. Twitch said: %s' % message, expected=True)
+
         login_page, handle = self._download_webpage_handle(
             self._LOGIN_URL, None, 'Downloading login page')
 
+        # Some TOR nodes and public proxies are blocked completely
+        if 'blacklist_message' in login_page:
+            fail(clean_html(login_page))
+
         login_form = self._hidden_inputs(login_page)
 
         login_form.update({
@@ -82,21 +91,24 @@ class TwitchBaseIE(InfoExtractor):
         if not post_url.startswith('http'):
             post_url = compat_urlparse.urljoin(redirect_url, post_url)
 
-        request = sanitized_Request(
-            post_url, urlencode_postdata(login_form))
-        request.add_header('Referer', redirect_url)
-        response = self._download_webpage(
-            request, None, 'Logging in as %s' % username)
-
-        error_message = self._search_regex(
-            r'<div[^>]+class="subwindow_notice"[^>]*>([^<]+)</div>',
-            response, 'error message', default=None)
-        if error_message:
-            raise ExtractorError(
-                'Unable to login. Twitch said: %s' % error_message, expected=True)
+        headers = {'Referer': redirect_url}
 
-        if '>Reset your password<' in response:
-            self.report_warning('Twitch asks you to reset your password, go to https://secure.twitch.tv/reset/submit')
+        try:
+            response = self._download_json(
+                post_url, None, 'Logging in as %s' % username,
+                data=urlencode_postdata(login_form),
+                headers=headers)
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+                response = self._parse_json(
+                    e.cause.read().decode('utf-8'), None)
+                fail(response['message'])
+            raise
+
+        if response.get('redirect'):
+            self._download_webpage(
+                response['redirect'], None, 'Downloading login redirect page',
+                headers=headers)
 
     def _prefer_source(self, formats):
         try:
@@ -109,14 +121,14 @@ class TwitchBaseIE(InfoExtractor):
 
 class TwitchItemBaseIE(TwitchBaseIE):
     def _download_info(self, item, item_id):
-        return self._extract_info(self._download_json(
-            '%s/kraken/videos/%s%s' % (self._API_BASE, item, item_id), item_id,
+        return self._extract_info(self._call_api(
+            'kraken/videos/%s%s' % (item, item_id), item_id,
             'Downloading %s info JSON' % self._ITEM_TYPE))
 
     def _extract_media(self, item_id):
         info = self._download_info(self._ITEM_SHORTCUT, item_id)
-        response = self._download_json(
-            '%s/api/videos/%s%s' % (self._API_BASE, self._ITEM_SHORTCUT, item_id), item_id,
+        response = self._call_api(
+            'api/videos/%s%s' % (self._ITEM_SHORTCUT, item_id), item_id,
             'Downloading %s playlist JSON' % self._ITEM_TYPE)
         entries = []
         chunks = response['chunks']
@@ -246,8 +258,8 @@ class TwitchVodIE(TwitchItemBaseIE):
         item_id = self._match_id(url)
 
         info = self._download_info(self._ITEM_SHORTCUT, item_id)
-        access_token = self._download_json(
-            '%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id,
+        access_token = self._call_api(
+            'api/vods/%s/access_token' % item_id, item_id,
             'Downloading %s access token' % self._ITEM_TYPE)
 
         formats = self._extract_m3u8_formats(
@@ -275,12 +287,12 @@ class TwitchVodIE(TwitchItemBaseIE):
 
 
 class TwitchPlaylistBaseIE(TwitchBaseIE):
-    _PLAYLIST_URL = '%s/kraken/channels/%%s/videos/?offset=%%d&limit=%%d' % TwitchBaseIE._API_BASE
+    _PLAYLIST_PATH = 'kraken/channels/%s/videos/?offset=%d&limit=%d'
     _PAGE_LIMIT = 100
 
     def _extract_playlist(self, channel_id):
-        info = self._download_json(
-            '%s/kraken/channels/%s' % (self._API_BASE, channel_id),
+        info = self._call_api(
+            'kraken/channels/%s' % channel_id,
             channel_id, 'Downloading channel info JSON')
         channel_name = info.get('display_name') or info.get('name')
         entries = []
@@ -289,8 +301,8 @@ class TwitchPlaylistBaseIE(TwitchBaseIE):
         broken_paging_detected = False
         counter_override = None
         for counter in itertools.count(1):
-            response = self._download_json(
-                self._PLAYLIST_URL % (channel_id, offset, limit),
+            response = self._call_api(
+                self._PLAYLIST_PATH % (channel_id, offset, limit),
                 channel_id,
                 'Downloading %s videos JSON page %s'
                 % (self._PLAYLIST_TYPE, counter_override or counter))
@@ -345,7 +357,7 @@ class TwitchProfileIE(TwitchPlaylistBaseIE):
 class TwitchPastBroadcastsIE(TwitchPlaylistBaseIE):
     IE_NAME = 'twitch:past_broadcasts'
     _VALID_URL = r'%s/(?P<channel_id>[^/]+)/profile/past_broadcasts/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
-    _PLAYLIST_URL = TwitchPlaylistBaseIE._PLAYLIST_URL + '&broadcasts=true'
+    _PLAYLIST_PATH = TwitchPlaylistBaseIE._PLAYLIST_PATH + '&broadcasts=true'
     _PLAYLIST_TYPE = 'past broadcasts'
 
     _TEST = {
@@ -389,8 +401,8 @@ class TwitchStreamIE(TwitchBaseIE):
     def _real_extract(self, url):
         channel_id = self._match_id(url)
 
-        stream = self._download_json(
-            '%s/kraken/streams/%s' % (self._API_BASE, channel_id), channel_id,
+        stream = self._call_api(
+            'kraken/streams/%s' % channel_id, channel_id,
             'Downloading stream JSON').get('stream')
 
         # Fallback on profile extraction if stream is offline
@@ -405,8 +417,8 @@ class TwitchStreamIE(TwitchBaseIE):
         # JSON and fallback to lowercase if it's not available.
         channel_id = stream.get('channel', {}).get('name') or channel_id.lower()
 
-        access_token = self._download_json(
-            '%s/api/channels/%s/access_token' % (self._API_BASE, channel_id), channel_id,
+        access_token = self._call_api(
+            'api/channels/%s/access_token' % channel_id, channel_id,
             'Downloading channel access token')
 
         query = {
diff --git a/youtube_dl/extractor/zingmp3.py b/youtube_dl/extractor/zingmp3.py
index 437eecb67..bd708b42c 100644
--- a/youtube_dl/extractor/zingmp3.py
+++ b/youtube_dl/extractor/zingmp3.py
@@ -4,13 +4,17 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    update_url_query,
+)
 
 
 class ZingMp3BaseInfoExtractor(InfoExtractor):
 
-    def _extract_item(self, item, fatal=True):
-        error_message = item.find('./errormessage').text
+    def _extract_item(self, item, page_type, fatal=True):
+        error_message = item.get('msg')
         if error_message:
             if not fatal:
                 return
@@ -18,25 +22,48 @@ class ZingMp3BaseInfoExtractor(InfoExtractor):
             '%s returned error: %s' % (self.IE_NAME, error_message),
             expected=True)
 
-        title = item.find('./title').text.strip()
-        source = item.find('./source').text
-        extension = item.attrib['type']
-        thumbnail = item.find('./backimage').text
+        formats = []
+        for quality, source_url in zip(item.get('qualities') or item.get('quality', []), item.get('source_list') or item.get('source', [])):
+            if not source_url or source_url == 'require vip':
+                continue
+            if not re.match(r'https?://', source_url):
+                source_url = '//' + source_url
+            source_url = self._proto_relative_url(source_url, 'http:')
+            quality_num = int_or_none(quality)
+            f = {
+                'format_id': quality,
+                'url': source_url,
+            }
+            if page_type == 'video':
+                f.update({
+                    'height': quality_num,
+                    'ext': 'mp4',
+                })
+            else:
+                f.update({
+                    'abr': quality_num,
+                    'ext': 'mp3',
+                })
+            formats.append(f)
+
+        cover = item.get('cover')
 
         return {
-            'title': title,
-            'url': source,
-            'ext': extension,
-            'thumbnail': thumbnail,
+            'title': (item.get('name') or item.get('title')).strip(),
+            'formats': formats,
+            'thumbnail': 'http:/' + cover if cover else None,
+            'artist': item.get('artist'),
         }
 
-    def _extract_player_xml(self, player_xml_url, id, playlist_title=None):
-        player_xml = self._download_xml(player_xml_url, id, 'Downloading Player XML')
-        items = player_xml.findall('./item')
+    def _extract_player_json(self, player_json_url, id, page_type, playlist_title=None):
+        player_json = self._download_json(player_json_url, id, 'Downloading Player JSON')
+        items = player_json['data']
+        if 'item' in items:
+            items = items['item']
 
         if len(items) == 1:
             # one single song
-            data = self._extract_item(items[0])
+            data = self._extract_item(items[0], page_type)
             data['id'] = id
 
             return data
@@ -45,7 +72,7 @@ class ZingMp3BaseInfoExtractor(InfoExtractor):
             entries = []
 
             for i, item in enumerate(items, 1):
-                entry = self._extract_item(item, fatal=False)
+                entry = self._extract_item(item, page_type, fatal=False)
                 if not entry:
                     continue
                 entry['id'] = '%s-%d' % (id, i)
@@ -59,8 +86,8 @@ class ZingMp3BaseInfoExtractor(InfoExtractor):
         }
 
 
-class ZingMp3SongIE(ZingMp3BaseInfoExtractor):
-    _VALID_URL = r'https?://mp3\.zing\.vn/bai-hat/(?P<slug>[^/]+)/(?P<song_id>\w+)\.html'
+class ZingMp3IE(ZingMp3BaseInfoExtractor):
+    _VALID_URL = r'https?://mp3\.zing\.vn/(?:bai-hat|album|playlist|video-clip)/[^/]+/(?P<id>\w+)\.html'
     _TESTS = [{
         'url': 'http://mp3.zing.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html',
         'md5': 'ead7ae13693b3205cbc89536a077daed',
@@ -70,51 +97,47 @@ class ZingMp3SongIE(ZingMp3BaseInfoExtractor):
             'ext': 'mp3',
             'thumbnail': 're:^https?://.*\.jpg$',
         },
-    }]
-    IE_NAME = 'zingmp3:song'
-    IE_DESC = 'mp3.zing.vn songs'
-
-    def _real_extract(self, url):
-        matched = re.match(self._VALID_URL, url)
-        slug = matched.group('slug')
-        song_id = matched.group('song_id')
-
-        webpage = self._download_webpage(
-            'http://mp3.zing.vn/bai-hat/%s/%s.html' % (slug, song_id), song_id)
-
-        player_xml_url = self._search_regex(
-            r'&amp;xmlURL=(?P<xml_url>[^&]+)&', webpage, 'player xml url')
-
-        return self._extract_player_xml(player_xml_url, song_id)
-
-
-class ZingMp3AlbumIE(ZingMp3BaseInfoExtractor):
-    _VALID_URL = r'https?://mp3\.zing\.vn/(?:album|playlist)/(?P<slug>[^/]+)/(?P<album_id>\w+)\.html'
-    _TESTS = [{
+    }, {
+        'url': 'http://mp3.zing.vn/video-clip/Let-It-Go-Frozen-OST-Sungha-Jung/ZW6BAEA0.html',
+        'md5': '870295a9cd8045c0e15663565902618d',
+        'info_dict': {
+            'id': 'ZW6BAEA0',
+            'title': 'Let It Go (Frozen OST)',
+            'ext': 'mp4',
+        },
+    }, {
         'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html',
         'info_dict': {
             '_type': 'playlist',
             'id': 'ZWZBWDAF',
-            'title': 'Lâu Đài Tình Ái - Bằng Kiều ft. Minh Tuyết | Album 320 lossless',
+            'title': 'Lâu Đài Tình Ái - Bằng Kiều,Minh Tuyết | Album 320 lossless',
         },
         'playlist_count': 10,
+        'skip': 'removed at the request of the owner',
     }, {
         'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html',
         'only_matching': True,
     }]
-    IE_NAME = 'zingmp3:album'
-    IE_DESC = 'mp3.zing.vn albums'
+    IE_NAME = 'zingmp3'
+    IE_DESC = 'mp3.zing.vn'
 
     def _real_extract(self, url):
-        matched = re.match(self._VALID_URL, url)
-        slug = matched.group('slug')
-        album_id = matched.group('album_id')
-
-        webpage = self._download_webpage(
-            'http://mp3.zing.vn/album/%s/%s.html' % (slug, album_id), album_id)
-        player_xml_url = self._search_regex(
-            r'&amp;xmlURL=(?P<xml_url>[^&]+)&', webpage, 'player xml url')
-
-        return self._extract_player_xml(
-            player_xml_url, album_id,
-            playlist_title=self._og_search_title(webpage))
+        page_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, page_id)
+
+        player_json_url = self._search_regex([
+            r'data-xml="([^"]+)',
+            r'&amp;xmlURL=([^&]+)&'
+        ], webpage, 'player xml url')
+
+        playlist_title = None
+        page_type = self._search_regex(r'/(?:html5)?xml/([^/-]+)', player_json_url, 'page type')
+        if page_type == 'video':
+            player_json_url = update_url_query(player_json_url, {'format': 'json'})
+        else:
+            player_json_url = player_json_url.replace('/xml/', '/html5xml/')
+            if page_type == 'album':
+                playlist_title = self._og_search_title(webpage)
+
+        return self._extract_player_json(player_json_url, page_id, page_type, playlist_title)
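
A minimal sketch (not part of the patch) of the caption-extension fallback the kaltura.py hunk introduces: an explicit fileExt wins, otherwise the numeric format id is mapped through the CaptionType table, and 'ttml' stays as the last-resort default. The helper names below are illustrative stand-ins rather than code from the commit.

# Caption type ids as in the patch (cf. the linked CaptionType.php).
CAPTION_TYPES = {
    1: 'srt',
    2: 'ttml',
    3: 'vtt',
}


def int_or_none(v):
    # Rough stand-in for youtube_dl.utils.int_or_none.
    try:
        return int(v)
    except (TypeError, ValueError):
        return None


def caption_ext(caption):
    # Explicit extension first, then the numeric type, then the old default.
    return (caption.get('fileExt')
            or CAPTION_TYPES.get(int_or_none(caption.get('format')))
            or 'ttml')


assert caption_ext({'fileExt': 'srt'}) == 'srt'
assert caption_ext({'format': '3'}) == 'vtt'  # type ids often arrive as strings
assert caption_ext({}) == 'ttml'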