X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcondenast.py;h=ed278fefc67089f44760ed8e0635cf1c5cbb1fde;hb=HEAD;hp=f336a3c620a04e8bb643309b4812725e8f50e1d1;hpb=6d69d03bac08f8381031de721167103697bf3fed;p=youtube-dl diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index f336a3c62..ed278fefc 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -1,15 +1,22 @@ # coding: utf-8 +from __future__ import unicode_literals import re -import json from .common import InfoExtractor -from ..utils import ( - compat_urllib_parse, - orderedSet, +from ..compat import ( compat_urllib_parse_urlparse, compat_urlparse, ) +from ..utils import ( + determine_ext, + extract_attributes, + int_or_none, + js_to_json, + mimetype2ext, + orderedSet, + parse_iso8601, +) class CondeNastIE(InfoExtractor): @@ -20,87 +27,206 @@ class CondeNastIE(InfoExtractor): # The keys are the supported sites and the values are the name to be shown # to the user and in the extractor description. - _SITES = {'wired': u'WIRED', - 'gq': u'GQ', - 'vogue': u'Vogue', - 'glamour': u'Glamour', - 'wmagazine': u'W Magazine', - 'vanityfair': u'Vanity Fair', - } - - _VALID_URL = r'http://(video|www).(?P%s).com/(?Pwatch|series|video)/(?P.+)' % '|'.join(_SITES.keys()) - IE_DESC = u'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) - - _TEST = { - u'url': u'http://video.wired.com/watch/3d-printed-speakers-lit-with-led', - u'file': u'5171b343c2b4c00dd0c1ccb3.mp4', - u'md5': u'1921f713ed48aabd715691f774c451f7', - u'info_dict': { - u'title': u'3D Printed Speakers Lit With LED', - u'description': u'Check out these beautiful 3D printed LED speakers. You can\'t actually buy them, but LumiGeek is working on a board that will let you make you\'re own.', - } + _SITES = { + 'allure': 'Allure', + 'architecturaldigest': 'Architectural Digest', + 'arstechnica': 'Ars Technica', + 'bonappetit': 'Bon Appétit', + 'brides': 'Brides', + 'cnevids': 'Condé Nast', + 'cntraveler': 'Condé Nast Traveler', + 'details': 'Details', + 'epicurious': 'Epicurious', + 'glamour': 'Glamour', + 'golfdigest': 'Golf Digest', + 'gq': 'GQ', + 'newyorker': 'The New Yorker', + 'self': 'SELF', + 'teenvogue': 'Teen Vogue', + 'vanityfair': 'Vanity Fair', + 'vogue': 'Vogue', + 'wired': 'WIRED', + 'wmagazine': 'W Magazine', } + _VALID_URL = r'''(?x)https?://(?:video|www|player(?:-backend)?)\.(?:%s)\.com/ + (?: + (?: + embed(?:js)?| + (?:script|inline)/video + )/(?P[0-9a-f]{24})(?:/(?P[0-9a-f]{24}))?(?:.+?\btarget=(?P[^&]+))?| + (?Pwatch|series|video)/(?P[^/?#]+) + )''' % '|'.join(_SITES.keys()) + IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) + + EMBED_URL = r'(?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?' % '|'.join(_SITES.keys()) + + _TESTS = [{ + 'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led', + 'md5': '1921f713ed48aabd715691f774c451f7', + 'info_dict': { + 'id': '5171b343c2b4c00dd0c1ccb3', + 'ext': 'mp4', + 'title': '3D Printed Speakers Lit With LED', + 'description': 'Check out these beautiful 3D printed LED speakers. You can\'t actually buy them, but LumiGeek is working on a board that will let you make you\'re own.', + 'uploader': 'wired', + 'upload_date': '20130314', + 'timestamp': 1363219200, + } + }, { + 'url': 'http://video.gq.com/watch/the-closer-with-keith-olbermann-the-only-true-surprise-trump-s-an-idiot?c=series', + 'info_dict': { + 'id': '58d1865bfd2e6126e2000015', + 'ext': 'mp4', + 'title': 'The Only True Surprise? Trump’s an Idiot', + 'uploader': 'gq', + 'upload_date': '20170321', + 'timestamp': 1490126427, + }, + }, { + # JS embed + 'url': 'http://player.cnevids.com/embedjs/55f9cf8b61646d1acf00000c/5511d76261646d5566020000.js', + 'md5': 'f1a6f9cafb7083bab74a710f65d08999', + 'info_dict': { + 'id': '55f9cf8b61646d1acf00000c', + 'ext': 'mp4', + 'title': '3D printed TSA Travel Sentry keys really do open TSA locks', + 'uploader': 'arstechnica', + 'upload_date': '20150916', + 'timestamp': 1442434955, + } + }, { + 'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player', + 'only_matching': True, + }, { + 'url': 'http://player-backend.cnevids.com/script/video/59138decb57ac36b83000005.js', + 'only_matching': True, + }] + def _extract_series(self, url, webpage): - title = self._html_search_regex(r'
.*?

(.+?)

', - webpage, u'series title', flags=re.DOTALL) + title = self._html_search_regex( + r'(?s)
.*?

(.+?)

', + webpage, 'series title') url_object = compat_urllib_parse_urlparse(url) base_url = '%s://%s' % (url_object.scheme, url_object.netloc) - m_paths = re.finditer(r'

.*?.*?(.+?)

', - r'
(.+?)
', - ], - webpage, u'description', - fatal=False, flags=re.DOTALL) - params = self._search_regex(r'var params = {(.+?)}[;,]', webpage, - u'player params', flags=re.DOTALL) - video_id = self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, u'video id') - player_id = self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, u'player id') - target = self._search_regex(r'target: [\'"](.+?)[\'"]', params, u'target') - data = compat_urllib_parse.urlencode({'videoId': video_id, - 'playerId': player_id, - 'target': target, - }) - base_info_url = self._search_regex(r'url = [\'"](.+?)[\'"][,;]', - webpage, u'base info url', - default='http://player.cnevids.com/player/loader.js?') - info_url = base_info_url + data - info_page = self._download_webpage(info_url, video_id, - u'Downloading video info') - video_info = self._search_regex(r'var video = ({.+?});', info_page, u'video info') - video_info = json.loads(video_info) - - def _formats_sort_key(f): - type_ord = 1 if f['type'] == 'video/mp4' else 0 - quality_ord = 1 if f['quality'] == 'high' else 0 - return (quality_ord, type_ord) - best_format = sorted(video_info['sources'][0], key=_formats_sort_key)[-1] - - return {'id': video_id, - 'url': best_format['src'], - 'ext': best_format['type'].split('/')[-1], - 'title': video_info['title'], - 'thumbnail': video_info['poster_frame'], - 'description': description, - } + def _extract_video_params(self, webpage, display_id): + query = self._parse_json( + self._search_regex( + r'(?s)var\s+params\s*=\s*({.+?})[;,]', webpage, 'player params', + default='{}'), + display_id, transform_source=js_to_json, fatal=False) + if query: + query['videoId'] = self._search_regex( + r'(?:data-video-id=|currentVideoId\s*=\s*)["\']([\da-f]+)', + webpage, 'video id', default=None) + else: + params = extract_attributes(self._search_regex( + r'(<[^>]+data-js="video-player"[^>]+>)', + webpage, 'player params element')) + query.update({ + 'videoId': params['data-video'], + 'playerId': params['data-player'], + 'target': params['id'], + }) + return query + + def _extract_video(self, params): + video_id = params['videoId'] + + video_info = None + + # New API path + query = params.copy() + query['embedType'] = 'inline' + info_page = self._download_json( + 'http://player.cnevids.com/embed-api.json', video_id, + 'Downloading embed info', fatal=False, query=query) + + # Old fallbacks + if not info_page: + if params.get('playerId'): + info_page = self._download_json( + 'http://player.cnevids.com/player/video.js', video_id, + 'Downloading video info', fatal=False, query=params) + if info_page: + video_info = info_page.get('video') + if not video_info: + info_page = self._download_webpage( + 'http://player.cnevids.com/player/loader.js', + video_id, 'Downloading loader info', query=params) + if not video_info: + info_page = self._download_webpage( + 'https://player.cnevids.com/inline/video/%s.js' % video_id, + video_id, 'Downloading inline info', query={ + 'target': params.get('target', 'embedplayer') + }) + + if not video_info: + video_info = self._parse_json( + self._search_regex( + r'(?s)var\s+config\s*=\s*({.+?});', info_page, 'config'), + video_id, transform_source=js_to_json)['video'] + + title = video_info['title'] + + formats = [] + for fdata in video_info['sources']: + src = fdata.get('src') + if not src: + continue + ext = mimetype2ext(fdata.get('type')) or determine_ext(src) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + quality = fdata.get('quality') + formats.append({ + 'format_id': ext + ('-%s' % quality if quality else ''), + 'url': src, + 'ext': ext, + 'quality': 1 if quality == 'high' else 0, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'thumbnail': video_info.get('poster_frame'), + 'uploader': video_info.get('brand'), + 'duration': int_or_none(video_info.get('duration')), + 'tags': video_info.get('tags'), + 'series': video_info.get('series_title'), + 'season': video_info.get('season_title'), + 'timestamp': parse_iso8601(video_info.get('premiere_date')), + 'categories': video_info.get('categories'), + } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - site = mobj.group('site') - url_type = mobj.group('type') - id = mobj.group('id') + video_id, player_id, target, url_type, display_id = re.match(self._VALID_URL, url).groups() + + if video_id: + return self._extract_video({ + 'videoId': video_id, + 'playerId': player_id, + 'target': target, + }) - self.to_screen(u'Extracting from %s with the Condé Nast extractor' % self._SITES[site]) - webpage = self._download_webpage(url, id) + webpage = self._download_webpage(url, display_id) if url_type == 'series': return self._extract_series(url, webpage) else: - return self._extract_video(webpage) + params = self._extract_video_params(webpage, display_id) + info = self._search_json_ld( + webpage, display_id, fatal=False) + info.update(self._extract_video(params)) + return info