X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcondenast.py;h=e8f2b5a07591410c16fe6fe096678a12006abe48;hb=175c2e9ec326f9ef820413837608eb4f5c8c5961;hp=91c1c1348f587798131459676ffe5444727c5c3b;hpb=0f2999fe2b352795d54e6fcc4027e6a64ce5bc1d;p=youtube-dl diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index 91c1c1348..e8f2b5a07 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -2,15 +2,17 @@ from __future__ import unicode_literals import re -import json from .common import InfoExtractor -from ..utils import ( - compat_urllib_parse, - orderedSet, +from ..compat import ( + compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, compat_urlparse, ) +from ..utils import ( + orderedSet, + remove_end, +) class CondeNastIE(InfoExtractor): @@ -22,26 +24,51 @@ class CondeNastIE(InfoExtractor): # The keys are the supported sites and the values are the name to be shown # to the user and in the extractor description. _SITES = { - 'wired': 'WIRED', + 'allure': 'Allure', + 'architecturaldigest': 'Architectural Digest', + 'arstechnica': 'Ars Technica', + 'bonappetit': 'Bon Appétit', + 'brides': 'Brides', + 'cnevids': 'Condé Nast', + 'cntraveler': 'Condé Nast Traveler', + 'details': 'Details', + 'epicurious': 'Epicurious', + 'glamour': 'Glamour', + 'golfdigest': 'Golf Digest', 'gq': 'GQ', + 'newyorker': 'The New Yorker', + 'self': 'SELF', + 'teenvogue': 'Teen Vogue', + 'vanityfair': 'Vanity Fair', 'vogue': 'Vogue', - 'glamour': 'Glamour', + 'wired': 'WIRED', 'wmagazine': 'W Magazine', - 'vanityfair': 'Vanity Fair', } - _VALID_URL = r'http://(video|www)\.(?P%s)\.com/(?Pwatch|series|video)/(?P.+)' % '|'.join(_SITES.keys()) + _VALID_URL = r'https?://(?:video|www|player)\.(?P%s)\.com/(?Pwatch|series|video|embed(?:js)?)/(?P[^/?#]+)' % '|'.join(_SITES.keys()) IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) - _TEST = { + EMBED_URL = r'(?:https?:)?//player\.(?P%s)\.com/(?Pembed(?:js)?)/.+?' % '|'.join(_SITES.keys()) + + _TESTS = [{ 'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led', - 'file': '5171b343c2b4c00dd0c1ccb3.mp4', 'md5': '1921f713ed48aabd715691f774c451f7', 'info_dict': { + 'id': '5171b343c2b4c00dd0c1ccb3', + 'ext': 'mp4', 'title': '3D Printed Speakers Lit With LED', 'description': 'Check out these beautiful 3D printed LED speakers. You can\'t actually buy them, but LumiGeek is working on a board that will let you make you\'re own.', } - } + }, { + # JS embed + 'url': 'http://player.cnevids.com/embedjs/55f9cf8b61646d1acf00000c/5511d76261646d5566020000.js', + 'md5': 'f1a6f9cafb7083bab74a710f65d08999', + 'info_dict': { + 'id': '55f9cf8b61646d1acf00000c', + 'ext': 'mp4', + 'title': '3D printed TSA Travel Sentry keys really do open TSA locks', + } + }] def _extract_series(self, url, webpage): title = self._html_search_regex(r'
.*?

(.+?)

', @@ -55,18 +82,22 @@ class CondeNastIE(InfoExtractor): entries = [self.url_result(build_url(path), 'CondeNast') for path in paths] return self.playlist_result(entries, playlist_title=title) - def _extract_video(self, webpage): - description = self._html_search_regex([r'
(.+?)
', - r'
(.+?)
', - ], - webpage, 'description', - fatal=False, flags=re.DOTALL) + def _extract_video(self, webpage, url_type): + if url_type != 'embed': + description = self._html_search_regex( + [ + r'
(.+?)
', + r'
(.+?)
', + ], + webpage, 'description', fatal=False, flags=re.DOTALL) + else: + description = None params = self._search_regex(r'var params = {(.+?)}[;,]', webpage, 'player params', flags=re.DOTALL) video_id = self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, 'video id') player_id = self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, 'player id') target = self._search_regex(r'target: [\'"](.+?)[\'"]', params, 'target') - data = compat_urllib_parse.urlencode({'videoId': video_id, + data = compat_urllib_parse_urlencode({'videoId': video_id, 'playerId': player_id, 'target': target, }) @@ -76,8 +107,8 @@ class CondeNastIE(InfoExtractor): info_url = base_info_url + data info_page = self._download_webpage(info_url, video_id, 'Downloading video info') - video_info = self._search_regex(r'var video = ({.+?});', info_page, 'video info') - video_info = json.loads(video_info) + video_info = self._search_regex(r'var\s+video\s*=\s*({.+?});', info_page, 'video info') + video_info = self._parse_json(video_info, video_id) formats = [{ 'format_id': '%s-%s' % (fdata['type'].split('/')[-1], fdata['quality']), @@ -99,12 +130,19 @@ class CondeNastIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) site = mobj.group('site') url_type = mobj.group('type') - id = mobj.group('id') + item_id = mobj.group('id') + + # Convert JS embed to regular embed + if url_type == 'embedjs': + parsed_url = compat_urlparse.urlparse(url) + url = compat_urlparse.urlunparse(parsed_url._replace( + path=remove_end(parsed_url.path, '.js').replace('/embedjs/', '/embed/'))) + url_type = 'embed' - self.to_screen(u'Extracting from %s with the Condé Nast extractor' % self._SITES[site]) - webpage = self._download_webpage(url, id) + self.to_screen('Extracting from %s with the Condé Nast extractor' % self._SITES[site]) + webpage = self._download_webpage(url, item_id) if url_type == 'series': return self._extract_series(url, webpage) else: - return self._extract_video(webpage) + return self._extract_video(webpage, url_type)