X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fslideshare.py;h=e89ebebe7567ebb0abae4a15b21b100339c64b44;hb=HEAD;hp=afc3001b57f404486e2fa3a9c911bf4eec663b46;hpb=94518f208799dde250f5fd8dd0ce77dd7ea05b1c;p=youtube-dl diff --git a/youtube_dl/extractor/slideshare.py b/youtube_dl/extractor/slideshare.py index afc3001b5..e89ebebe7 100644 --- a/youtube_dl/extractor/slideshare.py +++ b/youtube_dl/extractor/slideshare.py @@ -1,22 +1,28 @@ +from __future__ import unicode_literals + import re import json from .common import InfoExtractor -from ..utils import ( +from ..compat import ( compat_urlparse, +) +from ..utils import ( ExtractorError, + get_element_by_id, ) class SlideshareIE(InfoExtractor): - _VALID_URL = r'https?://www\.slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)' + _VALID_URL = r'https?://(?:www\.)?slideshare\.net/[^/]+?/(?P<title>.+?)($|\?)' _TEST = { - u'url': u'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity', - u'file': u'25665706.mp4', - u'info_dict': { - u'title': u'Managing Scale and Complexity', - u'description': u'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). This presentation was given by Adrian Cockcroft from Netflix', + 'url': 'http://www.slideshare.net/Dataversity/keynote-presentation-managing-scale-and-complexity', + 'info_dict': { + 'id': '25665706', + 'ext': 'mp4', + 'title': 'Managing Scale and Complexity', + 'description': 'This was a keynote presentation at the NoSQL Now! 2013 Conference & Expo (http://www.nosqlnow.com). 
This presentation was given by Adrian Cockcroft from Netflix.', }, } @@ -25,16 +31,19 @@ class SlideshareIE(InfoExtractor): page_title = mobj.group('title') webpage = self._download_webpage(url, page_title) slideshare_obj = self._search_regex( - r'var slideshare_object = ({.*?}); var user_info =', - webpage, u'slideshare object') + r'\$\.extend\(.*?slideshare_object,\s*(\{.*?\})\);', + webpage, 'slideshare object') info = json.loads(slideshare_obj) - if info['slideshow']['type'] != u'video': - raise ExtractorError(u'Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True) + if info['slideshow']['type'] != 'video': + raise ExtractorError('Webpage type is "%s": only video extraction is supported for Slideshare' % info['slideshow']['type'], expected=True) doc = info['doc'] bucket = info['jsplayer']['video_bucket'] ext = info['jsplayer']['video_extension'] video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' + ext) + description = get_element_by_id('slideshow-description-paragraph', webpage) or self._html_search_regex( + r'(?s)<p[^>]+itemprop="description"[^>]*>(.+?)</p>', webpage, + 'description', fatal=False) return { '_type': 'video', @@ -43,5 +52,5 @@ class SlideshareIE(InfoExtractor): 'ext': ext, 'url': video_url, 'thumbnail': info['slideshow']['pin_image_url'], - 'description': self._og_search_description(webpage), + 'description': description.strip() if description else None, }