X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fmediasite.py;h=d6eb1574065dece67e28a4b36fa43478dd48dfa3;hb=a6211d237b4e7051ca018cc09440502561fedaa7;hp=5d281684e7351f4fd88b217137e23067a9b044a6;hpb=8056c8542d008ac2f103b4fcb862dc1ad78dc8fc;p=youtube-dl diff --git a/youtube_dl/extractor/mediasite.py b/youtube_dl/extractor/mediasite.py index 5d281684e..d6eb15740 100644 --- a/youtube_dl/extractor/mediasite.py +++ b/youtube_dl/extractor/mediasite.py @@ -5,21 +5,28 @@ import re import json from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_str, + compat_urlparse, +) from ..utils import ( ExtractorError, - unsmuggle_url, - mimetype2ext, float_or_none, + mimetype2ext, + str_or_none, + try_get, + unescapeHTML, + unsmuggle_url, + url_or_none, + urljoin, ) +_ID_RE = r'(?:[0-9a-f]{32,34}|[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12,14})' + + class MediasiteIE(InfoExtractor): - _VALID_URL = r'''(?xi) - https?://[a-z0-9\-\.:\[\]]+/Mediasite/Play/ - (?P[0-9a-f]{32,34}) - (?P\?[^#]+|) - ''' + _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/(?:Play|Showcase/(?:default|livebroadcast)/Presentation)/(?P%s)(?P\?[^#]+|)' % _ID_RE _TESTS = [ { 'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d', @@ -82,72 +89,114 @@ class MediasiteIE(InfoExtractor): 'timestamp': 1333983600, 'duration': 7794, } + }, + { + 'url': 'https://collegerama.tudelft.nl/Mediasite/Showcase/livebroadcast/Presentation/ada7020854f743c49fbb45c9ec7dbb351d', + 'only_matching': True, + }, + { + 'url': 'https://mediasite.ntnu.no/Mediasite/Showcase/default/Presentation/7d8b913259334b688986e970fae6fcb31d', + 'only_matching': True, + }, + { + # dashed id + 'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271-681e-4f19-9af3-c60d1f82869b1d', + 'only_matching': True, } ] # look in Mediasite.Core.js (Mediasite.ContentStreamType[*]) _STREAM_TYPES = { - 0: 'video1', # the main video + 0: 'video1', # the main video 2: 'slide', 3: 'presentation', - 4: 'video2', # screencast? + 4: 'video2', # screencast? 5: 'video3', } + @staticmethod + def _extract_urls(webpage): + return [ + unescapeHTML(mobj.group('url')) + for mobj in re.finditer( + r'(?xi)]+\bsrc=(["\'])(?P(?:(?:https?:)?//[^/]+)?/Mediasite/Play/%s(?:\?.*?)?)\1' % _ID_RE, + webpage)] + def _real_extract(self, url): url, data = unsmuggle_url(url, {}) mobj = re.match(self._VALID_URL, url) - ResourceId = mobj.group('id') - QueryString = mobj.group('QueryString') + resource_id = mobj.group('id') + query = mobj.group('query') - webpage = self._download_webpage(url, ResourceId) # XXX: add UrlReferrer? + webpage, urlh = self._download_webpage_handle(url, resource_id) # XXX: add UrlReferrer? + redirect_url = urlh.geturl() # XXX: might have also extracted UrlReferrer and QueryString from the html - ServicePath = compat_urlparse.urljoin(url, self._html_search_regex( - r'
(.+?)
', webpage, ResourceId, + service_path = compat_urlparse.urljoin(redirect_url, self._html_search_regex( + r']+\bid=["\']ServicePath[^>]+>(.+?)', webpage, resource_id, default='/Mediasite/PlayerService/PlayerService.svc/json')) - PlayerOptions = self._download_json( - '%s/GetPlayerOptions' % (ServicePath), ResourceId, + player_options = self._download_json( + '%s/GetPlayerOptions' % service_path, resource_id, headers={ 'Content-type': 'application/json; charset=utf-8', 'X-Requested-With': 'XMLHttpRequest', }, data=json.dumps({ 'getPlayerOptionsRequest': { - 'ResourceId': ResourceId, - 'QueryString': QueryString, + 'ResourceId': resource_id, + 'QueryString': query, 'UrlReferrer': data.get('UrlReferrer', ''), 'UseScreenReader': False, } - }).encode('utf-8')) - Presentation = PlayerOptions['d']['Presentation'] - if Presentation is None: - raise ExtractorError('Mediasite says: %s' % - (PlayerOptions['d']['PlayerPresentationStatusMessage'],), + }).encode('utf-8'))['d'] + + presentation = player_options['Presentation'] + title = presentation['Title'] + + if presentation is None: + raise ExtractorError( + 'Mediasite says: %s' % player_options['PlayerPresentationStatusMessage'], expected=True) thumbnails = [] formats = [] - for snum, Stream in enumerate(Presentation['Streams']): - stream_type = self._STREAM_TYPES.get( - Stream['StreamType'], 'type%u' % Stream['StreamType']) + for snum, Stream in enumerate(presentation['Streams']): + stream_type = Stream.get('StreamType') + if stream_type is None: + continue + + video_urls = Stream.get('VideoUrls') + if not isinstance(video_urls, list): + video_urls = [] + + stream_id = self._STREAM_TYPES.get( + stream_type, 'type%u' % stream_type) stream_formats = [] - for unum, VideoUrl in enumerate(Stream['VideoUrls']): - url = VideoUrl['Location'] + for unum, VideoUrl in enumerate(video_urls): + video_url = url_or_none(VideoUrl.get('Location')) + if not video_url: + continue # XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS - if VideoUrl['MediaType'] == 'SS': + media_type = VideoUrl.get('MediaType') + if media_type == 'SS': stream_formats.extend(self._extract_ism_formats( - url, ResourceId, ism_id='%s-%u.%u' % (stream_type, snum, unum))) - continue - - stream_formats.append({ - 'format_id': '%s-%u.%u' % (stream_type, snum, unum), - 'url': url, - 'ext': mimetype2ext(VideoUrl['MimeType']), - }) + video_url, resource_id, + ism_id='%s-%u.%u' % (stream_id, snum, unum), + fatal=False)) + elif media_type == 'Dash': + stream_formats.extend(self._extract_mpd_formats( + video_url, resource_id, + mpd_id='%s-%u.%u' % (stream_id, snum, unum), + fatal=False)) + else: + stream_formats.append({ + 'format_id': '%s-%u.%u' % (stream_id, snum, unum), + 'url': video_url, + 'ext': mimetype2ext(VideoUrl.get('MimeType')), + }) # TODO: if Stream['HasSlideContent']: # synthesise an MJPEG video stream '%s-%u.slides' % (stream_type, snum) @@ -155,16 +204,16 @@ class MediasiteIE(InfoExtractor): # this will require writing a custom downloader... # disprefer 'secondary' streams - if Stream['StreamType'] != 0: + if stream_type != 0: for fmt in stream_formats: fmt['preference'] = -1 - ThumbnailUrl = Stream.get('ThumbnailUrl') - if ThumbnailUrl: + thumbnail_url = Stream.get('ThumbnailUrl') + if thumbnail_url: thumbnails.append({ - 'id': '%s-%u' % (stream_type, snum), - 'url': compat_urlparse.urljoin(url, ThumbnailUrl), - 'preference': -1 if Stream['StreamType'] != 0 else 0, + 'id': '%s-%u' % (stream_id, snum), + 'url': urljoin(redirect_url, thumbnail_url), + 'preference': -1 if stream_type != 0 else 0, }) formats.extend(stream_formats) @@ -174,11 +223,144 @@ class MediasiteIE(InfoExtractor): # XXX: Presentation['Transcript'] return { - 'id': ResourceId, - 'title': Presentation['Title'], - 'description': Presentation.get('Description'), - 'duration': float_or_none(Presentation.get('Duration'), 1000), - 'timestamp': float_or_none(Presentation.get('UnixTime'), 1000), + 'id': resource_id, + 'title': title, + 'description': presentation.get('Description'), + 'duration': float_or_none(presentation.get('Duration'), 1000), + 'timestamp': float_or_none(presentation.get('UnixTime'), 1000), 'formats': formats, 'thumbnails': thumbnails, } + + +class MediasiteCatalogIE(InfoExtractor): + _VALID_URL = r'''(?xi) + (?Phttps?://[^/]+/Mediasite) + /Catalog/Full/ + (?P{0}) + (?: + /(?P{0}) + /(?P{0}) + )? + '''.format(_ID_RE) + _TESTS = [{ + 'url': 'http://events7.mediasite.com/Mediasite/Catalog/Full/631f9e48530d454381549f955d08c75e21', + 'info_dict': { + 'id': '631f9e48530d454381549f955d08c75e21', + 'title': 'WCET Summit: Adaptive Learning in Higher Ed: Improving Outcomes Dynamically', + }, + 'playlist_count': 6, + 'expected_warnings': ['is not a supported codec'], + }, { + # with CurrentFolderId and RootDynamicFolderId + 'url': 'https://medaudio.medicine.iu.edu/Mediasite/Catalog/Full/9518c4a6c5cf4993b21cbd53e828a92521/97a9db45f7ab47428c77cd2ed74bb98f14/9518c4a6c5cf4993b21cbd53e828a92521', + 'info_dict': { + 'id': '9518c4a6c5cf4993b21cbd53e828a92521', + 'title': 'IUSM Family and Friends Sessions', + }, + 'playlist_count': 2, + }, { + 'url': 'http://uipsyc.mediasite.com/mediasite/Catalog/Full/d5d79287c75243c58c50fef50174ec1b21', + 'only_matching': True, + }, { + # no AntiForgeryToken + 'url': 'https://live.libraries.psu.edu/Mediasite/Catalog/Full/8376d4b24dd1457ea3bfe4cf9163feda21', + 'only_matching': True, + }, { + 'url': 'https://medaudio.medicine.iu.edu/Mediasite/Catalog/Full/9518c4a6c5cf4993b21cbd53e828a92521/97a9db45f7ab47428c77cd2ed74bb98f14/9518c4a6c5cf4993b21cbd53e828a92521', + 'only_matching': True, + }, { + # dashed id + 'url': 'http://events7.mediasite.com/Mediasite/Catalog/Full/631f9e48-530d-4543-8154-9f955d08c75e', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + mediasite_url = mobj.group('url') + catalog_id = mobj.group('catalog_id') + current_folder_id = mobj.group('current_folder_id') or catalog_id + root_dynamic_folder_id = mobj.group('root_dynamic_folder_id') + + webpage = self._download_webpage(url, catalog_id) + + # AntiForgeryToken is optional (e.g. [1]) + # 1. https://live.libraries.psu.edu/Mediasite/Catalog/Full/8376d4b24dd1457ea3bfe4cf9163feda21 + anti_forgery_token = self._search_regex( + r'AntiForgeryToken\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'anti forgery token', default=None, group='value') + if anti_forgery_token: + anti_forgery_header = self._search_regex( + r'AntiForgeryHeaderName\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', + webpage, 'anti forgery header name', + default='X-SOFO-AntiForgeryHeader', group='value') + + data = { + 'IsViewPage': True, + 'IsNewFolder': True, + 'AuthTicket': None, + 'CatalogId': catalog_id, + 'CurrentFolderId': current_folder_id, + 'RootDynamicFolderId': root_dynamic_folder_id, + 'ItemsPerPage': 1000, + 'PageIndex': 0, + 'PermissionMask': 'Execute', + 'CatalogSearchType': 'SearchInFolder', + 'SortBy': 'Date', + 'SortDirection': 'Descending', + 'StartDate': None, + 'EndDate': None, + 'StatusFilterList': None, + 'PreviewKey': None, + 'Tags': [], + } + + headers = { + 'Content-Type': 'application/json; charset=UTF-8', + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + } + if anti_forgery_token: + headers[anti_forgery_header] = anti_forgery_token + + catalog = self._download_json( + '%s/Catalog/Data/GetPresentationsForFolder' % mediasite_url, + catalog_id, data=json.dumps(data).encode(), headers=headers) + + entries = [] + for video in catalog['PresentationDetailsList']: + if not isinstance(video, dict): + continue + video_id = str_or_none(video.get('Id')) + if not video_id: + continue + entries.append(self.url_result( + '%s/Play/%s' % (mediasite_url, video_id), + ie=MediasiteIE.ie_key(), video_id=video_id)) + + title = try_get( + catalog, lambda x: x['CurrentFolder']['Name'], compat_str) + + return self.playlist_result(entries, catalog_id, title,) + + +class MediasiteNamedCatalogIE(InfoExtractor): + _VALID_URL = r'(?xi)(?Phttps?://[^/]+/Mediasite)/Catalog/catalogs/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://msite.misis.ru/Mediasite/Catalog/catalogs/2016-industrial-management-skriabin-o-o', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + mediasite_url = mobj.group('url') + catalog_name = mobj.group('catalog_name') + + webpage = self._download_webpage(url, catalog_name) + + catalog_id = self._search_regex( + r'CatalogId\s*:\s*["\'](%s)' % _ID_RE, webpage, 'catalog id') + + return self.url_result( + '%s/Catalog/Full/%s' % (mediasite_url, catalog_id), + ie=MediasiteCatalogIE.ie_key(), video_id=catalog_id)