From: remitamine Date: Sat, 12 Mar 2016 16:28:54 +0000 (+0100) Subject: Merge pull request #8827 from remitamine/safari X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=commitdiff_plain;h=0ec589fac376539edafe48b9d712a5a920ba5f34;hp=bcb668de189b92bd1dcad661d50ae981f25deca4;p=youtube-dl Merge pull request #8827 from remitamine/safari [safari] extract free and preview videos(#7491) --- diff --git a/AUTHORS b/AUTHORS index b6b47ac57..aa48cd5a6 100644 --- a/AUTHORS +++ b/AUTHORS @@ -162,3 +162,4 @@ Robin Houtevelts Patrick Griffis Aidan Rowe mutantmonkey +Ben Congdon diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 98de5ddff..c5ca01ee7 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -54,6 +54,7 @@ from .arte import ( from .atresplayer import AtresPlayerIE from .atttechchannel import ATTTechChannelIE from .audimedia import AudiMediaIE +from .audioboom import AudioBoomIE from .audiomack import AudiomackIE, AudiomackAlbumIE from .azubu import AzubuIE, AzubuLiveIE from .baidu import BaiduVideoIE @@ -188,6 +189,10 @@ from .dumpert import DumpertIE from .defense import DefenseGouvFrIE from .discovery import DiscoveryIE from .dropbox import DropboxIE +from .dw import ( + DWIE, + DWArticleIE, +) from .eagleplatform import EaglePlatformIE from .ebaumsworld import EbaumsWorldIE from .echomsk import EchoMskIE diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py new file mode 100644 index 000000000..2ec2d7092 --- /dev/null +++ b/youtube_dl/extractor/audioboom.py @@ -0,0 +1,66 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import float_or_none + + +class AudioBoomIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?audioboom\.com/boos/(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://audioboom.com/boos/4279833-3-09-2016-czaban-hour-3?t=0', + 'md5': '63a8d73a055c6ed0f1e51921a10a5a76', + 'info_dict': { + 'id': '4279833', 
+ 'ext': 'mp3', + 'title': '3/09/2016 Czaban Hour 3', + 'description': 'Guest: Nate Davis - NFL free agency, Guest: Stan Gans', + 'duration': 2245.72, + 'uploader': 'Steve Czaban', + 'uploader_url': 're:https?://(?:www\.)?audioboom\.com/channel/steveczabanyahoosportsradio', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + clip = None + + clip_store = self._parse_json( + self._search_regex( + r'data-new-clip-store=(["\'])(?P<json>{.*?"clipId"\s*:\s*%s.*?})\1' % video_id, + webpage, 'clip store', default='{}', group='json'), + video_id, fatal=False) + if clip_store: + clips = clip_store.get('clips') + if clips and isinstance(clips, list) and isinstance(clips[0], dict): + clip = clips[0] + + def from_clip(field): + if clip: + clip.get(field) + + audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property( + 'audio', webpage, 'audio url') + title = from_clip('title') or self._og_search_title(webpage) + description = from_clip('description') or self._og_search_description(webpage) + + duration = float_or_none(from_clip('duration') or self._html_search_meta( + 'weibo:audio:duration', webpage)) + + uploader = from_clip('author') or self._og_search_property( + 'audio:artist', webpage, 'uploader', fatal=False) + uploader_url = from_clip('author_url') or self._html_search_meta( + 'audioboo:channel', webpage, 'uploader url') + + return { + 'id': video_id, + 'url': audio_url, + 'title': title, + 'description': description, + 'duration': duration, + 'uploader': uploader, + 'uploader_url': uploader_url, + } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index bfa9c82f6..0b8b906ab 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -48,6 +48,7 @@ from ..utils import ( determine_protocol, parse_duration, mimetype2ext, + update_url_query, ) @@ -345,7 +346,7 @@ class InfoExtractor(object): def IE_NAME(self): return 
compat_str(type(self).__name__[:-2]) - def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None): """ Returns the response handle """ if note is None: self.report_download_webpage(video_id) @@ -354,6 +355,12 @@ class InfoExtractor(object): self.to_screen('%s' % (note,)) else: self.to_screen('%s: %s' % (video_id, note)) + # data, headers and query params will be ignored for `Request` objects + if isinstance(url_or_request, compat_str): + if query: + url_or_request = update_url_query(url_or_request, query) + if data or headers: + url_or_request = sanitized_Request(url_or_request, data, headers or {}) try: return self._downloader.urlopen(url_or_request) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: @@ -369,13 +376,13 @@ class InfoExtractor(object): self._downloader.report_warning(errmsg) return False - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None): + def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers=None, query=None): """ Returns a tuple (page content as string, URL handle) """ # Strip hashes from the URL (#1038) if isinstance(url_or_request, (compat_str, str)): url_or_request = url_or_request.partition('#')[0] - urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal) + urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query) if urlh is False: assert not fatal return False @@ -462,13 +469,13 @@ class InfoExtractor(object): return content - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None): + def _download_webpage(self, url_or_request, video_id, note=None, 
errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers=None, query=None): """ Returns the data of the page as a string """ success = False try_count = 0 while success is False: try: - res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding) + res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query) success = True except compat_http_client.IncompleteRead as e: try_count += 1 @@ -483,10 +490,10 @@ class InfoExtractor(object): def _download_xml(self, url_or_request, video_id, note='Downloading XML', errnote='Unable to download XML', - transform_source=None, fatal=True, encoding=None): + transform_source=None, fatal=True, encoding=None, data=None, headers=None, query=None): """Return the xml as an xml.etree.ElementTree.Element""" xml_string = self._download_webpage( - url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding) + url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query) if xml_string is False: return xml_string if transform_source: @@ -497,10 +504,10 @@ class InfoExtractor(object): note='Downloading JSON metadata', errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None): + fatal=True, encoding=None, data=None, headers=None, query=None): json_string = self._download_webpage( url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding) + encoding=encoding, data=data, headers=headers, query=query) if (not fatal) and json_string is False: return None return self._parse_json( @@ -1140,8 +1147,8 @@ class InfoExtractor(object): out.append('{%s}%s' % (namespace, c)) return '/'.join(out) - def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None): - smil = self._download_smil(smil_url, video_id, fatal=fatal) + def _extract_smil_formats(self, smil_url, video_id, 
fatal=True, f4m_params=None, transform_source=None): + smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source) if smil is False: assert not fatal @@ -1158,10 +1165,10 @@ class InfoExtractor(object): return {} return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params) - def _download_smil(self, smil_url, video_id, fatal=True): + def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None): return self._download_xml( smil_url, video_id, 'Downloading SMIL file', - 'Unable to download SMIL file', fatal=fatal) + 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source) def _parse_smil(self, smil, smil_url, video_id, f4m_params=None): namespace = self._parse_smil_namespace(smil) @@ -1447,8 +1454,9 @@ class InfoExtractor(object): continue representation_attrib = adaptation_set.attrib.copy() representation_attrib.update(representation.attrib) - mime_type = representation_attrib.get('mimeType') - content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType') + # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory + mime_type = representation_attrib['mimeType'] + content_type = mime_type.split('/')[0] if content_type == 'text': # TODO implement WebVTT downloading pass @@ -1471,6 +1479,7 @@ class InfoExtractor(object): f = { 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id, 'url': base_url, + 'ext': mimetype2ext(mime_type), 'width': int_or_none(representation_attrib.get('width')), 'height': int_or_none(representation_attrib.get('height')), 'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000), diff --git a/youtube_dl/extractor/dw.py b/youtube_dl/extractor/dw.py new file mode 100644 index 000000000..b6c985547 --- /dev/null +++ b/youtube_dl/extractor/dw.py @@ -0,0 +1,85 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import 
int_or_none +from ..compat import compat_urlparse + + +class DWIE(InfoExtractor): + IE_NAME = 'dw' + _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+av-(?P<id>\d+)' + _TESTS = [{ + # video + 'url': 'http://www.dw.com/en/intelligent-light/av-19112290', + 'md5': '7372046e1815c5a534b43f3c3c36e6e9', + 'info_dict': { + 'id': '19112290', + 'ext': 'mp4', + 'title': 'Intelligent light', + 'description': 'md5:90e00d5881719f2a6a5827cb74985af1', + 'upload_date': '20160311', + } + }, { + # audio + 'url': 'http://www.dw.com/en/worldlink-my-business/av-19111941', + 'md5': '2814c9a1321c3a51f8a7aeb067a360dd', + 'info_dict': { + 'id': '19111941', + 'ext': 'mp3', + 'title': 'WorldLink: My business', + 'description': 'md5:bc9ca6e4e063361e21c920c53af12405', + 'upload_date': '20160311', + } + }] + + def _real_extract(self, url): + media_id = self._match_id(url) + webpage = self._download_webpage(url, media_id) + hidden_inputs = self._hidden_inputs(webpage) + title = hidden_inputs['media_title'] + + formats = [] + if hidden_inputs.get('player_type') == 'video' and hidden_inputs.get('stream_file') == '1': + formats = self._extract_smil_formats( + 'http://www.dw.com/smil/v-%s' % media_id, media_id, + transform_source=lambda s: s.replace( + 'rtmp://tv-od.dw.de/flash/', + 'http://tv-download.dw.de/dwtv_video/flv/')) + else: + formats = [{'url': hidden_inputs['file_name']}] + + return { + 'id': media_id, + 'title': title, + 'description': self._og_search_description(webpage), + 'thumbnail': hidden_inputs.get('preview_image'), + 'duration': int_or_none(hidden_inputs.get('file_duration')), + 'upload_date': hidden_inputs.get('display_date'), + 'formats': formats, + } + + +class DWArticleIE(InfoExtractor): + IE_NAME = 'dw:article' + _VALID_URL = r'https?://(?:www\.)?dw\.com/(?:[^/]+/)+a-(?P<id>\d+)' + _TEST = { + 'url': 'http://www.dw.com/en/no-hope-limited-options-for-refugees-in-idomeni/a-19111009', + 'md5': '8ca657f9d068bbef74d6fc38b97fc869', + 'info_dict': { + 'id': '19105868', + 'ext': 'mp4', 
+ 'title': 'The harsh life of refugees in Idomeni', + 'description': 'md5:196015cc7e48ebf474db9399420043c7', + 'upload_date': '20160310', + } + } + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + hidden_inputs = self._hidden_inputs(webpage) + media_id = hidden_inputs['media_id'] + media_path = self._search_regex(r'href="([^"]+av-%s)"\s+class="overlayLink"' % media_id, webpage, 'media url') + media_url = compat_urlparse.urljoin(url, media_path) + return self.url_result(media_url, 'DW', media_id) diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 5e8589479..f5bbd39d2 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -38,7 +38,8 @@ class FacebookIE(InfoExtractor): story\.php )\?(?:.*?)(?:v|video_id|story_fbid)=| [^/]+/videos/(?:[^/]+/)?| - [^/]+/posts/ + [^/]+/posts/| + groups/[^/]+/permalink/ )| facebook: ) @@ -123,6 +124,9 @@ class FacebookIE(InfoExtractor): }, { 'url': 'facebook:544765982287235', 'only_matching': True, + }, { + 'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/', + 'only_matching': True, }] def _login(self): diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index 37be34091..766fc26d0 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -10,8 +10,8 @@ from ..utils import ( class GoogleDriveIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28})' - _TEST = { + _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})' + _TESTS = [{ 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', 'md5': '881f7700aec4f538571fa1e0eed4a7b6', 'info_dict': { @@ -20,7 +20,11 @@ class 
GoogleDriveIE(InfoExtractor): 'title': 'Big Buck Bunny.mp4', 'duration': 46, } - } + }, { + # video id is longer than 28 characters + 'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit', + 'only_matching': True, + }] _FORMATS_EXT = { '5': 'flv', '6': 'flv', @@ -43,7 +47,7 @@ class GoogleDriveIE(InfoExtractor): @staticmethod def _extract_url(webpage): mobj = re.search( - r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})', + r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})', webpage) if mobj: return 'https://drive.google.com/file/d/%s' % mobj.group('id')