X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fgeneric.py;h=a4aef106fb7f1f51ae568f7e6082f97a9eb61558;hb=bd2c211fcc3b0390a1b404fa01409d455859caa8;hp=495fa497544ba4e2a61aa905a69e717b472c003d;hpb=4eec112740910621a7fd9c50158fb2388649d8b7;p=youtube-dl diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 495fa4975..a4aef106f 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -77,11 +77,10 @@ from .instagram import InstagramIE from .liveleak import LiveLeakIE from .threeqsdn import ThreeQSDNIE from .theplatform import ThePlatformIE -from .vessel import VesselIE from .kaltura import KalturaIE from .eagleplatform import EaglePlatformIE from .facebook import FacebookIE -from .soundcloud import SoundcloudIE +from .soundcloud import SoundcloudEmbedIE from .tunein import TuneInBaseIE from .vbox7 import Vbox7IE from .dbtv import DBTVIE @@ -89,10 +88,6 @@ from .piksel import PikselIE from .videa import VideaIE from .twentymin import TwentyMinutenIE from .ustream import UstreamIE -from .openload import ( - OpenloadIE, - VerystreamIE, -) from .videopress import VideoPressIE from .rutube import RutubeIE from .limelight import LimelightBaseIE @@ -119,6 +114,8 @@ from .foxnews import FoxNewsIE from .viqeo import ViqeoIE from .expressen import ExpressenIE from .zype import ZypeIE +from .odnoklassniki import OdnoklassnikiIE +from .kinja import KinjaEmbedIE class GenericIE(InfoExtractor): @@ -1487,16 +1484,18 @@ class GenericIE(InfoExtractor): 'timestamp': 1432570283, }, }, - # OnionStudios embed + # Kinja embed { 'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537', 'info_dict': { - 'id': '2855', + 'id': '106351', 'ext': 'mp4', 'title': 'Don’t Understand Bitcoin? This Man Will Mumble An Explanation At You', + 'description': 'Migrated from OnionStudios', 'thumbnail': r're:^https?://.*\.jpe?g$', - 'uploader': 'ClickHole', - 'uploader_id': 'clickhole', + 'uploader': 'clickhole', + 'upload_date': '20150527', + 'timestamp': 1432744860, } }, # SnagFilms embed @@ -2075,6 +2074,22 @@ class GenericIE(InfoExtractor): }, 'playlist_count': 6, }, + { + # Squarespace video embed, 2019-08-28 + 'url': 'http://ootboxford.com', + 'info_dict': { + 'id': 'Tc7b_JGdZfw', + 'title': 'Out of the Blue, at Childish Things 10', + 'ext': 'mp4', + 'description': 'md5:a83d0026666cf5ee970f8bd1cfd69c7f', + 'uploader_id': 'helendouglashouse', + 'uploader': 'Helen & Douglas House', + 'upload_date': '20140328', + }, + 'params': { + 'skip_download': True, + }, + }, { # Zype embed 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites', @@ -2104,6 +2119,23 @@ class GenericIE(InfoExtractor): }, 'expected_warnings': ['Failed to download MPD manifest'], }, + { + # DailyMotion embed with DM.player + 'url': 'https://www.beinsports.com/us/copa-del-rey/video/the-locker-room-valencia-beat-barca-in-copa/1203804', + 'info_dict': { + 'id': 'k6aKkGHd9FJs4mtJN39', + 'ext': 'mp4', + 'title': 'The Locker Room: Valencia Beat Barca In Copa del Rey Final', + 'description': 'This video is private.', + 'uploader_id': 'x1jf30l', + 'uploader': 'beIN SPORTS USA', + 'upload_date': '20190528', + 'timestamp': 1559062971, + }, + 'params': { + 'skip_download': True, + }, + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -2209,7 +2241,7 @@ class GenericIE(InfoExtractor): default_search = 'fixup_error' if default_search in ('auto', 'auto_warning', 'fixup_error'): - if '/' in url: + if re.match(r'^[^\s/]+\.[^\s/]+/', url): self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http') return self.url_result('http://' + url) elif default_search != 'fixup_error': @@ -2378,6 +2410,12 @@ class GenericIE(InfoExtractor): # Unescaping the whole page allows to handle those cases in a generic way webpage = compat_urllib_parse_unquote(webpage) + # Unescape squarespace embeds to be detected by generic extractor, + # see https://github.com/ytdl-org/youtube-dl/issues/21294 + webpage = re.sub( + r']+class=[^>]*?\bsqs-video-wrapper\b[^>]*>', + lambda x: unescapeHTML(x.group(0)), webpage) + # it's tempting to parse this further, but you would # have to take into account all the variations like # Video Title - Site Name @@ -2452,11 +2490,6 @@ class GenericIE(InfoExtractor): if tp_urls: return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform') - # Look for Vessel embeds - vessel_urls = VesselIE._extract_urls(webpage) - if vessel_urls: - return self.playlist_from_matches(vessel_urls, video_id, video_title, ie=VesselIE.ie_key()) - # Look for embedded rtl.nl player matches = re.findall( r']+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"', @@ -2549,11 +2582,11 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url')) # Look for Ooyala videos - mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P[^"&]+)', webpage) or - re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P.{32})[\'"]', webpage) or - re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P.{32})[\'"]', webpage) or - re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P.{32})[\'"]\)', webpage) or - re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P.{32})[\'"]', webpage)) + mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P[^"&]+)', webpage) + or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P.{32})[\'"]', webpage) + or re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P.{32})[\'"]', webpage) + or re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P.{32})[\'"]\)', webpage) + or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P.{32})[\'"]', webpage)) if mobj is not None: embed_token = self._search_regex( r'embedToken[\'"]?\s*:\s*[\'"]([^\'"]+)', @@ -2583,19 +2616,6 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group(1), 'Mpora') - # Look for embedded NovaMov-based player - mobj = re.search( - r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\']) - (?Phttp://(?:(?:embed|www)\.)? - (?:novamov\.com| - nowvideo\.(?:ch|sx|eu|at|ag|co)| - videoweed\.(?:es|com)| - movshare\.(?:net|sx|ag)| - divxstage\.(?:eu|net|ch|co|at|ag)) - /embed\.php.+?)\1''', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - # Look for embedded Facebook player facebook_urls = FacebookIE._extract_urls(webpage) if facebook_urls: @@ -2607,9 +2627,9 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url'), 'VK') # Look for embedded Odnoklassniki player - mobj = re.search(r']+?src=(["\'])(?Phttps?://(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Odnoklassniki') + odnoklassniki_url = OdnoklassnikiIE._extract_url(webpage) + if odnoklassniki_url: + return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key()) # Look for embedded ivi player mobj = re.search(r']+?src=(["\'])(?Phttps?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage) @@ -2728,9 +2748,9 @@ class GenericIE(InfoExtractor): return self.url_result(myvi_url) # Look for embedded soundcloud player - soundcloud_urls = SoundcloudIE._extract_urls(webpage) + soundcloud_urls = SoundcloudEmbedIE._extract_urls(webpage) if soundcloud_urls: - return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML, ie=SoundcloudIE.ie_key()) + return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML) # Look for tunein player tunein_urls = TuneInBaseIE._extract_urls(webpage) @@ -2873,6 +2893,12 @@ class GenericIE(InfoExtractor): if senate_isvp_url: return self.url_result(senate_isvp_url, 'SenateISVP') + # Look for Kinja embeds + kinja_embed_urls = KinjaEmbedIE._extract_urls(webpage, url) + if kinja_embed_urls: + return self.playlist_from_matches( + kinja_embed_urls, video_id, video_title) + # Look for OnionStudios embeds onionstudios_url = OnionStudiosIE._extract_url(webpage) if onionstudios_url: @@ -2934,7 +2960,7 @@ class GenericIE(InfoExtractor): # Look for VODPlatform embeds mobj = re.search( - r']+src=(["\'])(?P(?:https?:)?//(?:www\.)?vod-platform\.net/[eE]mbed/.+?)\1', + r']+src=(["\'])(?P(?:https?:)?//(?:(?:www\.)?vod-platform\.net|embed\.kwikmotion\.com)/[eE]mbed/.+?)\1', webpage) if mobj is not None: return self.url_result( @@ -2942,10 +2968,14 @@ class GenericIE(InfoExtractor): # Look for Mangomolo embeds mobj = re.search( - r'''(?x)]+src=(["\'])(?P(?:https?:)?//(?:www\.)?admin\.mangomolo\.com/analytics/index\.php/customers/embed/ + r'''(?x)]+src=(["\'])(?P(?:https?:)?// + (?: + admin\.mangomolo\.com/analytics/index\.php/customers/embed| + player\.mangomolo\.com/v1 + )/ (?: video\?.*?\bid=(?P\d+)| - index\?.*?\bchannelid=(?P(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+) + (?:index|live)\?.*?\bchannelid=(?P(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+) ).+?)\1''', webpage) if mobj is not None: info = { @@ -3014,18 +3044,6 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( twentymin_urls, video_id, video_title, ie=TwentyMinutenIE.ie_key()) - # Look for Openload embeds - openload_urls = OpenloadIE._extract_urls(webpage) - if openload_urls: - return self.playlist_from_matches( - openload_urls, video_id, video_title, ie=OpenloadIE.ie_key()) - - # Look for Verystream embeds - verystream_urls = VerystreamIE._extract_urls(webpage) - if verystream_urls: - return self.playlist_from_matches( - verystream_urls, video_id, video_title, ie=VerystreamIE.ie_key()) - # Look for VideoPress embeds videopress_urls = VideoPressIE._extract_urls(webpage) if videopress_urls: @@ -3221,8 +3239,8 @@ class GenericIE(InfoExtractor): else: formats.append({ 'url': src, - 'ext': (mimetype2ext(src_type) or - ext if ext in KNOWN_EXTENSIONS else 'mp4'), + 'ext': (mimetype2ext(src_type) + or ext if ext in KNOWN_EXTENSIONS else 'mp4'), }) if formats: self._sort_formats(formats)