From 7ed599237daa4ecb30c9a5f001a7ecca0b61e30f Mon Sep 17 00:00:00 2001 From: Matt Corallo Date: Fri, 15 Dec 2023 23:39:57 +0000 Subject: [PATCH] Draft NEON that might work --- lightning/src/util/simd_f32.rs | 93 ++++++++++++++++++++++++++++++---- 1 file changed, 83 insertions(+), 10 deletions(-) diff --git a/lightning/src/util/simd_f32.rs b/lightning/src/util/simd_f32.rs index 04d40363f..fbc3e7951 100644 --- a/lightning/src/util/simd_f32.rs +++ b/lightning/src/util/simd_f32.rs @@ -25,28 +25,28 @@ mod non_simd { (self.3, self.2, self.1, self.0) } } - impl std::ops::Div for FourF32 { + impl core::ops::Div for FourF32 { type Output = FourF32; #[inline(always)] fn div(self, o: FourF32) -> Self { Self(self.0 / o.0, self.1 / o.1, self.2 / o.2, self.3 / o.3) } } - impl std::ops::Mul for FourF32 { + impl core::ops::Mul for FourF32 { type Output = FourF32; #[inline(always)] fn mul(self, o: FourF32) -> Self { Self(self.0 * o.0, self.1 * o.1, self.2 * o.2, self.3 * o.3) } } - impl std::ops::Add for FourF32 { + impl core::ops::Add for FourF32 { type Output = FourF32; #[inline(always)] fn add(self, o: FourF32) -> Self { Self(self.0 + o.0, self.1 + o.1, self.2 + o.2, self.3 + o.3) } } - impl std::ops::Sub for FourF32 { + impl core::ops::Sub for FourF32 { type Output = FourF32; #[inline(always)] fn sub(self, o: FourF32) -> Self { @@ -60,9 +60,9 @@ pub(crate) use non_simd::*; #[cfg(target_feature = "sse")] mod x86_sse { #[cfg(target_arch = "x86")] - use std::arch::x86::*; + use core::arch::x86::*; #[cfg(target_arch = "x86_64")] - use std::arch::x86_64::*; + use core::arch::x86_64::*; #[repr(align(16))] struct AlignedFloats([f32; 4]); @@ -102,28 +102,28 @@ mod x86_sse { (res.0[3], res.0[2], res.0[1], res.0[0]) } } - impl std::ops::Div for FourF32 { + impl core::ops::Div for FourF32 { type Output = FourF32; #[inline(always)] fn div(self, o: FourF32) -> Self { Self(unsafe { _mm_div_ps(self.0, o.0) }) } } - impl std::ops::Mul for FourF32 { + impl core::ops::Mul for FourF32 { type Output = FourF32; #[inline(always)] fn mul(self, o: FourF32) -> Self { Self(unsafe { _mm_mul_ps(self.0, o.0) }) } } - impl std::ops::Add for FourF32 { + impl core::ops::Add for FourF32 { type Output = FourF32; #[inline(always)] fn add(self, o: FourF32) -> Self { Self(unsafe { _mm_add_ps(self.0, o.0) }) } } - impl std::ops::Sub for FourF32 { + impl core::ops::Sub for FourF32 { type Output = FourF32; #[inline(always)] fn sub(self, o: FourF32) -> Self { @@ -133,3 +133,76 @@ mod x86_sse { } #[cfg(target_feature = "sse")] pub(crate) use x86_sse::*; + +#[cfg(all(target_feature = "neon", target_arch = "aarch64"))] +mod aarch64_neon { + use core::arch::aarch64::*; + + // Not actualy clear if the relevant instructions require alignment, but there's no harm in it + // and it may improve performance. + #[repr(align(16))] + struct AlignedFloats([f32; 4]); + #[repr(align(16))] + struct AlignedInts([u32; 4]); + + #[derive(Clone, Copy)] + pub(crate) struct FourF32(float32x4_t); + impl FourF32 { + #[inline(always)] + pub(crate) fn new(a: f32, b: f32, c: f32, d: f32) -> Self { + let data = AlignedFloats([a, b, c, d]); + Self(unsafe { vld1q_f32(&data.0[0]) }) + } + #[inline(always)] + pub(crate) fn from_ints(a: u16, b: u16, c: u16, d: u16) -> Self { + let data = AlignedInts([a as u32, b as u32, c as u32, d as u32]); + let ints = unsafe { vld1q_u32(&data.0[0]) }; + Self(unsafe { vcvtq_f32_u32(ints) }) + } + #[inline(always)] + pub(crate) fn hsub(&self) -> Self { + let dummy = Self::new(0.0, 0.0, 0.0, 0.0).0; // XXX: There has to be a faster way + Self(unsafe { vpaddq_f32(self.0, dummy) }) + } + #[inline(always)] + pub(crate) fn consuming_sum(self) -> f32 { + unsafe { vaddvq_f32(self.0) } + } + #[inline(always)] + pub(crate) fn dump(self) -> (f32, f32, f32, f32) { + let mut res = AlignedFloats([0.0; 4]); + unsafe { vst1q_f32(&mut res.0[0], self.0) }; + (res.0[3], res.0[2], res.0[1], res.0[0]) + } + } + impl core::ops::Div for FourF32 { + type Output = FourF32; + #[inline(always)] + fn div(self, o: FourF32) -> Self { + Self(unsafe { vdivq_f32(self.0, o.0) }) + } + } + impl core::ops::Mul for FourF32 { + type Output = FourF32; + #[inline(always)] + fn mul(self, o: FourF32) -> Self { + Self(unsafe { vmulq_f32(self.0, o.0) }) + } + } + impl core::ops::Add for FourF32 { + type Output = FourF32; + #[inline(always)] + fn add(self, o: FourF32) -> Self { + Self(unsafe { vaddq_f32(self.0, o.0) }) + } + } + impl core::ops::Sub for FourF32 { + type Output = FourF32; + #[inline(always)] + fn sub(self, o: FourF32) -> Self { + Self(unsafe { vsubq_f32(self.0, o.0) }) + } + } +} +#[cfg(all(target_feature = "neon", target_arch = "aarch64"))] +pub(crate) use aarch64_neon::*; -- 2.39.5