From 7ed599237daa4ecb30c9a5f001a7ecca0b61e30f Mon Sep 17 00:00:00 2001
From: Matt Corallo <git@bluematt.me>
Date: Fri, 15 Dec 2023 23:39:57 +0000
Subject: [PATCH] Draft NEON that might work

---
 lightning/src/util/simd_f32.rs | 93 ++++++++++++++++++++++++++++++----
 1 file changed, 83 insertions(+), 10 deletions(-)
diff --git a/lightning/src/util/simd_f32.rs b/lightning/src/util/simd_f32.rs
index 04d40363f..fbc3e7951 100644
--- a/lightning/src/util/simd_f32.rs
+++ b/lightning/src/util/simd_f32.rs
@@ -25,28 +25,28 @@ mod non_simd {
 			(self.3, self.2, self.1, self.0)
 		}
 	}
-	impl std::ops::Div<FourF32> for FourF32 {
+	impl core::ops::Div<FourF32> for FourF32 {
 		type Output = FourF32;
 		#[inline(always)]
 		fn div(self, o: FourF32) -> Self {
 			Self(self.0 / o.0, self.1 / o.1, self.2 / o.2, self.3 / o.3)
 		}
 	}
-	impl std::ops::Mul<FourF32> for FourF32 {
+	impl core::ops::Mul<FourF32> for FourF32 {
 		type Output = FourF32;
 		#[inline(always)]
 		fn mul(self, o: FourF32) -> Self {
 			Self(self.0 * o.0, self.1 * o.1, self.2 * o.2, self.3 * o.3)
 		}
 	}
-	impl std::ops::Add<FourF32> for FourF32 {
+	impl core::ops::Add<FourF32> for FourF32 {
 		type Output = FourF32;
 		#[inline(always)]
 		fn add(self, o: FourF32) -> Self {
 			Self(self.0 + o.0, self.1 + o.1, self.2 + o.2, self.3 + o.3)
 		}
 	}
-	impl std::ops::Sub<FourF32> for FourF32 {
+	impl core::ops::Sub<FourF32> for FourF32 {
 		type Output = FourF32;
 		#[inline(always)]
 		fn sub(self, o: FourF32) -> Self {
@@ -60,9 +60,9 @@ pub(crate) use non_simd::*;
 #[cfg(target_feature = "sse")]
 mod x86_sse {
 	#[cfg(target_arch = "x86")]
-	use std::arch::x86::*;
+	use core::arch::x86::*;
 	#[cfg(target_arch = "x86_64")]
-	use std::arch::x86_64::*;
+	use core::arch::x86_64::*;
 
 	#[repr(align(16))]
 	struct AlignedFloats([f32; 4]);
@@ -102,28 +102,28 @@ mod x86_sse {
 			(res.0[3], res.0[2], res.0[1], res.0[0])
 		}
 	}
-	impl std::ops::Div<FourF32> for FourF32 {
+	impl core::ops::Div<FourF32> for FourF32 {
 		type Output = FourF32;
 		#[inline(always)]
 		fn div(self, o: FourF32) -> Self {
 			Self(unsafe { _mm_div_ps(self.0, o.0) })
 		}
 	}
-	impl std::ops::Mul<FourF32> for FourF32 {
+	impl core::ops::Mul<FourF32> for FourF32 {
 		type Output = FourF32;
 		#[inline(always)]
 		fn mul(self, o: FourF32) -> Self {
 			Self(unsafe { _mm_mul_ps(self.0, o.0) })
 		}
 	}
-	impl std::ops::Add<FourF32> for FourF32 {
+	impl core::ops::Add<FourF32> for FourF32 {
 		type Output = FourF32;
 		#[inline(always)]
 		fn add(self, o: FourF32) -> Self {
 			Self(unsafe { _mm_add_ps(self.0, o.0) })
 		}
 	}
-	impl std::ops::Sub<FourF32> for FourF32 {
+	impl core::ops::Sub<FourF32> for FourF32 {
 		type Output = FourF32;
 		#[inline(always)]
 		fn sub(self, o: FourF32) -> Self {
@@ -133,3 +133,76 @@ mod x86_sse {
 }
 #[cfg(target_feature = "sse")]
 pub(crate) use x86_sse::*;
+
+#[cfg(all(target_feature = "neon", target_arch = "aarch64"))]
+mod aarch64_neon {
+	use core::arch::aarch64::*;
+
+	// Not actualy clear if the relevant instructions require alignment, but there's no harm in it
+	// and it may improve performance.
+	#[repr(align(16))]
+	struct AlignedFloats([f32; 4]);
+	#[repr(align(16))]
+	struct AlignedInts([u32; 4]);
+
+	#[derive(Clone, Copy)]
+	pub(crate) struct FourF32(float32x4_t);
+	impl FourF32 {
+		#[inline(always)]
+		pub(crate) fn new(a: f32, b: f32, c: f32, d: f32) -> Self {
+			let data = AlignedFloats([a, b, c, d]);
+			Self(unsafe { vld1q_f32(&data.0[0]) })
+		}
+		#[inline(always)]
+		pub(crate) fn from_ints(a: u16, b: u16, c: u16, d: u16) -> Self {
+			let data = AlignedInts([a as u32, b as u32, c as u32, d as u32]);
+			let ints = unsafe { vld1q_u32(&data.0[0]) };
+			Self(unsafe { vcvtq_f32_u32(ints) })
+		}
+		#[inline(always)]
+		pub(crate) fn hsub(&self) -> Self {
+			let dummy = Self::new(0.0, 0.0, 0.0, 0.0).0; // XXX: There has to be a faster way
+			Self(unsafe { vpaddq_f32(self.0, dummy) })
+		}
+		#[inline(always)]
+		pub(crate) fn consuming_sum(self) -> f32 {
+			unsafe { vaddvq_f32(self.0) }
+		}
+		#[inline(always)]
+		pub(crate) fn dump(self) -> (f32, f32, f32, f32) {
+			let mut res = AlignedFloats([0.0; 4]);
+			unsafe { vst1q_f32(&mut res.0[0], self.0) };
+			(res.0[3], res.0[2], res.0[1], res.0[0])
+		}
+	}
+	impl core::ops::Div<FourF32> for FourF32 {
+		type Output = FourF32;
+		#[inline(always)]
+		fn div(self, o: FourF32) -> Self {
+			Self(unsafe { vdivq_f32(self.0, o.0) })
+		}
+	}
+	impl core::ops::Mul<FourF32> for FourF32 {
+		type Output = FourF32;
+		#[inline(always)]
+		fn mul(self, o: FourF32) -> Self {
+			Self(unsafe { vmulq_f32(self.0, o.0) })
+		}
+	}
+	impl core::ops::Add<FourF32> for FourF32 {
+		type Output = FourF32;
+		#[inline(always)]
+		fn add(self, o: FourF32) -> Self {
+			Self(unsafe { vaddq_f32(self.0, o.0) })
+		}
+	}
+	impl core::ops::Sub<FourF32> for FourF32 {
+		type Output = FourF32;
+		#[inline(always)]
+		fn sub(self, o: FourF32) -> Self {
+			Self(unsafe { vsubq_f32(self.0, o.0) })
+		}
+	}
+}
+#[cfg(all(target_feature = "neon", target_arch = "aarch64"))]
+pub(crate) use aarch64_neon::*;
-- 
2.39.5