Add impl of SIMD'd f32s

author Matt Corallo <git@bluematt.me>

Fri, 15 Dec 2023 02:20:26 +0000 (02:20 +0000)

committer Matt Corallo <git@bluematt.me>

Wed, 17 Jul 2024 14:38:21 +0000 (14:38 +0000)
author Matt Corallo <git@bluematt.me>
Fri, 15 Dec 2023 02:20:26 +0000 (02:20 +0000)
committer Matt Corallo <git@bluematt.me>
Wed, 17 Jul 2024 14:38:21 +0000 (14:38 +0000)
diff --git a/lightning/src/util/mod.rs b/lightning/src/util/mod.rs

index a81a36c5583b713bc491f6556e57999ce681af5e..cc665079c8116dfc0d9234c8cbb5e7cb42ee956c 100644 (file)
--- a/lightning/src/util/mod.rs
+++ b/lightning/src/util/mod.rs
@@ -35,6 +35,7 @@ pub(crate) mod byte_utils;
  pub(crate) mod transaction_utils;
  pub(crate) mod time;
  pub mod hash_tables;
+pub(crate) mod simd_f32;
  
  pub mod indexed_map;
  
diff --git a/lightning/src/util/simd_f32.rs b/lightning/src/util/simd_f32.rs

new file mode 100644 (file)

index 0000000..04d4036
--- /dev/null
+++ b/lightning/src/util/simd_f32.rs
@@ -0,0 +1,135 @@
+#[cfg(not(target_feature = "sse"))]
+mod non_simd {
+       #[derive(Clone, Copy)]
+       pub(crate) struct FourF32(f32, f32, f32, f32);
+       impl FourF32 {
+               #[inline(always)]
+               pub(crate) fn new(a: f32, b: f32, c: f32, d: f32) -> Self {
+                       Self(a, b, c, d)
+               }
+               #[inline(always)]
+               pub(crate) fn from_ints(a: u16, b: u16, c: u16, d: u16) -> Self {
+                       Self(a as f32, b as f32, c as f32, d as f32)
+               }
+               #[inline(always)]
+               pub(crate) fn hsub(&self) -> Self {
+                       // _mm_hsub_ps with the second argument zeros
+                       Self(self.1 - self.0, self.3 - self.2, 0.0, 0.0)
+               }
+               #[inline(always)]
+               pub(crate) fn consuming_sum(&self) -> f32 {
+                       self.0 + self.1 + self.2 + self.3
+               }
+               #[inline(always)]
+               pub(crate) fn dump(self) -> (f32, f32, f32, f32) {
+                       (self.3, self.2, self.1, self.0)
+               }
+       }
+       impl std::ops::Div<FourF32> for FourF32 {
+               type Output = FourF32;
+               #[inline(always)]
+               fn div(self, o: FourF32) -> Self {
+                       Self(self.0 / o.0, self.1 / o.1, self.2 / o.2, self.3 / o.3)
+               }
+       }
+       impl std::ops::Mul<FourF32> for FourF32 {
+               type Output = FourF32;
+               #[inline(always)]
+               fn mul(self, o: FourF32) -> Self {
+                       Self(self.0 * o.0, self.1 * o.1, self.2 * o.2, self.3 * o.3)
+               }
+       }
+       impl std::ops::Add<FourF32> for FourF32 {
+               type Output = FourF32;
+               #[inline(always)]
+               fn add(self, o: FourF32) -> Self {
+                       Self(self.0 + o.0, self.1 + o.1, self.2 + o.2, self.3 + o.3)
+               }
+       }
+       impl std::ops::Sub<FourF32> for FourF32 {
+               type Output = FourF32;
+               #[inline(always)]
+               fn sub(self, o: FourF32) -> Self {
+                       Self(self.0 - o.0, self.1 - o.1, self.2 - o.2, self.3 - o.3)
+               }
+       }
+}
+#[cfg(not(target_feature = "sse"))]
+pub(crate) use non_simd::*;
+
+#[cfg(target_feature = "sse")]
+mod x86_sse {
+       #[cfg(target_arch = "x86")]
+       use std::arch::x86::*;
+       #[cfg(target_arch = "x86_64")]
+       use std::arch::x86_64::*;
+
+       #[repr(align(16))]
+       struct AlignedFloats([f32; 4]);
+
+       #[derive(Clone, Copy)]
+       pub(crate) struct FourF32(__m128);
+       impl FourF32 {
+               #[inline(always)]
+               pub(crate) fn new(a: f32, b: f32, c: f32, d: f32) -> Self {
+                       Self(unsafe { _mm_set_ps(a, b, c, d) })
+               }
+               #[inline(always)]
+               pub(crate) fn from_ints(a: u16, b: u16, c: u16, d: u16) -> Self {
+                       unsafe {
+                               let ints =_mm_set_epi32(a as i32, b as i32, c as i32, d as i32);
+                               Self(_mm_cvtepi32_ps(ints))
+                       }
+               }
+               #[inline(always)]
+               pub(crate) fn hsub(&self) -> Self {
+                       let dummy = unsafe { _mm_setzero_ps() };
+                       Self(unsafe { _mm_hsub_ps(self.0, dummy) })
+               }
+               #[inline(always)]
+               pub(crate) fn consuming_sum(self) -> f32 {
+                       let im = unsafe {
+                               let dummy = _mm_setzero_ps();
+                               Self(_mm_hadd_ps(self.0, dummy))
+                       };
+                       let res = im.dump();
+                       res.2 + res.3
+               }
+               #[inline(always)]
+               pub(crate) fn dump(self) -> (f32, f32, f32, f32) {
+                       let mut res = AlignedFloats([0.0; 4]);
+                       unsafe { _mm_store_ps(&mut res.0[0], self.0) };
+                       (res.0[3], res.0[2], res.0[1], res.0[0])
+               }
+       }
+       impl std::ops::Div<FourF32> for FourF32 {
+               type Output = FourF32;
+               #[inline(always)]
+               fn div(self, o: FourF32) -> Self {
+                       Self(unsafe { _mm_div_ps(self.0, o.0) })
+               }
+       }
+       impl std::ops::Mul<FourF32> for FourF32 {
+               type Output = FourF32;
+               #[inline(always)]
+               fn mul(self, o: FourF32) -> Self {
+                       Self(unsafe { _mm_mul_ps(self.0, o.0) })
+               }
+       }
+       impl std::ops::Add<FourF32> for FourF32 {
+               type Output = FourF32;
+               #[inline(always)]
+               fn add(self, o: FourF32) -> Self {
+                       Self(unsafe { _mm_add_ps(self.0, o.0) })
+               }
+       }
+       impl std::ops::Sub<FourF32> for FourF32 {
+               type Output = FourF32;
+               #[inline(always)]
+               fn sub(self, o: FourF32) -> Self {
+                       Self(unsafe { _mm_sub_ps(self.0, o.0) })
+               }
+       }
+}
+#[cfg(target_feature = "sse")]
+pub(crate) use x86_sse::*;
author	Matt Corallo <git@bluematt.me>
	Fri, 15 Dec 2023 02:20:26 +0000 (02:20 +0000)
committer	Matt Corallo <git@bluematt.me>
	Wed, 17 Jul 2024 14:38:21 +0000 (14:38 +0000)
lightning/src/util/mod.rs		patch \| blob \| history
lightning/src/util/simd_f32.rs	[new file with mode: 0644]	patch \| blob