#[cfg(not(target_feature = "sse"))]
mod non_simd {
    //! Scalar fallback used when SSE is not available.

    /// Four `f32` lanes stored as plain tuple fields, in constructor-argument
    /// order (`new(a, b, c, d)` stores `(a, b, c, d)`, mirroring `from_ints`).
    #[derive(Clone, Copy)]
    pub(crate) struct FourF32(f32, f32, f32, f32);

    impl FourF32 {
        /// Builds a vector from four floats, kept in argument order.
        pub(crate) fn new(a: f32, b: f32, c: f32, d: f32) -> Self {
            Self(a, b, c, d)
        }

        /// Builds a vector from four integers; every `u16` is exactly
        /// representable as `f32`, so the conversion is lossless.
        pub(crate) fn from_ints(a: u16, b: u16, c: u16, d: u16) -> Self {
            Self(a as f32, b as f32, c as f32, d as f32)
        }

        /// Pairwise difference: `(x1 - x0, x3 - x2, 0.0, 0.0)`.
        pub(crate) fn hsub(&self) -> Self {
            // _mm_hsub_ps with the second argument zeros
            Self(self.1 - self.0, self.3 - self.2, 0.0, 0.0)
        }

        /// Sum of all four lanes (lane order only affects float rounding).
        pub(crate) fn consuming_sum(&self) -> f32 {
            self.0 + self.1 + self.2 + self.3
        }

        /// Returns the lanes as a tuple, last field first.
        ///
        /// NOTE(review): for the same constructor arguments the SSE backend's
        /// `dump` returns the opposite order; confirm callers either agree on
        /// one order or only use order-insensitive results such as sums.
        pub(crate) fn dump(self) -> (f32, f32, f32, f32) {
            (self.3, self.2, self.1, self.0)
        }
    }

    impl core::ops::Div<FourF32> for FourF32 {
        type Output = FourF32;

        /// Lane-wise division.
        fn div(self, o: FourF32) -> Self {
            Self(self.0 / o.0, self.1 / o.1, self.2 / o.2, self.3 / o.3)
        }
    }

    impl core::ops::Mul<FourF32> for FourF32 {
        type Output = FourF32;

        /// Lane-wise multiplication.
        fn mul(self, o: FourF32) -> Self {
            Self(self.0 * o.0, self.1 * o.1, self.2 * o.2, self.3 * o.3)
        }
    }

    impl core::ops::Add<FourF32> for FourF32 {
        type Output = FourF32;

        /// Lane-wise addition.
        fn add(self, o: FourF32) -> Self {
            Self(self.0 + o.0, self.1 + o.1, self.2 + o.2, self.3 + o.3)
        }
    }

    impl core::ops::Sub<FourF32> for FourF32 {
        type Output = FourF32;

        /// Lane-wise subtraction.
        fn sub(self, o: FourF32) -> Self {
            Self(self.0 - o.0, self.1 - o.1, self.2 - o.2, self.3 - o.3)
        }
    }
}

#[cfg(not(target_feature = "sse"))]
pub(crate) use non_simd::*;
#[cfg(target_feature = "sse")]
mod x86_sse {
    //! SSE implementation backed by `__m128`.
    //!
    //! NOTE(review): `_mm_hsub_ps` and `_mm_hadd_ps` are SSE3 instructions,
    //! but this module is gated only on `target_feature = "sse"`. On a target
    //! with SSE but not SSE3 executing them is undefined behavior — confirm
    //! the crate's build flags guarantee SSE3 wherever this backend is used.
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    /// Scratch buffer for spilling a register; the 16-byte alignment is
    /// required for `_mm_store_ps` to be sound.
    #[repr(align(16))]
    struct AlignedFloats([f32; 4]);

    /// Four `f32` lanes in one SSE register. `_mm_set_ps` places its first
    /// argument in the highest lane, so `new(a, b, c, d)` holds lanes
    /// `(d, c, b, a)` counting from lane 0 upward.
    #[derive(Clone, Copy)]
    pub(crate) struct FourF32(__m128);

    impl FourF32 {
        pub(crate) fn new(a: f32, b: f32, c: f32, d: f32) -> Self {
            // SAFETY: `_mm_set_ps` needs only SSE, which gates this module.
            Self(unsafe { _mm_set_ps(a, b, c, d) })
        }

        pub(crate) fn from_ints(a: u16, b: u16, c: u16, d: u16) -> Self {
            // Lossless: a u16 always fits in an i32, and every u16 value is
            // exactly representable as f32.
            // SAFETY: the set/convert intrinsics need only SSE/SSE2.
            let ints = unsafe { _mm_set_epi32(a as i32, b as i32, c as i32, d as i32) };
            Self(unsafe { _mm_cvtepi32_ps(ints) })
        }

        /// Horizontal difference of adjacent lane pairs; for `new(a, b, c, d)`
        /// the result lanes are `(d - c, b - a, 0.0, 0.0)`.
        pub(crate) fn hsub(&self) -> Self {
            // _mm_hsub_ps with the second argument zeros
            let dummy = unsafe { _mm_setzero_ps() };
            // SAFETY: see the module note — requires SSE3 at runtime.
            Self(unsafe { _mm_hsub_ps(self.0, dummy) })
        }

        /// Sum of all four lanes, via two horizontal adds.
        pub(crate) fn consuming_sum(self) -> f32 {
            // SAFETY: see the module note — `_mm_hadd_ps` requires SSE3.
            unsafe {
                let dummy = _mm_setzero_ps();
                let pairs = _mm_hadd_ps(self.0, dummy); // (l0+l1, l2+l3, 0, 0)
                let total = _mm_hadd_ps(pairs, dummy); // (l0+l1+l2+l3, 0, 0, 0)
                _mm_cvtss_f32(total)
            }
        }

        /// Spills the register and returns the lanes highest-lane first,
        /// i.e. `new(a, b, c, d).dump() == (a, b, c, d)`.
        ///
        /// NOTE(review): the NEON backend's `dump` returns the opposite order
        /// for the same constructor arguments — confirm which order callers
        /// expect before relying on positions (sums are unaffected).
        pub(crate) fn dump(self) -> (f32, f32, f32, f32) {
            let mut res = AlignedFloats([0.0; 4]);
            // SAFETY: `res` is 16-byte aligned, as `_mm_store_ps` requires.
            unsafe { _mm_store_ps(&mut res.0[0], self.0) };
            (res.0[3], res.0[2], res.0[1], res.0[0])
        }
    }

    impl core::ops::Div<FourF32> for FourF32 {
        type Output = FourF32;

        /// Lane-wise division.
        fn div(self, o: FourF32) -> Self {
            // SAFETY: `_mm_div_ps` needs only SSE, which gates this module.
            Self(unsafe { _mm_div_ps(self.0, o.0) })
        }
    }

    impl core::ops::Mul<FourF32> for FourF32 {
        type Output = FourF32;

        /// Lane-wise multiplication.
        fn mul(self, o: FourF32) -> Self {
            // SAFETY: `_mm_mul_ps` needs only SSE, which gates this module.
            Self(unsafe { _mm_mul_ps(self.0, o.0) })
        }
    }

    impl core::ops::Add<FourF32> for FourF32 {
        type Output = FourF32;

        /// Lane-wise addition.
        fn add(self, o: FourF32) -> Self {
            // SAFETY: `_mm_add_ps` needs only SSE, which gates this module.
            Self(unsafe { _mm_add_ps(self.0, o.0) })
        }
    }

    impl core::ops::Sub<FourF32> for FourF32 {
        type Output = FourF32;

        /// Lane-wise subtraction.
        fn sub(self, o: FourF32) -> Self {
            // SAFETY: `_mm_sub_ps` needs only SSE, which gates this module.
            Self(unsafe { _mm_sub_ps(self.0, o.0) })
        }
    }
}

#[cfg(target_feature = "sse")]
pub(crate) use x86_sse::*;
#[cfg(all(target_feature = "neon", target_arch = "aarch64"))]
mod aarch64_neon {
    //! NEON implementation backed by `float32x4_t`.
    use core::arch::aarch64::*;

    // Not actually clear if the relevant instructions require alignment, but there's no harm in it
    // and it may improve performance.
    #[repr(align(16))]
    struct AlignedFloats([f32; 4]);
    #[repr(align(16))]
    struct AlignedInts([u32; 4]);

    /// Four `f32` lanes in a NEON register; `vld1q_f32` keeps memory order,
    /// so `new(a, b, c, d)` holds lanes `(a, b, c, d)` from lane 0 upward.
    #[derive(Clone, Copy)]
    pub(crate) struct FourF32(float32x4_t);

    impl FourF32 {
        pub(crate) fn new(a: f32, b: f32, c: f32, d: f32) -> Self {
            let data = AlignedFloats([a, b, c, d]);
            // SAFETY: `data` is a live array of four f32s; NEON gates this module.
            Self(unsafe { vld1q_f32(&data.0[0]) })
        }

        pub(crate) fn from_ints(a: u16, b: u16, c: u16, d: u16) -> Self {
            // Lossless: every u16 fits in u32 and converts exactly to f32.
            let data = AlignedInts([a as u32, b as u32, c as u32, d as u32]);
            // SAFETY: `data` is a live array of four u32s; NEON gates this module.
            let ints = unsafe { vld1q_u32(&data.0[0]) };
            Self(unsafe { vcvtq_f32_u32(ints) })
        }

        /// Pairwise difference matching the other backends: for lanes
        /// `(x0, x1, x2, x3)` returns `(x1 - x0, x3 - x2, 0.0, 0.0)`.
        ///
        /// BUG FIX: this previously used `vpaddq_f32`, a pairwise *add*, so it
        /// returned pair sums where the scalar backend returns differences
        /// (`self.1 - self.0`, `self.3 - self.2`) and the SSE backend uses
        /// `_mm_hsub_ps`. De-interleave with uzp1/uzp2, then subtract.
        pub(crate) fn hsub(&self) -> Self {
            // SAFETY: all intrinsics here are plain NEON, which gates this module.
            unsafe {
                let zeros = vdupq_n_f32(0.0);
                let evens = vuzp1q_f32(self.0, zeros); // (x0, x2, 0, 0)
                let odds = vuzp2q_f32(self.0, zeros); // (x1, x3, 0, 0)
                Self(vsubq_f32(odds, evens)) // (x1-x0, x3-x2, 0, 0)
            }
        }

        /// Sum of all four lanes via the horizontal add-across instruction.
        pub(crate) fn consuming_sum(self) -> f32 {
            // SAFETY: `vaddvq_f32` is plain NEON, which gates this module.
            unsafe { vaddvq_f32(self.0) }
        }

        /// Spills the register and returns the lanes highest-lane first,
        /// i.e. `new(a, b, c, d).dump() == (d, c, b, a)`.
        ///
        /// NOTE(review): the SSE backend's `dump` returns the opposite order
        /// for the same constructor arguments — confirm which order callers
        /// expect before relying on positions (sums are unaffected).
        pub(crate) fn dump(self) -> (f32, f32, f32, f32) {
            let mut res = AlignedFloats([0.0; 4]);
            // SAFETY: `res` is a live, writable array of four f32s.
            unsafe { vst1q_f32(&mut res.0[0], self.0) };
            (res.0[3], res.0[2], res.0[1], res.0[0])
        }
    }

    impl core::ops::Div<FourF32> for FourF32 {
        type Output = FourF32;

        /// Lane-wise division.
        fn div(self, o: FourF32) -> Self {
            // SAFETY: `vdivq_f32` is plain NEON, which gates this module.
            Self(unsafe { vdivq_f32(self.0, o.0) })
        }
    }

    impl core::ops::Mul<FourF32> for FourF32 {
        type Output = FourF32;

        /// Lane-wise multiplication.
        fn mul(self, o: FourF32) -> Self {
            // SAFETY: `vmulq_f32` is plain NEON, which gates this module.
            Self(unsafe { vmulq_f32(self.0, o.0) })
        }
    }

    impl core::ops::Add<FourF32> for FourF32 {
        type Output = FourF32;

        /// Lane-wise addition.
        fn add(self, o: FourF32) -> Self {
            // SAFETY: `vaddq_f32` is plain NEON, which gates this module.
            Self(unsafe { vaddq_f32(self.0, o.0) })
        }
    }

    impl core::ops::Sub<FourF32> for FourF32 {
        type Output = FourF32;

        /// Lane-wise subtraction.
        fn sub(self, o: FourF32) -> Self {
            // SAFETY: `vsubq_f32` is plain NEON, which gates this module.
            Self(unsafe { vsubq_f32(self.0, o.0) })
        }
    }
}

#[cfg(all(target_feature = "neon", target_arch = "aarch64"))]
pub(crate) use aarch64_neon::*;