We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 1f8dd9c, commit 174c475 (Copy full SHA for 174c475)
3 files changed
include/xsimd/arch/xsimd_avx.hpp
@@ -1067,14 +1067,10 @@ namespace xsimd
1067
template <class A>
1068
XSIMD_INLINE double reduce_add(batch<double, A> const& rhs, requires_arch<avx>) noexcept
1069
{
1070
- // rhs = (x0, x1, x2, x3)
1071
- // tmp = (x2, x3, x0, x1)
1072
- __m256d tmp = _mm256_permute2f128_pd(rhs, rhs, 1);
1073
- // tmp = (x2+x0, x3+x1, -, -)
1074
- tmp = _mm256_add_pd(rhs, tmp);
1075
- // tmp = (x2+x0+x3+x1, -, -, -)
1076
- tmp = _mm256_hadd_pd(tmp, tmp);
1077
- return _mm_cvtsd_f64(_mm256_extractf128_pd(tmp, 0));
+ __m128d low, high;
+ detail::split_avx(rhs, low, high);
+ batch<double, sse4_2> blow(low), bhigh(high);
+ return reduce_add(blow+bhigh, sse4_2 {});
1078
}
1079
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
1080
XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx>) noexcept
include/xsimd/arch/xsimd_avx512f.hpp
@@ -1410,10 +1410,14 @@ namespace xsimd
1410
1411
XSIMD_INLINE double reduce_add(batch<double, A> const& rhs, requires_arch<avx512f>) noexcept
1412
1413
+#if defined(__INTEL_COMPILER)
1414
+ return _mm512_reduce_add_pd(rhs);
1415
+#else
1416
__m256d tmp1 = _mm512_extractf64x4_pd(rhs, 1);
1417
__m256d tmp2 = _mm512_extractf64x4_pd(rhs, 0);
1418
__m256d res1 = _mm256_add_pd(tmp1, tmp2);
1419
return reduce_add(batch<double, avx2>(res1), avx2 {});
1420
+#endif
1421
1422
1423
XSIMD_INLINE T reduce_add(batch<T, A> const& self, requires_arch<avx512f>) noexcept
include/xsimd/arch/xsimd_sse3.hpp
@@ -53,8 +53,9 @@ namespace xsimd
53
54
XSIMD_INLINE double reduce_add(batch<double, A> const& self, requires_arch<sse3>) noexcept
55
56
- __m128d tmp0 = _mm_hadd_pd(self, self);
57
- return _mm_cvtsd_f64(tmp0);
+ double low = _mm_cvtsd_f64(self); // get lower element
+ double high = _mm_cvtsd_f64(_mm_shuffle_pd(self, self, 0x1)); // get upper element
58
+ return low + high;
59
60
61
0 commit comments