Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
85 commits
Select commit Hold shift + click to select a range
6e0463d
Add SSE4.2 implementation
AntoinePrv Oct 27, 2025
ebf7d5c
Add unpack uint8_t benchmark
AntoinePrv Oct 28, 2025
043a804
Add bool unpack benchmark
AntoinePrv Oct 30, 2025
2b31e6a
Bias benchmarks toward small scale
AntoinePrv Nov 25, 2025
f48656e
Add Kernel plan builder
AntoinePrv Oct 20, 2025
0c8a178
Add simd kernel
AntoinePrv Oct 24, 2025
0f4f58a
Handle rshifts on SSE2
AntoinePrv Oct 27, 2025
fc923dd
Use new kernel when possible in generated 128 code
AntoinePrv Oct 27, 2025
069c087
Refactor array to xsimd::batch_constant
AntoinePrv Oct 27, 2025
cc061ed
Refactor right shift
AntoinePrv Oct 27, 2025
2ad839f
Add oversized plan
AntoinePrv Oct 28, 2025
f36961d
Add oversized kernel
AntoinePrv Oct 28, 2025
68a4750
Rename kernels
AntoinePrv Oct 28, 2025
2077441
Add simd kernel dispatch
AntoinePrv Oct 28, 2025
a3cc6c2
Call Simd kernel directly
AntoinePrv Oct 28, 2025
a90a872
Fix SIMD level None
AntoinePrv Oct 29, 2025
8638216
Initialize swizzles to -1
AntoinePrv Oct 29, 2025
4edea90
Doc
AntoinePrv Oct 29, 2025
db87718
Improve test error message
AntoinePrv Oct 29, 2025
341cc0e
Use new kernel in avx2
AntoinePrv Oct 28, 2025
d4ccd85
AVX2 swizzle fallback
AntoinePrv Oct 29, 2025
d667c2c
Remove dead code
AntoinePrv Oct 30, 2025
9dfc15c
Simplify Large masks
AntoinePrv Oct 30, 2025
ee70839
Remove bpacking 256 generated file
AntoinePrv Oct 30, 2025
4894f8c
Remove uint8_t fallback
AntoinePrv Oct 30, 2025
218ec04
Add boolean simd implementation
AntoinePrv Oct 30, 2025
d0a0386
Use std::is_base_of for arch detection
AntoinePrv Oct 30, 2025
1dfa82a
Improve swizzle
AntoinePrv Nov 17, 2025
7a26840
Only use lshift hack when available
AntoinePrv Nov 17, 2025
7016645
Fix return type
AntoinePrv Nov 17, 2025
066e2dc
Fix shift included size
AntoinePrv Nov 18, 2025
a55d331
Add Avx2 uint16_t shift fallback
AntoinePrv Nov 19, 2025
e11d127
Refactor make_mult
AntoinePrv Nov 19, 2025
7c89682
Add Avx2 lshift uint8_t fallback
AntoinePrv Nov 19, 2025
6ca1300
Refactor right shift excess
AntoinePrv Nov 19, 2025
f190833
Refactor make_mult
AntoinePrv Nov 20, 2025
114b23d
Add SSE var shift uint8_t fallback to uint16_t
AntoinePrv Nov 20, 2025
b089c04
Implement size reading reduction
AntoinePrv Nov 20, 2025
1e7e14f
Add fallback Avx2 right shift
AntoinePrv Nov 24, 2025
1c8c4f5
Refactor static dispatch
AntoinePrv Nov 26, 2025
e9f4358
Forward oversized to larger uint when possible
AntoinePrv Nov 26, 2025
329c2a3
Add arch detection functions
AntoinePrv Nov 26, 2025
4fe32fa
Refactor traits usage
AntoinePrv Nov 26, 2025
7c9f243
Forward x86_64 unpack64 to unpack32
AntoinePrv Nov 26, 2025
11b09a9
Simplify template usage
AntoinePrv Nov 26, 2025
77bd31a
Reorganize and doc
AntoinePrv Nov 26, 2025
271a215
Refactor KernelDispatch and remove Oversized dispatch
AntoinePrv Nov 26, 2025
1dac3c9
Forward large unpack8 to unpack16 on SSE2
AntoinePrv Nov 26, 2025
1c6ac48
Use fallback right shift on large uint8_t avx2
AntoinePrv Nov 26, 2025
671aaab
Fix enable_if
AntoinePrv Nov 27, 2025
ee71f4b
Add missing header
AntoinePrv Nov 27, 2025
4d6a53f
fmt
AntoinePrv Nov 27, 2025
3856200
Add SSE4.2 to dynamic dispatch
AntoinePrv Nov 27, 2025
eeacb2f
Rename bpacking_simd_impl > bpacking_simd_kernel
AntoinePrv Nov 27, 2025
057551c
Restore modifications to simd_codegen
AntoinePrv Nov 27, 2025
a16dfed
Reduce reading size and declare bytes read
AntoinePrv Nov 27, 2025
83a4c29
Add kBytesRead to scalar code
AntoinePrv Nov 27, 2025
a081b7b
Add kBytesRead to simd 512 generated code
AntoinePrv Nov 27, 2025
6529bfa
Prevent overreading
AntoinePrv Nov 27, 2025
6eb24a7
Fix pessimistic overreading guard
AntoinePrv Nov 28, 2025
bb8d9ce
Fix overreading guard comparison
AntoinePrv Dec 1, 2025
4b50d07
Add UnpackOptions and max_read_bytes
AntoinePrv Dec 1, 2025
430a4c5
Use C++20 NTTP
AntoinePrv Jan 7, 2026
498985f
xsimd 14.0 compatibility
AntoinePrv Jan 8, 2026
101577e
fmt
AntoinePrv Jan 21, 2026
db1a2fb
C++20 NTTP options
AntoinePrv Jan 23, 2026
5888738
Homogenous wording
AntoinePrv Jan 23, 2026
a3b26f4
Remove xsimd backward compatibility
AntoinePrv Feb 3, 2026
43fc02f
Apply doc fixes from code review
AntoinePrv Feb 6, 2026
018b876
Documentation and code improvements
AntoinePrv Feb 6, 2026
f6237e0
Move utilities into bpacking sub ns
AntoinePrv Feb 9, 2026
01c506b
Refactor plan builders
AntoinePrv Feb 9, 2026
05b9e8e
Move utilities
AntoinePrv Feb 9, 2026
dcac5cf
Kernel documentation
AntoinePrv Feb 9, 2026
515a733
adjust_bytes_per_read doc
AntoinePrv Feb 9, 2026
1b94a10
Fewer typename
AntoinePrv Feb 9, 2026
fc5d078
Add documentation
AntoinePrv Feb 9, 2026
88d132e
Fix bounds in plan builders
AntoinePrv Feb 10, 2026
fdce6ab
Change names
AntoinePrv Feb 10, 2026
9d8620d
Add extra comments
AntoinePrv Feb 11, 2026
4871519
Fix comments
AntoinePrv Feb 12, 2026
173600c
Old compiler compatibility
AntoinePrv Feb 19, 2026
3a280e4
Initialize cpuid array
AntoinePrv Feb 24, 2026
ab60e96
Disable AVX2
AntoinePrv Feb 26, 2026
8f18e86
Extra checks
AntoinePrv Feb 26, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion cpp/cmake_modules/SetupCxxFlags.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,12 @@ if(ARROW_CPU_FLAG STREQUAL "x86")
if(CMAKE_SIZEOF_VOID_P EQUAL 8)
# Check for AVX extensions on 64-bit systems only, as 32-bit support seems iffy
list(JOIN ARROW_AVX2_FLAGS " " ARROW_AVX2_FLAGS_COMMAND_LINE)
check_cxx_compiler_flag("${ARROW_AVX2_FLAGS_COMMAND_LINE}" CXX_SUPPORTS_AVX2)
if(MINGW AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
message(STATUS "Disable AVX2 support on gcc / MINGW for now")
else()
check_cxx_compiler_flag("${ARROW_AVX2_FLAGS_COMMAND_LINE}" CXX_SUPPORTS_AVX2)
endif()
if(MINGW)
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65782
message(STATUS "Disable AVX512 support on MINGW for now")
Expand Down
16 changes: 11 additions & 5 deletions cpp/src/arrow/util/bit_stream_utils_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,9 @@ inline bool BitWriter::PutValue(uint64_t v, int num_bits) {

if (ARROW_PREDICT_FALSE(static_cast<int64_t>(byte_offset_) * 8 + bit_offset_ +
num_bits >
static_cast<int64_t>(max_bytes_) * 8))
static_cast<int64_t>(max_bytes_) * 8)) {
return false;
}

buffered_values_ |= v << bit_offset_;
bit_offset_ += num_bits;
Expand Down Expand Up @@ -273,14 +274,19 @@ inline int BitReader::GetBatch(int num_bits, T* v, int batch_size) {
batch_size = static_cast<int>(remaining_bits / num_bits);
}

const ::arrow::internal::UnpackOptions opts{
.batch_size = batch_size,
.bit_width = num_bits,
.bit_offset = bit_offset_,
.max_read_bytes = max_bytes_ - byte_offset_,
};

if constexpr (std::is_same_v<T, bool>) {
::arrow::internal::unpack(buffer_ + byte_offset_, v, batch_size, num_bits,
bit_offset_);
::arrow::internal::unpack(buffer_ + byte_offset_, v, opts);

} else {
::arrow::internal::unpack(buffer_ + byte_offset_,
reinterpret_cast<std::make_unsigned_t<T>*>(v), batch_size,
num_bits, bit_offset_);
reinterpret_cast<std::make_unsigned_t<T>*>(v), opts);
}

Advance(batch_size * num_bits);
Expand Down
31 changes: 16 additions & 15 deletions cpp/src/arrow/util/bpacking.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

#include <array>

#include "arrow/util/bpacking_dispatch_internal.h"
#include "arrow/util/bpacking_internal.h"
#include "arrow/util/bpacking_scalar_internal.h"
#include "arrow/util/bpacking_simd_internal.h"
Expand All @@ -29,19 +28,21 @@ namespace {

template <typename Uint>
struct UnpackDynamicFunction {
using FunctionType = decltype(&unpack_scalar<Uint>);
using FunctionType = decltype(&bpacking::unpack_scalar<Uint>);
using Implementation = std::pair<DispatchLevel, FunctionType>;

static constexpr auto implementations() {
return std::array{
// Current SIMD unpack algorithm works terribly on SSE4.2 due to lack of variable
// rshift and poor xsimd fallback.
Implementation{DispatchLevel::NONE, &unpack_scalar<Uint>},
#if defined(ARROW_HAVE_SSE4_2)
Implementation{DispatchLevel::NONE, &bpacking::unpack_sse4_2<Uint>},
#else
Implementation{DispatchLevel::NONE, &bpacking::unpack_scalar<Uint>},
#endif
#if defined(ARROW_HAVE_RUNTIME_AVX2)
Implementation{DispatchLevel::AVX2, &unpack_avx2<Uint>},
Implementation{DispatchLevel::AVX2, &bpacking::unpack_avx2<Uint>},
#endif
#if defined(ARROW_HAVE_RUNTIME_AVX512)
Implementation{DispatchLevel::AVX512, &unpack_avx512<Uint>},
Implementation{DispatchLevel::AVX512, &bpacking::unpack_avx512<Uint>},
#endif
};
}
Expand All @@ -50,19 +51,19 @@ struct UnpackDynamicFunction {
} // namespace

template <typename Uint>
void unpack(const uint8_t* in, Uint* out, int batch_size, int num_bits, int bit_offset) {
void unpack(const uint8_t* in, Uint* out, const UnpackOptions& opts) {
#if defined(ARROW_HAVE_NEON)
return unpack_neon(in, out, batch_size, num_bits, bit_offset);
return bpacking::unpack_neon(in, out, opts);
#else
static DynamicDispatch<UnpackDynamicFunction<Uint> > dispatch;
return dispatch.func(in, out, batch_size, num_bits, bit_offset);
return dispatch.func(in, out, opts);
#endif
}

template void unpack<bool>(const uint8_t*, bool*, int, int, int);
template void unpack<uint8_t>(const uint8_t*, uint8_t*, int, int, int);
template void unpack<uint16_t>(const uint8_t*, uint16_t*, int, int, int);
template void unpack<uint32_t>(const uint8_t*, uint32_t*, int, int, int);
template void unpack<uint64_t>(const uint8_t*, uint64_t*, int, int, int);
template void unpack<bool>(const uint8_t*, bool*, const UnpackOptions&);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm curious, why not put all the unpack-related APIs inside arrow::internal::bpacking as well? Does it cause too much code churn, or would it fail for other reasons?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No reason, anything works really. My reasoning was unpack is a "library-public" utility function, so it lives in arrow::internal while arrow::internal::bpacking is "private" to the unpack function. Does that make sense?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Kind of, though we might want to revisit later anyway. Not necessary for this PR in any case!

template void unpack<uint8_t>(const uint8_t*, uint8_t*, const UnpackOptions&);
template void unpack<uint16_t>(const uint8_t*, uint16_t*, const UnpackOptions&);
template void unpack<uint32_t>(const uint8_t*, uint32_t*, const UnpackOptions&);
template void unpack<uint64_t>(const uint8_t*, uint64_t*, const UnpackOptions&);

} // namespace arrow::internal
129 changes: 109 additions & 20 deletions cpp/src/arrow/util/bpacking_benchmark.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.

#include <memory>
#include <stdexcept>
#include <vector>

Expand All @@ -33,7 +34,7 @@ namespace arrow::internal {
namespace {

template <typename Int>
using UnpackFunc = void (*)(const uint8_t*, Int*, int, int, int);
using UnpackFunc = void (*)(const uint8_t*, Int*, const UnpackOptions&);

/// Get the number of bytes associated with a packing.
constexpr int32_t GetNumBytes(int32_t num_values, int32_t bit_width) {
Expand Down Expand Up @@ -86,33 +87,62 @@ void BM_Unpack(benchmark::State& state, bool aligned, UnpackFunc<Int> unpack, bo
const uint8_t* packed_ptr =
GetNextAlignedByte(packed.data(), sizeof(Int)) + (aligned ? 0 : 1);

std::vector<Int> unpacked(num_values, 0);
auto unpacked = std::make_unique<Int[]>(num_values);

const ::arrow::internal::UnpackOptions opts{
.batch_size = num_values,
.bit_width = bit_width,
.bit_offset = 0,
.max_read_bytes = -1,
};

for (auto _ : state) {
unpack(packed_ptr, unpacked.data(), num_values, bit_width, /* bit_offset = */ 0);
unpack(packed_ptr, unpacked.get(), opts);
benchmark::ClobberMemory();
}
state.SetItemsProcessed(num_values * state.iterations());
}

constexpr int32_t kMinRange = 64;
constexpr int32_t kMaxRange = 32768;
// Currently, the minimum unpack SIMD kernel size is 32 and the RLE-bit-packing encoder
// will not emit runs larger than 512 (though other implementations might), so we bias
// the benchmarks towards a rather small scale.
static const auto kNumValuesRange = benchmark::CreateRange(32, 512, 2);
constexpr std::initializer_list<int64_t> kBitWidths8 = {1, 2, 8};
constexpr std::initializer_list<int64_t> kBitWidths16 = {1, 2, 8, 13};
constexpr std::initializer_list<int64_t> kBitWidths32 = {1, 2, 8, 20};
constexpr std::initializer_list<int64_t> kBitWidths64 = {1, 2, 8, 20, 47};

static const std::vector<std::vector<int64_t>> kBitWidthsNumValuesBool = {
{0, 1},
kNumValuesRange,
};
static const std::vector<std::vector<int64_t>> kBitWidthsNumValues8 = {
kBitWidths8,
kNumValuesRange,
};
static const std::vector<std::vector<int64_t>> kBitWidthsNumValues16 = {
kBitWidths16,
benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32),
kNumValuesRange,
};
static const std::vector<std::vector<int64_t>> kBitWidthsNumValues32 = {
kBitWidths32,
benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32),
kNumValuesRange,
};
static const std::vector<std::vector<int64_t>> kBitWidthsNumValues64 = {
kBitWidths64,
benchmark::CreateRange(kMinRange, kMaxRange, /*multi=*/32),
kNumValuesRange,
};

/// Nudge for MSVC template inside BENCHMARK_CAPTURE macro.
void BM_UnpackBool(benchmark::State& state, bool aligned, UnpackFunc<bool> unpack,
bool skip = false, std::string skip_msg = "") {
return BM_Unpack<bool>(state, aligned, unpack, skip, std::move(skip_msg));
}
/// Nudge for MSVC template inside BENCHMARK_CAPTURE macro.
void BM_UnpackUint8(benchmark::State& state, bool aligned, UnpackFunc<uint8_t> unpack,
bool skip = false, std::string skip_msg = "") {
return BM_Unpack<uint8_t>(state, aligned, unpack, skip, std::move(skip_msg));
}
/// Nudge for MSVC template inside BENCHMARK_CAPTURE macro.
void BM_UnpackUint16(benchmark::State& state, bool aligned, UnpackFunc<uint16_t> unpack,
bool skip = false, std::string skip_msg = "") {
Expand All @@ -129,52 +159,111 @@ void BM_UnpackUint64(benchmark::State& state, bool aligned, UnpackFunc<uint64_t>
return BM_Unpack<uint64_t>(state, aligned, unpack, skip, std::move(skip_msg));
}

BENCHMARK_CAPTURE(BM_UnpackUint16, ScalarUnaligned, false, &unpack_scalar<uint16_t>)
BENCHMARK_CAPTURE(BM_UnpackBool, ScalarUnaligned, false, &bpacking::unpack_scalar<bool>)
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackUint8, ScalarUnaligned, false,
&bpacking::unpack_scalar<uint8_t>)
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint16, ScalarUnaligned, false,
&bpacking::unpack_scalar<uint16_t>)
->ArgsProduct(kBitWidthsNumValues16);
BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false, &unpack_scalar<uint32_t>)
BENCHMARK_CAPTURE(BM_UnpackUint32, ScalarUnaligned, false,
&bpacking::unpack_scalar<uint32_t>)
->ArgsProduct(kBitWidthsNumValues32);
BENCHMARK_CAPTURE(BM_UnpackUint64, ScalarUnaligned, false, &unpack_scalar<uint64_t>)
BENCHMARK_CAPTURE(BM_UnpackUint64, ScalarUnaligned, false,
&bpacking::unpack_scalar<uint64_t>)
->ArgsProduct(kBitWidthsNumValues64);

#if defined(ARROW_HAVE_SSE4_2)
BENCHMARK_CAPTURE(BM_UnpackBool, Sse42Unaligned, false, &bpacking::unpack_sse4_2<bool>)
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackUint8, Sse42Unaligned, false,
&bpacking::unpack_sse4_2<uint8_t>)
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint16, Sse42Unaligned, false,
&bpacking::unpack_sse4_2<uint16_t>)
->ArgsProduct(kBitWidthsNumValues16);
BENCHMARK_CAPTURE(BM_UnpackUint32, Sse42Unaligned, false,
&bpacking::unpack_sse4_2<uint32_t>)
->ArgsProduct(kBitWidthsNumValues32);
BENCHMARK_CAPTURE(BM_UnpackUint64, Sse42Unaligned, false,
&bpacking::unpack_sse4_2<uint64_t>)
->ArgsProduct(kBitWidthsNumValues64);
#endif

#if defined(ARROW_HAVE_RUNTIME_AVX2)
BENCHMARK_CAPTURE(BM_UnpackUint16, Avx2Unaligned, false, &unpack_avx2<uint16_t>,
BENCHMARK_CAPTURE(BM_UnpackBool, Avx2Unaligned, false, &bpacking::unpack_avx2<bool>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
"Avx2 not available")
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackUint8, Avx2Unaligned, false, &bpacking::unpack_avx2<uint8_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
"Avx2 not available")
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint16, Avx2Unaligned, false, &bpacking::unpack_avx2<uint16_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
"Avx2 not available")
->ArgsProduct(kBitWidthsNumValues16);
BENCHMARK_CAPTURE(BM_UnpackUint32, Avx2Unaligned, false, &unpack_avx2<uint32_t>,
BENCHMARK_CAPTURE(BM_UnpackUint32, Avx2Unaligned, false, &bpacking::unpack_avx2<uint32_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
"Avx2 not available")
->ArgsProduct(kBitWidthsNumValues32);
BENCHMARK_CAPTURE(BM_UnpackUint64, Avx2Unaligned, false, &unpack_avx2<uint64_t>,
BENCHMARK_CAPTURE(BM_UnpackUint64, Avx2Unaligned, false, &bpacking::unpack_avx2<uint64_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX2),
"Avx2 not available")
->ArgsProduct(kBitWidthsNumValues64);
#endif

#if defined(ARROW_HAVE_RUNTIME_AVX512)
BENCHMARK_CAPTURE(BM_UnpackUint16, Avx512Unaligned, false, &unpack_avx512<uint16_t>,
BENCHMARK_CAPTURE(BM_UnpackBool, Avx512Unaligned, false, &bpacking::unpack_avx512<bool>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
"Avx512 not available")
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackUint8, Avx512Unaligned, false,
&bpacking::unpack_avx512<uint8_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
"Avx512 not available")
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint16, Avx512Unaligned, false,
&bpacking::unpack_avx512<uint16_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
"Avx512 not available")
->ArgsProduct(kBitWidthsNumValues16);
BENCHMARK_CAPTURE(BM_UnpackUint32, Avx512Unaligned, false, &unpack_avx512<uint32_t>,
BENCHMARK_CAPTURE(BM_UnpackUint32, Avx512Unaligned, false,
&bpacking::unpack_avx512<uint32_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
"Avx512 not available")
->ArgsProduct(kBitWidthsNumValues32);
BENCHMARK_CAPTURE(BM_UnpackUint64, Avx512Unaligned, false, &unpack_avx512<uint64_t>,
BENCHMARK_CAPTURE(BM_UnpackUint64, Avx512Unaligned, false,
&bpacking::unpack_avx512<uint64_t>,
!CpuInfo::GetInstance()->IsSupported(CpuInfo::AVX512),
"Avx512 not available")
->ArgsProduct(kBitWidthsNumValues64);
#endif

#if defined(ARROW_HAVE_NEON)
BENCHMARK_CAPTURE(BM_UnpackUint16, NeonUnaligned, false, &unpack_neon<uint16_t>)
BENCHMARK_CAPTURE(BM_UnpackBool, NeonUnaligned, false, &bpacking::unpack_neon<bool>)
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackUint8, NeonUnaligned, false, &bpacking::unpack_neon<uint8_t>)
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint16, NeonUnaligned, false, &bpacking::unpack_neon<uint16_t>)
->ArgsProduct(kBitWidthsNumValues16);
BENCHMARK_CAPTURE(BM_UnpackUint32, NeonUnaligned, false, &unpack_neon<uint32_t>)
BENCHMARK_CAPTURE(BM_UnpackUint32, NeonUnaligned, false, &bpacking::unpack_neon<uint32_t>)
->ArgsProduct(kBitWidthsNumValues32);
BENCHMARK_CAPTURE(BM_UnpackUint64, NeonUnaligned, false, &unpack_neon<uint64_t>)
BENCHMARK_CAPTURE(BM_UnpackUint64, NeonUnaligned, false, &bpacking::unpack_neon<uint64_t>)
->ArgsProduct(kBitWidthsNumValues64);
#endif

BENCHMARK_CAPTURE(BM_UnpackBool, DynamicAligned, true, &unpack<bool>)
->ArgsProduct(kBitWidthsNumValuesBool);
BENCHMARK_CAPTURE(BM_UnpackBool, DynamicUnaligned, false, &unpack<bool>)
->ArgsProduct(kBitWidthsNumValuesBool);

BENCHMARK_CAPTURE(BM_UnpackUint8, DynamicAligned, true, &unpack<uint8_t>)
->ArgsProduct(kBitWidthsNumValues8);
BENCHMARK_CAPTURE(BM_UnpackUint8, DynamicUnaligned, false, &unpack<uint8_t>)
->ArgsProduct(kBitWidthsNumValues8);

BENCHMARK_CAPTURE(BM_UnpackUint16, DynamicAligned, true, &unpack<uint16_t>)
->ArgsProduct(kBitWidthsNumValues16);
BENCHMARK_CAPTURE(BM_UnpackUint16, DynamicUnaligned, false, &unpack<uint16_t>)
Expand Down
Loading
Loading