From 29be7f2828c321e220a05cff021334eec9d9f44b Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 25 May 2023 23:32:31 -0300 Subject: [PATCH 01/15] Start with the 64-bit-aligned version of MurmurHash2 As a basis for what is going to be a bitmap hashing function. --- cpp/src/arrow/util/hashing.cc | 83 +++++++++++++++++++++++++++++++++++ cpp/src/arrow/util/hashing.h | 3 ++ 2 files changed, 86 insertions(+) create mode 100644 cpp/src/arrow/util/hashing.cc diff --git a/cpp/src/arrow/util/hashing.cc b/cpp/src/arrow/util/hashing.cc new file mode 100644 index 000000000000..aec483837668 --- /dev/null +++ b/cpp/src/arrow/util/hashing.cc @@ -0,0 +1,83 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/util/hashing.h" + +namespace arrow { +namespace internal { + +namespace { + +// MurmurHash2, 64-bit versions, by Austin Appleby + +uint64_t MurmurHash64A(const void* buffer, int len, uint64_t seed) { + const uint64_t m = 0xc6a4a7935bd1e995LLU; + const int r = 47; + + uint64_t h = seed ^ (len * m); + + const auto* data = static_cast(buffer); + const uint64_t* end = data + (len / 8); + + while (data != end) { + uint64_t k = *data++; + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + const auto* data2 = reinterpret_cast(data); + + switch (len & 7) { + case 7: + h ^= static_cast(data2[6]) << 48; + case 6: + h ^= static_cast(data2[5]) << 40; + case 5: + h ^= static_cast(data2[4]) << 32; + case 4: + h ^= static_cast(data2[3]) << 24; + case 3: + h ^= static_cast(data2[2]) << 16; + case 2: + h ^= static_cast(data2[1]) << 8; + case 1: + h ^= static_cast(data2[0]); + h *= m; + }; + + h ^= h >> r; + h *= m; + h ^= h >> r; + + return h; +} + +} // namespace + +hash_t ComputeBitmapHash(const uint8_t* bitmap, int64_t length, int64_t bit_offset, + int64_t num_bits) { + hash_t seed = 0; // XXX: add seed parameter + return 0; +} + +} // namespace internal +} // namespace arrow diff --git a/cpp/src/arrow/util/hashing.h b/cpp/src/arrow/util/hashing.h index 6656492b3f09..7954812662f6 100644 --- a/cpp/src/arrow/util/hashing.h +++ b/cpp/src/arrow/util/hashing.h @@ -63,6 +63,9 @@ typedef uint64_t hash_t; template inline hash_t ComputeStringHash(const void* data, int64_t length); +hash_t ComputeBitmapHash(const uint8_t* bitmap, int64_t length, int64_t bit_offset, + int64_t num_bits); + template struct ScalarHelperBase { static bool CompareScalars(Scalar u, Scalar v) { return u == v; } From 9f5758f9b23e7e5219d7fa5a0f3a42314e56c482 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 26 May 2023 12:17:47 -0300 Subject: [PATCH 02/15] A hash function for bitmaps that can handle offset and length --- cpp/src/arrow/CMakeLists.txt | 1 + 
cpp/src/arrow/util/hashing.cc | 168 +++++++++++++++++++++-------- cpp/src/arrow/util/hashing.h | 22 +++- cpp/src/arrow/util/hashing_test.cc | 167 ++++++++++++++++++++++++++++ 4 files changed, 314 insertions(+), 44 deletions(-) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index d928cdf58b11..84539755489b 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -216,6 +216,7 @@ set(ARROW_SRCS util/delimiting.cc util/formatting.cc util/future.cc + util/hashing.cc util/int_util.cc util/io_util.cc util/logging.cc diff --git a/cpp/src/arrow/util/hashing.cc b/cpp/src/arrow/util/hashing.cc index aec483837668..c54ecc64f1f8 100644 --- a/cpp/src/arrow/util/hashing.cc +++ b/cpp/src/arrow/util/hashing.cc @@ -16,67 +16,151 @@ // under the License. #include "arrow/util/hashing.h" +#include "arrow/util/macros.h" namespace arrow { namespace internal { namespace { -// MurmurHash2, 64-bit versions, by Austin Appleby - -uint64_t MurmurHash64A(const void* buffer, int len, uint64_t seed) { +/// \brief Rotate-right, 64-bit. +/// +/// This compiles to a single instruction on CPUs that have rotation instructions. +/// +/// \pre n must be in the range [1, 64]. +#define ROTR64(v, n) (((v) >> (n)) | ((v) << (64 - (n)))) + +/// \brief A hash function for bitmaps that can handle offsets and lengths in +/// terms of number of bits. The hash only depends on the bits actually hashed. +/// +/// This implementation is based on 64-bit versions of MurmurHash2 by Austin Appleby. +/// +/// The key (a bitmap) is read as a sequence of 64-bit words before the trailing +/// bytes. So if the input is 64-bit aligned, all memory accesses are aligned. +/// +/// It's the caller's responsibility to ensure that bits_offset + num_bits are +/// readable from the bitmap. +/// +/// \param key The pointer to the bitmap. +/// \param seed The seed for the hash function (useful when chaining hash functions). 
+/// \param bits_offset The offset in bits relative to the start of the bitmap. +/// \param num_bits The number of bits after the offset to be hashed. +uint64_t MurmurHashBitmap64A(const uint8_t* key, uint64_t seed, uint64_t bits_offset, + uint64_t num_bits) { const uint64_t m = 0xc6a4a7935bd1e995LLU; const int r = 47; - uint64_t h = seed ^ (len * m); - - const auto* data = static_cast(buffer); - const uint64_t* end = data + (len / 8); - - while (data != end) { - uint64_t k = *data++; - - k *= m; - k ^= k >> r; - k *= m; - - h ^= k; - h *= m; + uint64_t h = seed ^ (num_bits * m); + +#define HASHING_ROUND(k) \ + (k) *= m; \ + (k) ^= (k) >> r; \ + (k) *= m; \ + \ + h ^= (k); \ + h *= m + + // Shift key pointer by as many words as possible. + key += (bits_offset / 64) * 8; + // The shift within each word. + const uint64_t shift = bits_offset % 64; + const uint64_t readable_bits = shift + num_bits; + + const auto* data = reinterpret_cast(key); + const auto* end = data + (readable_bits / 64); + if (data == end) { + // The bitmap is entirely contained in a single word, but not all bytes of + // the word are necessarily accessible, so access is performed byte-by-byte. + // ROTR644 is not necessary and shift-right is safe because readable_bits < 64 + // when data == end. + if (num_bits > 0) { + // clang-format off + switch (uint64_t k0 = 0; (readable_bits + 7) / 8) { + case 8: k0 |= static_cast(key[7]) << 56; + case 7: k0 |= static_cast(key[6]) << 48; + case 6: k0 |= static_cast(key[5]) << 40; + case 5: k0 |= static_cast(key[4]) << 32; + case 4: k0 |= static_cast(key[3]) << 24; + case 3: k0 |= static_cast(key[2]) << 16; + case 2: k0 |= static_cast(key[1]) << 8; + case 1: k0 |= static_cast(key[0]); + k0 &= (0x1LLU << readable_bits) - 1; + k0 >>= shift; + HASHING_ROUND(k0); + } + // clang-format on + } + } else { + const uint64_t lsb_mask = shift ? 
(0x1LLU << shift) - 1 : -1; + const uint64_t msb_mask = ~lsb_mask; + uint64_t k0 = 0; + if (ARROW_PREDICT_TRUE(shift == 0)) { + // Fast case: no shift. + do { + uint64_t k1 = *data; + ++data; + HASHING_ROUND(k1); + } while (data != end); + } else { + // General case: shift. + k0 = *data & msb_mask; + ++data; + while (data != end) { + uint64_t k1 = k0 | (*data & lsb_mask); + if (shift > 0) { + k1 = ROTR64(k1, shift); + } + k0 = *data & msb_mask; + ++data; + HASHING_ROUND(k1); + } + } + + const auto* trailing = reinterpret_cast(data); + const auto trailing_bits = readable_bits % 64; + uint64_t k1 = k0; + // clang-format off + switch (uint64_t final_blk = 0; (trailing_bits + 7) / 8) { + case 8: final_blk |= static_cast(trailing[7]) << 56; + case 7: final_blk |= static_cast(trailing[6]) << 48; + case 6: final_blk |= static_cast(trailing[5]) << 40; + case 5: final_blk |= static_cast(trailing[4]) << 32; + case 4: final_blk |= static_cast(trailing[3]) << 24; + case 3: final_blk |= static_cast(trailing[2]) << 16; + case 2: final_blk |= static_cast(trailing[1]) << 8; + case 1: final_blk |= static_cast(trailing[0]); + final_blk &= (0x1LLU << trailing_bits) - 1; + // Combine with the last key block (k0). + k1 |= final_blk & lsb_mask; + if (shift > 0) { + k1 = ROTR64(k1, shift); + } + k0 = final_blk & msb_mask; + HASHING_ROUND(k1); + } + // clang-format on + // Perform a final round with bits from k0 if it still contains relevant bits. + if (shift > 0 && (trailing_bits == 0 || trailing_bits > shift)) { + k1 = ROTR64(k0, shift); + HASHING_ROUND(k1); + } } - const auto* data2 = reinterpret_cast(data); - - switch (len & 7) { - case 7: - h ^= static_cast(data2[6]) << 48; - case 6: - h ^= static_cast(data2[5]) << 40; - case 5: - h ^= static_cast(data2[4]) << 32; - case 4: - h ^= static_cast(data2[3]) << 24; - case 3: - h ^= static_cast(data2[2]) << 16; - case 2: - h ^= static_cast(data2[1]) << 8; - case 1: - h ^= static_cast(data2[0]); - h *= m; - }; - + // Finalize. 
h ^= h >> r; h *= m; h ^= h >> r; - return h; } } // namespace -hash_t ComputeBitmapHash(const uint8_t* bitmap, int64_t length, int64_t bit_offset, - int64_t num_bits) { - hash_t seed = 0; // XXX: add seed parameter - return 0; +hash_t ComputeBitmapHash(const uint8_t* bitmap, int64_t length, hash_t seed, + int64_t bits_offset, int64_t num_bits) { + DCHECK_GE(bits_offset, 0); + DCHECK_GE(num_bits, 0); + DCHECK_LE((bits_offset + num_bits + 7) / 8, length); + return MurmurHashBitmap64A(bitmap, seed, bits_offset, num_bits); } } // namespace internal diff --git a/cpp/src/arrow/util/hashing.h b/cpp/src/arrow/util/hashing.h index 7954812662f6..b73e97bc9b7f 100644 --- a/cpp/src/arrow/util/hashing.h +++ b/cpp/src/arrow/util/hashing.h @@ -63,8 +63,26 @@ typedef uint64_t hash_t; template inline hash_t ComputeStringHash(const void* data, int64_t length); -hash_t ComputeBitmapHash(const uint8_t* bitmap, int64_t length, int64_t bit_offset, - int64_t num_bits); +/// \brief A hash function for bitmaps that can handle offsets and lengths in +/// terms of number of bits. The hash only depends on the bits actually hashed. +/// +/// The key (a bitmap) is read as a sequence of 64-bit words before the trailing +/// bytes. So if the input is 64-bit aligned, all memory accesses are aligned. +/// +/// It's the caller's responsibility to ensure that bits_offset + num_bits are +/// readable from the bitmap. +/// +/// \pre bits_offset >= 0 +/// \pre num_bits >= 0 +/// \pre (bits_offset + num_bits + 7) / 8 <= length +/// +/// \param bitmap The pointer to the bitmap. +/// \param length The length of the bitmap in bytes. +/// \param seed The seed for the hash function (useful when chaining hash functions). +/// \param bits_offset The offset in bits relative to the start of the bitmap. +/// \param num_bits The number of bits after the offset to be hashed. 
+hash_t ComputeBitmapHash(const uint8_t* bitmap, int64_t length, hash_t seed, + int64_t bits_offset, int64_t num_bits); template struct ScalarHelperBase { diff --git a/cpp/src/arrow/util/hashing_test.cc b/cpp/src/arrow/util/hashing_test.cc index ac212e9fb4fa..bb827bcb9b2e 100644 --- a/cpp/src/arrow/util/hashing_test.cc +++ b/cpp/src/arrow/util/hashing_test.cc @@ -28,6 +28,8 @@ #include #include +#include "arrow/array/builder_primitive.h" +#include "arrow/array/concatenate.h" #include "arrow/testing/gtest_util.h" #include "arrow/util/bit_util.h" #include "arrow/util/hashing.h" @@ -486,5 +488,170 @@ TEST(BinaryMemoTable, Empty) { EXPECT_EQ(offsets[0], 0); } +hash_t HashDataBitmap(const ArraySpan& array) { + EXPECT_EQ(array.type->id(), Type::BOOL); + const auto& bitmap = array.buffers[1]; + return ComputeBitmapHash(bitmap.data, bitmap.size, + /*seed=*/0, + /*bit_offset=*/array.offset, + /*num_bits=*/array.length); +} + +std::shared_ptr BuildBooleanArray(int len, bool start) { + // This could be memoized in the future to speed up tests. 
+ BooleanBuilder builder; + for (int i = 0; i < len; ++i) { + EXPECT_TRUE(builder.Append(((i % 2) ^ start) == 1).ok()); + } + std::shared_ptr array; + EXPECT_TRUE(builder.Finish(&array).ok()); + return array; +} + +hash_t HashConcatenation(const ArrayVector& arrays, int64_t bits_offset = -1, + int64_t num_bits = -1) { + EXPECT_OK_AND_ASSIGN(auto concat, Concatenate(arrays)); + EXPECT_EQ(concat->type()->id(), Type::BOOL); + if (bits_offset == -1 || num_bits == -1) { + return HashDataBitmap(*concat->data()); + } + auto slice = concat->Slice(bits_offset, num_bits); + return HashDataBitmap(*slice->data()); +} + +TEST(SmallBitmapHash, Empty) { + for (bool start : {false, true}) { + auto block = BuildBooleanArray(64, start); + for (int len = 0; len < 64; len++) { + auto prefix = BuildBooleanArray(len, start); + auto expected_hash = HashDataBitmap(*prefix->data()); + + auto slice = block->Slice(0, len); + auto slice_hash = HashDataBitmap(*slice->data()); + ASSERT_EQ(expected_hash, slice_hash); + + for (int j = 1; j < len; j++) { + auto fragment = BuildBooleanArray(len - j, start ^ (j % 2)); + expected_hash = HashDataBitmap(*fragment->data()); + + slice = block->Slice(j, len - j); + slice_hash = HashDataBitmap(*slice->data()); + ASSERT_EQ(expected_hash, slice_hash); + } + } + } +} + +TEST(TestBitmapHash, Empty) { + BooleanBuilder builder; + std::shared_ptr block_of_bools; + { + ASSERT_OK(builder.AppendValues(2, true)); + ASSERT_OK(builder.AppendValues(3, false)); + ASSERT_OK(builder.AppendValues(5, true)); + ASSERT_OK(builder.AppendValues(7, false)); + ASSERT_OK(builder.AppendValues(11, true)); + ASSERT_OK(builder.AppendValues(13, false)); + ASSERT_OK(builder.AppendValues(17, true)); + ASSERT_OK(builder.AppendValues(5, false)); + ASSERT_OK(builder.AppendValues(1, true)); + ASSERT_OK(builder.Finish(&block_of_bools)); + ASSERT_EQ(block_of_bools->length(), 64); + } + const auto hash_of_block = HashDataBitmap(*block_of_bools->data()); + + std::shared_ptr 
negated_block_of_bools; + { + ASSERT_OK(builder.AppendValues(2, false)); + ASSERT_OK(builder.AppendValues(3, true)); + ASSERT_OK(builder.AppendValues(5, false)); + ASSERT_OK(builder.AppendValues(7, true)); + ASSERT_OK(builder.AppendValues(11, false)); + ASSERT_OK(builder.AppendValues(13, true)); + ASSERT_OK(builder.AppendValues(17, false)); + ASSERT_OK(builder.AppendValues(5, true)); + ASSERT_OK(builder.AppendValues(1, false)); + ASSERT_OK(builder.Finish(&negated_block_of_bools)); + ASSERT_EQ(negated_block_of_bools->length(), 64); + } + const auto hash_of_negated_block = HashDataBitmap(*negated_block_of_bools->data()); + + constexpr bool kSlowTests = true; + constexpr auto kMaxPadding = 64 + 32 + 1; + auto step = [&](int& i) { + const auto kStep = kSlowTests ? 1 : 8; + if (i + kStep >= kMaxPadding && i != kMaxPadding - 1) { + i = kMaxPadding - 1; + } else { + i += kStep; + } + }; + + for (int start_bits = 0; start_bits < 8; ++start_bits) { + for (int prefix_pad_len = 0; prefix_pad_len < kMaxPadding; step(prefix_pad_len)) { + auto prefix_pad = BuildBooleanArray(prefix_pad_len, start_bits & 0x1); + for (int suffix_pad_len = 0; suffix_pad_len < kMaxPadding; step(suffix_pad_len)) { + auto suffix_pad = BuildBooleanArray(suffix_pad_len, (start_bits >> 1) & 0x1); + + // A block of 64 bools in the middle + auto hash = HashConcatenation({prefix_pad, block_of_bools, suffix_pad}, + prefix_pad_len, 64); + ASSERT_EQ(hash, hash_of_block); + // Negated + hash = HashConcatenation({prefix_pad, negated_block_of_bools, suffix_pad}, + prefix_pad_len, 64); + ASSERT_EQ(hash, hash_of_negated_block); + + std::shared_ptr bools; + // Trailing bits and leading bits around a block + for (int trailing_len = 1; trailing_len < kMaxPadding; step(trailing_len)) { + auto trailing = BuildBooleanArray(trailing_len, (start_bits >> 1) & 0x1); + auto expected_hash = HashConcatenation({block_of_bools, trailing}); + auto hash = + HashConcatenation({prefix_pad, block_of_bools, trailing, suffix_pad}, + 
prefix_pad_len, 64 + trailing_len); + ASSERT_EQ(hash, expected_hash); + // Negated + expected_hash = HashConcatenation({negated_block_of_bools, trailing}); + hash = HashConcatenation( + {prefix_pad, negated_block_of_bools, trailing, suffix_pad}, prefix_pad_len, + 64 + trailing_len); + ASSERT_EQ(hash, expected_hash); + + // Use the trailing bits as leading bits now + auto leading = trailing; + auto leading_len = trailing_len; + expected_hash = HashConcatenation({leading, block_of_bools}); + hash = HashConcatenation({prefix_pad, leading, block_of_bools, suffix_pad}, + prefix_pad_len, leading_len + 64); + ASSERT_EQ(hash, expected_hash); + // Negated + expected_hash = HashConcatenation({leading, negated_block_of_bools}); + hash = + HashConcatenation({prefix_pad, leading, negated_block_of_bools, suffix_pad}, + prefix_pad_len, leading_len + 64); + ASSERT_EQ(hash, expected_hash); + + // leading and trailing at the same time + expected_hash = HashConcatenation({leading, block_of_bools, trailing}); + hash = HashConcatenation( + {prefix_pad, leading, block_of_bools, trailing, suffix_pad}, prefix_pad_len, + leading_len + 64 + trailing_len); + ASSERT_EQ(hash, expected_hash); + // Negated + expected_hash = HashConcatenation({leading, negated_block_of_bools, trailing}); + hash = HashConcatenation( + {prefix_pad, leading, negated_block_of_bools, trailing, suffix_pad}, + prefix_pad_len, leading_len + 64 + trailing_len); + ASSERT_EQ(hash, expected_hash); + } + } + } + if (!kSlowTests) { + break; + } + } +} + } // namespace internal } // namespace arrow From 20626cf7e74bc90d388799ba7ede8918ccde6399 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Mon, 29 May 2023 23:20:34 -0300 Subject: [PATCH 03/15] Disable slow tests of ComputeBitmapHash --- cpp/src/arrow/util/hashing_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/hashing_test.cc b/cpp/src/arrow/util/hashing_test.cc index bb827bcb9b2e..30a726178550 100644 --- 
a/cpp/src/arrow/util/hashing_test.cc +++ b/cpp/src/arrow/util/hashing_test.cc @@ -576,7 +576,7 @@ TEST(TestBitmapHash, Empty) { } const auto hash_of_negated_block = HashDataBitmap(*negated_block_of_bools->data()); - constexpr bool kSlowTests = true; + constexpr bool kSlowTests = false; constexpr auto kMaxPadding = 64 + 32 + 1; auto step = [&](int& i) { const auto kStep = kSlowTests ? 1 : 8; From b18c09e4f8d89380b19ae738d2cbd472e0a15370 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Fri, 26 May 2023 19:10:40 -0300 Subject: [PATCH 04/15] scalar.cc: Consider the length and offset when hashing the validity bitmap --- cpp/src/arrow/scalar.cc | 46 ++++++++++++++++++++++++---- python/pyarrow/tests/test_scalars.py | 9 ++++++ 2 files changed, 49 insertions(+), 6 deletions(-) diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index 1e40266b3cae..d565031dada9 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -29,6 +29,7 @@ #include "arrow/compare.h" #include "arrow/pretty_print.h" #include "arrow/type.h" +#include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" #include "arrow/util/formatting.h" @@ -151,20 +152,53 @@ struct ScalarHashImpl { Status ArrayHash(const Array& a) { return ArrayHash(*a.data()); } - Status ArrayHash(const ArrayData& a) { - RETURN_NOT_OK(StdHash(a.length) & StdHash(a.GetNullCount())); - if (a.GetNullCount() != 0 && a.buffers[0] != nullptr) { + Status ArrayHash(const ArraySpan& a, int64_t offset, int64_t length) { + // Calculate null count within the range + const auto* validity = a.buffers[0].data; + const int64_t validity_size = a.buffers[0].size; + int64_t null_count = 0; + if (validity != NULLPTR) { + if (offset == a.offset && length == a.length) { + null_count = a.GetNullCount(); + } else { + null_count = length - internal::CountSetBits(validity, offset, length); + } + } + + RETURN_NOT_OK(StdHash(length) & StdHash(null_count)); + if (null_count != 0) { // 
We can't visit values without unboxing the whole array, so only hash // the null bitmap for now. Only hash the null bitmap if the null count // is not 0 to ensure hash consistency. - RETURN_NOT_OK(BufferHash(*a.buffers[0])); + hash_ = internal::ComputeBitmapHash(validity, validity_size, /*seed=*/hash_, + /*bits_offset=*/offset, /*num_bits=*/length); } - for (const auto& child : a.child_data) { - RETURN_NOT_OK(ArrayHash(*child)); + + // Hash the relevant child arrays for each type taking offset and length + // from the parent array into account if necessary. + switch (a.type->id()) { + case Type::RUN_END_ENCODED: + // Hash only the values, not the run lengths. + RETURN_NOT_OK(ArrayHash(a.child_data[1], offset, length)); + break; + case Type::STRUCT: + for (const auto& child : a.child_data) { + RETURN_NOT_OK(ArrayHash(child, offset, length)); + } + break; + default: + // By default, just hash the arrays without considering + // the offset and length of the parent. + for (const auto& child : a.child_data) { + RETURN_NOT_OK(ArrayHash(child)); + } + break; } return Status::OK(); } + Status ArrayHash(const ArraySpan& a) { return ArrayHash(a, a.offset, a.length); } + explicit ScalarHashImpl(const Scalar& scalar) : hash_(scalar.type->Hash()) { AccumulateHashFrom(scalar); } diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index ca2d29e5dac2..1e6a3f29e0d7 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -143,6 +143,15 @@ def test_hashing(): assert len(set_from_array) == 500 +def test_hashing_struct_scalar(): + # GH-35360 + a = pa.array([[{'a': 5}, {'a': 6}], [{'a': 7}, None]]) + b = pa.array([[{'a': 7}, None]]) + hash1 = hash(a[1]) + hash2 = hash(b[0]) + assert hash1 == hash2 + + def test_bool(): false = pa.scalar(False) true = pa.scalar(True) From b6d6671421c80397599eeb162acd1679c391dfdc Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Mon, 29 May 2023 23:46:28 -0300 Subject: 
[PATCH 05/15] Add export annotation for Windows builds --- cpp/src/arrow/util/hashing.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/util/hashing.h b/cpp/src/arrow/util/hashing.h index b73e97bc9b7f..86993adab1fe 100644 --- a/cpp/src/arrow/util/hashing.h +++ b/cpp/src/arrow/util/hashing.h @@ -81,8 +81,8 @@ inline hash_t ComputeStringHash(const void* data, int64_t length); /// \param seed The seed for the hash function (useful when chaining hash functions). /// \param bits_offset The offset in bits relative to the start of the bitmap. /// \param num_bits The number of bits after the offset to be hashed. -hash_t ComputeBitmapHash(const uint8_t* bitmap, int64_t length, hash_t seed, - int64_t bits_offset, int64_t num_bits); +ARROW_EXPORT hash_t ComputeBitmapHash(const uint8_t* bitmap, int64_t length, hash_t seed, + int64_t bits_offset, int64_t num_bits); template struct ScalarHelperBase { From b7c9216cc12527edaae5dcfc865beca629305a2c Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 30 May 2023 13:11:28 -0300 Subject: [PATCH 06/15] Add big-endian support to MurmurHashBitmap64A --- cpp/src/arrow/util/hashing.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/util/hashing.cc b/cpp/src/arrow/util/hashing.cc index c54ecc64f1f8..398960cbed92 100644 --- a/cpp/src/arrow/util/hashing.cc +++ b/cpp/src/arrow/util/hashing.cc @@ -97,7 +97,7 @@ uint64_t MurmurHashBitmap64A(const uint8_t* key, uint64_t seed, uint64_t bits_of if (ARROW_PREDICT_TRUE(shift == 0)) { // Fast case: no shift. 
do { - uint64_t k1 = *data; + uint64_t k1 = bit_util::ToLittleEndian(*data); ++data; HASHING_ROUND(k1); } while (data != end); @@ -106,11 +106,12 @@ uint64_t MurmurHashBitmap64A(const uint8_t* key, uint64_t seed, uint64_t bits_of k0 = *data & msb_mask; ++data; while (data != end) { - uint64_t k1 = k0 | (*data & lsb_mask); + auto data0_le = bit_util::ToLittleEndian(*data); + uint64_t k1 = k0 | (data0_le & lsb_mask); if (shift > 0) { k1 = ROTR64(k1, shift); } - k0 = *data & msb_mask; + k0 = data0_le & msb_mask; ++data; HASHING_ROUND(k1); } From b79745003efd87d0dde24793dc054e5ce215a083 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 30 May 2023 13:55:04 -0300 Subject: [PATCH 07/15] Label the tests correctly --- cpp/src/arrow/util/hashing_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/util/hashing_test.cc b/cpp/src/arrow/util/hashing_test.cc index 30a726178550..c14421c40d7b 100644 --- a/cpp/src/arrow/util/hashing_test.cc +++ b/cpp/src/arrow/util/hashing_test.cc @@ -519,7 +519,7 @@ hash_t HashConcatenation(const ArrayVector& arrays, int64_t bits_offset = -1, return HashDataBitmap(*slice->data()); } -TEST(SmallBitmapHash, Empty) { +TEST(BitmapHashTest, SmallInputs) { for (bool start : {false, true}) { auto block = BuildBooleanArray(64, start); for (int len = 0; len < 64; len++) { @@ -542,7 +542,7 @@ TEST(SmallBitmapHash, Empty) { } } -TEST(TestBitmapHash, Empty) { +TEST(BitmapHashTest, LongerInputs) { BooleanBuilder builder; std::shared_ptr block_of_bools; { From 0e7546c459adf89cea65941de05295b9bdf1351c Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 30 May 2023 13:58:43 -0300 Subject: [PATCH 08/15] Replace the unrolled loops with BitmapWordReader --- cpp/src/arrow/util/hashing.cc | 113 +++++++--------------------------- cpp/src/arrow/util/hashing.h | 3 - 2 files changed, 21 insertions(+), 95 deletions(-) diff --git a/cpp/src/arrow/util/hashing.cc b/cpp/src/arrow/util/hashing.cc index 
398960cbed92..c6d48e6026b1 100644 --- a/cpp/src/arrow/util/hashing.cc +++ b/cpp/src/arrow/util/hashing.cc @@ -16,6 +16,8 @@ // under the License. #include "arrow/util/hashing.h" +#include "arrow/util/bit_util.h" +#include "arrow/util/bitmap_reader.h" #include "arrow/util/macros.h" namespace arrow { @@ -35,9 +37,6 @@ namespace { /// /// This implementation is based on 64-bit versions of MurmurHash2 by Austin Appleby. /// -/// The key (a bitmap) is read as a sequence of 64-bit words before the trailing -/// bytes. So if the input is 64-bit aligned, all memory accesses are aligned. -/// /// It's the caller's responsibility to ensure that bits_offset + num_bits are /// readable from the bitmap. /// @@ -45,8 +44,8 @@ namespace { /// \param seed The seed for the hash function (useful when chaining hash functions). /// \param bits_offset The offset in bits relative to the start of the bitmap. /// \param num_bits The number of bits after the offset to be hashed. -uint64_t MurmurHashBitmap64A(const uint8_t* key, uint64_t seed, uint64_t bits_offset, - uint64_t num_bits) { +uint64_t MurmurHashBitmap64(const uint8_t* key, uint64_t seed, uint64_t bits_offset, + uint64_t num_bits) { const uint64_t m = 0xc6a4a7935bd1e995LLU; const int r = 47; @@ -60,91 +59,21 @@ uint64_t MurmurHashBitmap64A(const uint8_t* key, uint64_t seed, uint64_t bits_of h ^= (k); \ h *= m - // Shift key pointer by as many words as possible. - key += (bits_offset / 64) * 8; - // The shift within each word. - const uint64_t shift = bits_offset % 64; - const uint64_t readable_bits = shift + num_bits; - - const auto* data = reinterpret_cast(key); - const auto* end = data + (readable_bits / 64); - if (data == end) { - // The bitmap is entirely contained in a single word, but not all bytes of - // the word are necessarily accessible, so access is performed byte-by-byte. - // ROTR644 is not necessary and shift-right is safe because readable_bits < 64 - // when data == end. 
- if (num_bits > 0) { - // clang-format off - switch (uint64_t k0 = 0; (readable_bits + 7) / 8) { - case 8: k0 |= static_cast(key[7]) << 56; - case 7: k0 |= static_cast(key[6]) << 48; - case 6: k0 |= static_cast(key[5]) << 40; - case 5: k0 |= static_cast(key[4]) << 32; - case 4: k0 |= static_cast(key[3]) << 24; - case 3: k0 |= static_cast(key[2]) << 16; - case 2: k0 |= static_cast(key[1]) << 8; - case 1: k0 |= static_cast(key[0]); - k0 &= (0x1LLU << readable_bits) - 1; - k0 >>= shift; - HASHING_ROUND(k0); - } - // clang-format on - } - } else { - const uint64_t lsb_mask = shift ? (0x1LLU << shift) - 1 : -1; - const uint64_t msb_mask = ~lsb_mask; - uint64_t k0 = 0; - if (ARROW_PREDICT_TRUE(shift == 0)) { - // Fast case: no shift. - do { - uint64_t k1 = bit_util::ToLittleEndian(*data); - ++data; - HASHING_ROUND(k1); - } while (data != end); - } else { - // General case: shift. - k0 = *data & msb_mask; - ++data; - while (data != end) { - auto data0_le = bit_util::ToLittleEndian(*data); - uint64_t k1 = k0 | (data0_le & lsb_mask); - if (shift > 0) { - k1 = ROTR64(k1, shift); - } - k0 = data0_le & msb_mask; - ++data; - HASHING_ROUND(k1); - } - } - - const auto* trailing = reinterpret_cast(data); - const auto trailing_bits = readable_bits % 64; - uint64_t k1 = k0; - // clang-format off - switch (uint64_t final_blk = 0; (trailing_bits + 7) / 8) { - case 8: final_blk |= static_cast(trailing[7]) << 56; - case 7: final_blk |= static_cast(trailing[6]) << 48; - case 6: final_blk |= static_cast(trailing[5]) << 40; - case 5: final_blk |= static_cast(trailing[4]) << 32; - case 4: final_blk |= static_cast(trailing[3]) << 24; - case 3: final_blk |= static_cast(trailing[2]) << 16; - case 2: final_blk |= static_cast(trailing[1]) << 8; - case 1: final_blk |= static_cast(trailing[0]); - final_blk &= (0x1LLU << trailing_bits) - 1; - // Combine with the last key block (k0). 
- k1 |= final_blk & lsb_mask; - if (shift > 0) { - k1 = ROTR64(k1, shift); - } - k0 = final_blk & msb_mask; - HASHING_ROUND(k1); - } - // clang-format on - // Perform a final round with bits from k0 if it still contains relevant bits. - if (shift > 0 && (trailing_bits == 0 || trailing_bits > shift)) { - k1 = ROTR64(k0, shift); - HASHING_ROUND(k1); - } + BitmapWordReader reader(key, bits_offset, num_bits); + auto nwords = reader.words(); + while (nwords--) { + auto k = reader.NextWord(); + HASHING_ROUND(k); + } + auto nbytes = reader.trailing_bytes(); + if (nbytes) { + uint64_t k = 0; + do { + int valid_bits; + auto byte = reader.NextTrailingByte(valid_bits); + k = (k << 8) | static_cast(byte); + } while (--nbytes); + HASHING_ROUND(k); } // Finalize. @@ -160,8 +89,8 @@ hash_t ComputeBitmapHash(const uint8_t* bitmap, int64_t length, hash_t seed, int64_t bits_offset, int64_t num_bits) { DCHECK_GE(bits_offset, 0); DCHECK_GE(num_bits, 0); - DCHECK_LE((bits_offset + num_bits + 7) / 8, length); - return MurmurHashBitmap64A(bitmap, seed, bits_offset, num_bits); + DCHECK_LE(bit_util::BytesForBits(bits_offset + num_bits), length); + return MurmurHashBitmap64(bitmap, seed, bits_offset, num_bits); } } // namespace internal diff --git a/cpp/src/arrow/util/hashing.h b/cpp/src/arrow/util/hashing.h index 86993adab1fe..dc79b4ff8008 100644 --- a/cpp/src/arrow/util/hashing.h +++ b/cpp/src/arrow/util/hashing.h @@ -66,9 +66,6 @@ inline hash_t ComputeStringHash(const void* data, int64_t length); /// \brief A hash function for bitmaps that can handle offsets and lengths in /// terms of number of bits. The hash only depends on the bits actually hashed. /// -/// The key (a bitmap) is read as a sequence of 64-bit words before the trailing -/// bytes. So if the input is 64-bit aligned, all memory accesses are aligned. -/// /// It's the caller's responsibility to ensure that bits_offset + num_bits are /// readable from the bitmap. 
///
From bee4183ab64929aabca1a9a87f7c428fa7fcc079 Mon Sep 17 00:00:00 2001
From: Felipe Oliveira Carvalho
Date: Tue, 30 May 2023 18:53:24 -0300
Subject: [PATCH 09/15] Make the last round shorter like in original MurmurHash

---
 cpp/src/arrow/util/hashing.cc | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/cpp/src/arrow/util/hashing.cc b/cpp/src/arrow/util/hashing.cc
index c6d48e6026b1..e2720c99c0e8 100644
--- a/cpp/src/arrow/util/hashing.cc
+++ b/cpp/src/arrow/util/hashing.cc
@@ -51,32 +51,29 @@ uint64_t MurmurHashBitmap64(const uint8_t* key, uint64_t seed, uint64_t bits_off

 uint64_t h = seed ^ (num_bits * m);

-#define HASHING_ROUND(k) \
- (k) *= m; \
- (k) ^= (k) >> r; \
- (k) *= m; \
- \
- h ^= (k); \
- h *= m
-
 BitmapWordReader<uint64_t> reader(key, bits_offset, num_bits);
 auto nwords = reader.words();
 while (nwords--) {
 auto k = reader.NextWord();
- HASHING_ROUND(k);
+ k *= m;
+ k ^= k >> r;
+ k *= m;
+
+ h ^= k;
+ h *= m;
 }

+ int valid_bits;
 auto nbytes = reader.trailing_bytes();
 if (nbytes) {
 uint64_t k = 0;
 do {
- int valid_bits;
 auto byte = reader.NextTrailingByte(valid_bits);
 k = (k << 8) | static_cast<uint64_t>(byte);
 } while (--nbytes);
- HASHING_ROUND(k);
+ h ^= k;
+ h *= m;
 }

- // Finalize.
h ^= h >> r; h *= m; h ^= h >> r; From 1ea600f17cbd792a1b7a7d0a646c6fb947d43edf Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 30 May 2023 14:14:21 -0300 Subject: [PATCH 10/15] Review the tests - Simplify the tests by dropping many cases - Add tests on the C++ side as well --- cpp/src/arrow/scalar.cc | 6 +- cpp/src/arrow/scalar_test.cc | 10 +++ cpp/src/arrow/util/hashing_test.cc | 127 +++++++++-------------------- 3 files changed, 49 insertions(+), 94 deletions(-) diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index d565031dada9..3d32895655bc 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -177,15 +177,13 @@ struct ScalarHashImpl { // Hash the relevant child arrays for each type taking offset and length // from the parent array into account if necessary. switch (a.type->id()) { - case Type::RUN_END_ENCODED: - // Hash only the values, not the run lengths. - RETURN_NOT_OK(ArrayHash(a.child_data[1], offset, length)); - break; case Type::STRUCT: for (const auto& child : a.child_data) { RETURN_NOT_OK(ArrayHash(child, offset, length)); } break; + // TODO(GH-35830): Investigate what should be the correct behavior for + // each nested type. default: // By default, just hash the arrays without considering // the offset and length of the parent. diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index 5d74eafcfcc0..089b25265ebe 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -1121,6 +1121,16 @@ class TestListScalar : public ::testing::Test { ASSERT_NE(set_bitmap_scalar->value->data()->buffers[0], nullptr); // ... 
yet it's hashing equal to the other scalar ASSERT_EQ(empty_bitmap_scalar.hash(), set_bitmap_scalar->hash()); + + // GH-35360: the hash value of a scalar from a list of structs should + // pay attention to the offset so it hashes the equivalent validity bitmap + auto list_struct_type = list(struct_({field("a", int64())})); + auto a = + ArrayFromJSON(list_struct_type, R"([[{"a": 5}, {"a": 6}], [{"a": 7}, null]])"); + auto b = ArrayFromJSON(list_struct_type, R"([[{"a": 7}, null]])"); + EXPECT_OK_AND_ASSIGN(auto a1, a->GetScalar(1)); + EXPECT_OK_AND_ASSIGN(auto b0, b->GetScalar(0)); + ASSERT_EQ(a1->hash(), b0->hash()); } protected: diff --git a/cpp/src/arrow/util/hashing_test.cc b/cpp/src/arrow/util/hashing_test.cc index c14421c40d7b..8a0be3ba342b 100644 --- a/cpp/src/arrow/util/hashing_test.cc +++ b/cpp/src/arrow/util/hashing_test.cc @@ -501,7 +501,7 @@ std::shared_ptr BuildBooleanArray(int len, bool start) { // This could be memoized in the future to speed up tests. BooleanBuilder builder; for (int i = 0; i < len; ++i) { - EXPECT_TRUE(builder.Append(((i % 2) ^ start) == 1).ok()); + EXPECT_TRUE(builder.Append(((i % 2 != 0) ^ start) == 1).ok()); } std::shared_ptr array; EXPECT_TRUE(builder.Finish(&array).ok()); @@ -531,7 +531,7 @@ TEST(BitmapHashTest, SmallInputs) { ASSERT_EQ(expected_hash, slice_hash); for (int j = 1; j < len; j++) { - auto fragment = BuildBooleanArray(len - j, start ^ (j % 2)); + auto fragment = BuildBooleanArray(len - j, start ^ (j % 2 != 0)); expected_hash = HashDataBitmap(*fragment->data()); slice = block->Slice(j, len - j); @@ -560,96 +560,43 @@ TEST(BitmapHashTest, LongerInputs) { } const auto hash_of_block = HashDataBitmap(*block_of_bools->data()); - std::shared_ptr negated_block_of_bools; - { - ASSERT_OK(builder.AppendValues(2, false)); - ASSERT_OK(builder.AppendValues(3, true)); - ASSERT_OK(builder.AppendValues(5, false)); - ASSERT_OK(builder.AppendValues(7, true)); - ASSERT_OK(builder.AppendValues(11, false)); - 
ASSERT_OK(builder.AppendValues(13, true)); - ASSERT_OK(builder.AppendValues(17, false)); - ASSERT_OK(builder.AppendValues(5, true)); - ASSERT_OK(builder.AppendValues(1, false)); - ASSERT_OK(builder.Finish(&negated_block_of_bools)); - ASSERT_EQ(negated_block_of_bools->length(), 64); - } - const auto hash_of_negated_block = HashDataBitmap(*negated_block_of_bools->data()); - - constexpr bool kSlowTests = false; - constexpr auto kMaxPadding = 64 + 32 + 1; - auto step = [&](int& i) { - const auto kStep = kSlowTests ? 1 : 8; - if (i + kStep >= kMaxPadding && i != kMaxPadding - 1) { - i = kMaxPadding - 1; - } else { - i += kStep; - } - }; - - for (int start_bits = 0; start_bits < 8; ++start_bits) { - for (int prefix_pad_len = 0; prefix_pad_len < kMaxPadding; step(prefix_pad_len)) { - auto prefix_pad = BuildBooleanArray(prefix_pad_len, start_bits & 0x1); - for (int suffix_pad_len = 0; suffix_pad_len < kMaxPadding; step(suffix_pad_len)) { - auto suffix_pad = BuildBooleanArray(suffix_pad_len, (start_bits >> 1) & 0x1); - - // A block of 64 bools in the middle - auto hash = HashConcatenation({prefix_pad, block_of_bools, suffix_pad}, - prefix_pad_len, 64); - ASSERT_EQ(hash, hash_of_block); - // Negated - hash = HashConcatenation({prefix_pad, negated_block_of_bools, suffix_pad}, - prefix_pad_len, 64); - ASSERT_EQ(hash, hash_of_negated_block); - - std::shared_ptr bools; - // Trailing bits and leading bits around a block - for (int trailing_len = 1; trailing_len < kMaxPadding; step(trailing_len)) { - auto trailing = BuildBooleanArray(trailing_len, (start_bits >> 1) & 0x1); - auto expected_hash = HashConcatenation({block_of_bools, trailing}); - auto hash = - HashConcatenation({prefix_pad, block_of_bools, trailing, suffix_pad}, - prefix_pad_len, 64 + trailing_len); - ASSERT_EQ(hash, expected_hash); - // Negated - expected_hash = HashConcatenation({negated_block_of_bools, trailing}); - hash = HashConcatenation( - {prefix_pad, negated_block_of_bools, trailing, suffix_pad}, 
prefix_pad_len, - 64 + trailing_len); - ASSERT_EQ(hash, expected_hash); - - // Use the trailing bits as leading bits now - auto leading = trailing; - auto leading_len = trailing_len; - expected_hash = HashConcatenation({leading, block_of_bools}); - hash = HashConcatenation({prefix_pad, leading, block_of_bools, suffix_pad}, - prefix_pad_len, leading_len + 64); - ASSERT_EQ(hash, expected_hash); - // Negated - expected_hash = HashConcatenation({leading, negated_block_of_bools}); - hash = - HashConcatenation({prefix_pad, leading, negated_block_of_bools, suffix_pad}, - prefix_pad_len, leading_len + 64); - ASSERT_EQ(hash, expected_hash); - - // leading and trailing at the same time - expected_hash = HashConcatenation({leading, block_of_bools, trailing}); - hash = HashConcatenation( - {prefix_pad, leading, block_of_bools, trailing, suffix_pad}, prefix_pad_len, - leading_len + 64 + trailing_len); - ASSERT_EQ(hash, expected_hash); - // Negated - expected_hash = HashConcatenation({leading, negated_block_of_bools, trailing}); - hash = HashConcatenation( - {prefix_pad, leading, negated_block_of_bools, trailing, suffix_pad}, - prefix_pad_len, leading_len + 64 + trailing_len); - ASSERT_EQ(hash, expected_hash); - } + const auto kStep = 9; + constexpr auto kMaxPadding = 64 + 32 + kStep; + + for (int prefix_pad_len = 0; prefix_pad_len < kMaxPadding; prefix_pad_len += kStep) { + auto prefix_pad = BuildBooleanArray(prefix_pad_len, true); + for (int suffix_pad_len = 0; suffix_pad_len < kMaxPadding; suffix_pad_len += kStep) { + auto suffix_pad = BuildBooleanArray(suffix_pad_len, true); + + // A block of 64 bools in the middle + auto hash = + HashConcatenation({prefix_pad, block_of_bools, suffix_pad}, prefix_pad_len, 64); + ASSERT_EQ(hash, hash_of_block); + + // Trailing bits and leading bits around a block + for (int trailing_len = 1; trailing_len < kMaxPadding; trailing_len += kStep) { + auto trailing = BuildBooleanArray(trailing_len, true); + auto expected_hash = 
HashConcatenation({block_of_bools, trailing}); + auto hash = HashConcatenation({prefix_pad, block_of_bools, trailing, suffix_pad}, + prefix_pad_len, 64 + trailing_len); + ASSERT_EQ(hash, expected_hash); + + // Use the trailing bits as leading bits now + auto leading = trailing; + auto leading_len = trailing_len; + expected_hash = HashConcatenation({leading, block_of_bools}); + hash = HashConcatenation({prefix_pad, leading, block_of_bools, suffix_pad}, + prefix_pad_len, leading_len + 64); + ASSERT_EQ(hash, expected_hash); + + // Leading and trailing at the same time + expected_hash = HashConcatenation({leading, block_of_bools, trailing}); + hash = + HashConcatenation({prefix_pad, leading, block_of_bools, trailing, suffix_pad}, + prefix_pad_len, leading_len + 64 + trailing_len); + ASSERT_EQ(hash, expected_hash); } } - if (!kSlowTests) { - break; - } } } From 6b29a350909a3bf8a2dc5e2713eff4e4868d0c15 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 31 May 2023 14:27:05 -0300 Subject: [PATCH 11/15] Checks that hashing is selective Co-authored-by: Antoine Pitrou --- cpp/src/arrow/scalar_test.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index 089b25265ebe..2429d610c282 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -1128,9 +1128,11 @@ class TestListScalar : public ::testing::Test { auto a = ArrayFromJSON(list_struct_type, R"([[{"a": 5}, {"a": 6}], [{"a": 7}, null]])"); auto b = ArrayFromJSON(list_struct_type, R"([[{"a": 7}, null]])"); + EXPECT_OK_AND_ASSIGN(auto a0, a->GetScalar(1)); EXPECT_OK_AND_ASSIGN(auto a1, a->GetScalar(1)); EXPECT_OK_AND_ASSIGN(auto b0, b->GetScalar(0)); ASSERT_EQ(a1->hash(), b0->hash()); + ASSERT_NE(a0->hash(), b0->hash()); } protected: From 1f852e3d9da0d32dfb36fd1f7c94c6cd29877643 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 31 May 2023 14:23:03 -0300 Subject: [PATCH 12/15] Speed-up the tests --- 
cpp/src/arrow/util/hashing_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/util/hashing_test.cc b/cpp/src/arrow/util/hashing_test.cc index 8a0be3ba342b..857a17c12fe7 100644 --- a/cpp/src/arrow/util/hashing_test.cc +++ b/cpp/src/arrow/util/hashing_test.cc @@ -560,7 +560,7 @@ TEST(BitmapHashTest, LongerInputs) { } const auto hash_of_block = HashDataBitmap(*block_of_bools->data()); - const auto kStep = 9; + const auto kStep = 13; constexpr auto kMaxPadding = 64 + 32 + kStep; for (int prefix_pad_len = 0; prefix_pad_len < kMaxPadding; prefix_pad_len += kStep) { From 9ee3d1fc4ef558aac3bd96099697e55370851029 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 31 May 2023 14:24:37 -0300 Subject: [PATCH 13/15] Removing redundant length parameter --- cpp/src/arrow/scalar.cc | 3 +-- cpp/src/arrow/util/hashing.cc | 5 ++--- cpp/src/arrow/util/hashing.h | 5 ++--- cpp/src/arrow/util/hashing_test.cc | 2 +- 4 files changed, 6 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index 3d32895655bc..0537ddafe296 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -155,7 +155,6 @@ struct ScalarHashImpl { Status ArrayHash(const ArraySpan& a, int64_t offset, int64_t length) { // Calculate null count within the range const auto* validity = a.buffers[0].data; - const int64_t validity_size = a.buffers[0].size; int64_t null_count = 0; if (validity != NULLPTR) { if (offset == a.offset && length == a.length) { @@ -170,7 +169,7 @@ struct ScalarHashImpl { // We can't visit values without unboxing the whole array, so only hash // the null bitmap for now. Only hash the null bitmap if the null count // is not 0 to ensure hash consistency. 
- hash_ = internal::ComputeBitmapHash(validity, validity_size, /*seed=*/hash_, + hash_ = internal::ComputeBitmapHash(validity, /*seed=*/hash_, /*bits_offset=*/offset, /*num_bits=*/length); } diff --git a/cpp/src/arrow/util/hashing.cc b/cpp/src/arrow/util/hashing.cc index e2720c99c0e8..e64f4ea8172e 100644 --- a/cpp/src/arrow/util/hashing.cc +++ b/cpp/src/arrow/util/hashing.cc @@ -82,11 +82,10 @@ uint64_t MurmurHashBitmap64(const uint8_t* key, uint64_t seed, uint64_t bits_off } // namespace -hash_t ComputeBitmapHash(const uint8_t* bitmap, int64_t length, hash_t seed, - int64_t bits_offset, int64_t num_bits) { +hash_t ComputeBitmapHash(const uint8_t* bitmap, hash_t seed, int64_t bits_offset, + int64_t num_bits) { DCHECK_GE(bits_offset, 0); DCHECK_GE(num_bits, 0); - DCHECK_LE(bit_util::BytesForBits(bits_offset + num_bits), length); return MurmurHashBitmap64(bitmap, seed, bits_offset, num_bits); } diff --git a/cpp/src/arrow/util/hashing.h b/cpp/src/arrow/util/hashing.h index dc79b4ff8008..2de9f4153248 100644 --- a/cpp/src/arrow/util/hashing.h +++ b/cpp/src/arrow/util/hashing.h @@ -71,14 +71,13 @@ inline hash_t ComputeStringHash(const void* data, int64_t length); /// /// \pre bits_offset >= 0 /// \pre num_bits >= 0 -/// \pre (bits_offset + num_bits + 7) / 8 <= length +/// \pre (bits_offset + num_bits + 7) / 8 <= readable length in bytes from bitmap /// /// \param bitmap The pointer to the bitmap. -/// \param length The length of the bitmap in bytes. /// \param seed The seed for the hash function (useful when chaining hash functions). /// \param bits_offset The offset in bits relative to the start of the bitmap. /// \param num_bits The number of bits after the offset to be hashed. 
-ARROW_EXPORT hash_t ComputeBitmapHash(const uint8_t* bitmap, int64_t length, hash_t seed, +ARROW_EXPORT hash_t ComputeBitmapHash(const uint8_t* bitmap, hash_t seed, int64_t bits_offset, int64_t num_bits); template diff --git a/cpp/src/arrow/util/hashing_test.cc b/cpp/src/arrow/util/hashing_test.cc index 857a17c12fe7..03975bdac970 100644 --- a/cpp/src/arrow/util/hashing_test.cc +++ b/cpp/src/arrow/util/hashing_test.cc @@ -491,7 +491,7 @@ TEST(BinaryMemoTable, Empty) { hash_t HashDataBitmap(const ArraySpan& array) { EXPECT_EQ(array.type->id(), Type::BOOL); const auto& bitmap = array.buffers[1]; - return ComputeBitmapHash(bitmap.data, bitmap.size, + return ComputeBitmapHash(bitmap.data, /*seed=*/0, /*bit_offset=*/array.offset, /*num_bits=*/array.length); From d2c4a07f4a3d25f0046377d85a4c5f804d4f152f Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Thu, 1 Jun 2023 00:42:29 -0300 Subject: [PATCH 14/15] fixup! Checks that hashing is selective --- cpp/src/arrow/scalar_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index 2429d610c282..1c7572a1c868 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -1128,7 +1128,7 @@ class TestListScalar : public ::testing::Test { auto a = ArrayFromJSON(list_struct_type, R"([[{"a": 5}, {"a": 6}], [{"a": 7}, null]])"); auto b = ArrayFromJSON(list_struct_type, R"([[{"a": 7}, null]])"); - EXPECT_OK_AND_ASSIGN(auto a0, a->GetScalar(1)); + EXPECT_OK_AND_ASSIGN(auto a0, a->GetScalar(0)); EXPECT_OK_AND_ASSIGN(auto a1, a->GetScalar(1)); EXPECT_OK_AND_ASSIGN(auto b0, b->GetScalar(0)); ASSERT_EQ(a1->hash(), b0->hash()); From 7e84649c0e1bec6d218389aa10ced55cbc3eeb2d Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 1 Jun 2023 17:53:11 +0200 Subject: [PATCH 15/15] Remove unused macro --- cpp/src/arrow/util/hashing.cc | 7 ------- 1 file changed, 7 deletions(-) diff --git a/cpp/src/arrow/util/hashing.cc 
b/cpp/src/arrow/util/hashing.cc index e64f4ea8172e..fe24ff82952b 100644 --- a/cpp/src/arrow/util/hashing.cc +++ b/cpp/src/arrow/util/hashing.cc @@ -25,13 +25,6 @@ namespace internal { namespace { -/// \brief Rotate-right, 64-bit. -/// -/// This compiles to a single instruction on CPUs that have rotation instructions. -/// -/// \pre n must be in the range [1, 64]. -#define ROTR64(v, n) (((v) >> (n)) | ((v) << (64 - (n)))) - /// \brief A hash function for bitmaps that can handle offsets and lengths in /// terms of number of bits. The hash only depends on the bits actually hashed. ///