diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index d928cdf58b11..84539755489b 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -216,6 +216,7 @@ set(ARROW_SRCS util/delimiting.cc util/formatting.cc util/future.cc + util/hashing.cc util/int_util.cc util/io_util.cc util/logging.cc diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc index 1e40266b3cae..0537ddafe296 100644 --- a/cpp/src/arrow/scalar.cc +++ b/cpp/src/arrow/scalar.cc @@ -29,6 +29,7 @@ #include "arrow/compare.h" #include "arrow/pretty_print.h" #include "arrow/type.h" +#include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" #include "arrow/util/formatting.h" @@ -151,20 +152,50 @@ struct ScalarHashImpl { Status ArrayHash(const Array& a) { return ArrayHash(*a.data()); } - Status ArrayHash(const ArrayData& a) { - RETURN_NOT_OK(StdHash(a.length) & StdHash(a.GetNullCount())); - if (a.GetNullCount() != 0 && a.buffers[0] != nullptr) { + Status ArrayHash(const ArraySpan& a, int64_t offset, int64_t length) { + // Calculate null count within the range + const auto* validity = a.buffers[0].data; + int64_t null_count = 0; + if (validity != NULLPTR) { + if (offset == a.offset && length == a.length) { + null_count = a.GetNullCount(); + } else { + null_count = length - internal::CountSetBits(validity, offset, length); + } + } + + RETURN_NOT_OK(StdHash(length) & StdHash(null_count)); + if (null_count != 0) { // We can't visit values without unboxing the whole array, so only hash // the null bitmap for now. Only hash the null bitmap if the null count // is not 0 to ensure hash consistency. 
- RETURN_NOT_OK(BufferHash(*a.buffers[0])); + hash_ = internal::ComputeBitmapHash(validity, /*seed=*/hash_, + /*bits_offset=*/offset, /*num_bits=*/length); } - for (const auto& child : a.child_data) { - RETURN_NOT_OK(ArrayHash(*child)); + + // Hash the relevant child arrays for each type taking offset and length + // from the parent array into account if necessary. + switch (a.type->id()) { + case Type::STRUCT: + for (const auto& child : a.child_data) { + RETURN_NOT_OK(ArrayHash(child, offset, length)); + } + break; + // TODO(GH-35830): Investigate what should be the correct behavior for + // each nested type. + default: + // By default, just hash the arrays without considering + // the offset and length of the parent. + for (const auto& child : a.child_data) { + RETURN_NOT_OK(ArrayHash(child)); + } + break; } return Status::OK(); } + Status ArrayHash(const ArraySpan& a) { return ArrayHash(a, a.offset, a.length); } + explicit ScalarHashImpl(const Scalar& scalar) : hash_(scalar.type->Hash()) { AccumulateHashFrom(scalar); } diff --git a/cpp/src/arrow/scalar_test.cc b/cpp/src/arrow/scalar_test.cc index 5d74eafcfcc0..1c7572a1c868 100644 --- a/cpp/src/arrow/scalar_test.cc +++ b/cpp/src/arrow/scalar_test.cc @@ -1121,6 +1121,18 @@ class TestListScalar : public ::testing::Test { ASSERT_NE(set_bitmap_scalar->value->data()->buffers[0], nullptr); // ... 
yet it's hashing equal to the other scalar ASSERT_EQ(empty_bitmap_scalar.hash(), set_bitmap_scalar->hash()); + + // GH-35360: the hash value of a scalar from a list of structs should + // pay attention to the offset so it hashes the equivalent validity bitmap + auto list_struct_type = list(struct_({field("a", int64())})); + auto a = + ArrayFromJSON(list_struct_type, R"([[{"a": 5}, {"a": 6}], [{"a": 7}, null]])"); + auto b = ArrayFromJSON(list_struct_type, R"([[{"a": 7}, null]])"); + EXPECT_OK_AND_ASSIGN(auto a0, a->GetScalar(0)); + EXPECT_OK_AND_ASSIGN(auto a1, a->GetScalar(1)); + EXPECT_OK_AND_ASSIGN(auto b0, b->GetScalar(0)); + ASSERT_EQ(a1->hash(), b0->hash()); + ASSERT_NE(a0->hash(), b0->hash()); } protected: diff --git a/cpp/src/arrow/util/hashing.cc b/cpp/src/arrow/util/hashing.cc new file mode 100644 index 000000000000..fe24ff82952b --- /dev/null +++ b/cpp/src/arrow/util/hashing.cc @@ -0,0 +1,86 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+#include "arrow/util/hashing.h"
+#include "arrow/util/bit_util.h"
+#include "arrow/util/bitmap_reader.h"
+#include "arrow/util/macros.h"
+
+namespace arrow {
+namespace internal {
+
+namespace {
+
+/// \brief A hash function for bitmaps that can handle offsets and lengths in
+/// terms of number of bits. The hash only depends on the bits actually hashed.
+///
+/// This implementation is based on 64-bit versions of MurmurHash2 by Austin Appleby.
+///
+/// It's the caller's responsibility to ensure that bits_offset + num_bits are
+/// readable from the bitmap.
+///
+/// \param key The pointer to the bitmap.
+/// \param seed The seed for the hash function (useful when chaining hash functions).
+/// \param bits_offset The offset in bits relative to the start of the bitmap.
+/// \param num_bits The number of bits after the offset to be hashed.
+uint64_t MurmurHashBitmap64(const uint8_t* key, uint64_t seed, uint64_t bits_offset,
+                            uint64_t num_bits) {
+  const uint64_t m = 0xc6a4a7935bd1e995LLU;
+  const int r = 47;
+
+  uint64_t h = seed ^ (num_bits * m);
+
+  BitmapWordReader<uint64_t> reader(key, bits_offset, num_bits);
+  auto nwords = reader.words();
+  while (nwords--) {
+    auto k = reader.NextWord();
+    k *= m;
+    k ^= k >> r;
+    k *= m;
+
+    h ^= k;
+    h *= m;
+  }
+  int valid_bits;
+  auto nbytes = reader.trailing_bytes();
+  if (nbytes) {
+    uint64_t k = 0;
+    do {
+      auto byte = reader.NextTrailingByte(valid_bits);
+      k = (k << 8) | static_cast<uint64_t>(byte);
+    } while (--nbytes);
+    h ^= k;
+    h *= m;
+  }
+
+  h ^= h >> r;
+  h *= m;
+  h ^= h >> r;
+  return h;
+}
+
+}  // namespace
+
+hash_t ComputeBitmapHash(const uint8_t* bitmap, hash_t seed, int64_t bits_offset,
+                         int64_t num_bits) {
+  DCHECK_GE(bits_offset, 0);
+  DCHECK_GE(num_bits, 0);
+  return MurmurHashBitmap64(bitmap, seed, bits_offset, num_bits);
+}
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/hashing.h b/cpp/src/arrow/util/hashing.h
index 6656492b3f09..2de9f4153248 100644
---
a/cpp/src/arrow/util/hashing.h +++ b/cpp/src/arrow/util/hashing.h @@ -63,6 +63,23 @@ typedef uint64_t hash_t; template inline hash_t ComputeStringHash(const void* data, int64_t length); +/// \brief A hash function for bitmaps that can handle offsets and lengths in +/// terms of number of bits. The hash only depends on the bits actually hashed. +/// +/// It's the caller's responsibility to ensure that bits_offset + num_bits are +/// readable from the bitmap. +/// +/// \pre bits_offset >= 0 +/// \pre num_bits >= 0 +/// \pre (bits_offset + num_bits + 7) / 8 <= readable length in bytes from bitmap +/// +/// \param bitmap The pointer to the bitmap. +/// \param seed The seed for the hash function (useful when chaining hash functions). +/// \param bits_offset The offset in bits relative to the start of the bitmap. +/// \param num_bits The number of bits after the offset to be hashed. +ARROW_EXPORT hash_t ComputeBitmapHash(const uint8_t* bitmap, hash_t seed, + int64_t bits_offset, int64_t num_bits); + template struct ScalarHelperBase { static bool CompareScalars(Scalar u, Scalar v) { return u == v; } diff --git a/cpp/src/arrow/util/hashing_test.cc b/cpp/src/arrow/util/hashing_test.cc index ac212e9fb4fa..03975bdac970 100644 --- a/cpp/src/arrow/util/hashing_test.cc +++ b/cpp/src/arrow/util/hashing_test.cc @@ -28,6 +28,8 @@ #include #include +#include "arrow/array/builder_primitive.h" +#include "arrow/array/concatenate.h" #include "arrow/testing/gtest_util.h" #include "arrow/util/bit_util.h" #include "arrow/util/hashing.h" @@ -486,5 +488,117 @@ TEST(BinaryMemoTable, Empty) { EXPECT_EQ(offsets[0], 0); } +hash_t HashDataBitmap(const ArraySpan& array) { + EXPECT_EQ(array.type->id(), Type::BOOL); + const auto& bitmap = array.buffers[1]; + return ComputeBitmapHash(bitmap.data, + /*seed=*/0, + /*bit_offset=*/array.offset, + /*num_bits=*/array.length); +} + +std::shared_ptr BuildBooleanArray(int len, bool start) { + // This could be memoized in the future to speed up tests. 
+  BooleanBuilder builder;
+  for (int i = 0; i < len; ++i) {
+    EXPECT_TRUE(builder.Append(((i % 2 != 0) ^ start) == 1).ok());
+  }
+  std::shared_ptr<Array> array;
+  EXPECT_TRUE(builder.Finish(&array).ok());
+  return array;
+}
+
+hash_t HashConcatenation(const ArrayVector& arrays, int64_t bits_offset = -1,
+                         int64_t num_bits = -1) {
+  EXPECT_OK_AND_ASSIGN(auto concat, Concatenate(arrays));
+  EXPECT_EQ(concat->type()->id(), Type::BOOL);
+  if (bits_offset == -1 || num_bits == -1) {
+    return HashDataBitmap(*concat->data());
+  }
+  auto slice = concat->Slice(bits_offset, num_bits);
+  return HashDataBitmap(*slice->data());
+}
+
+TEST(BitmapHashTest, SmallInputs) {
+  for (bool start : {false, true}) {
+    auto block = BuildBooleanArray(64, start);
+    for (int len = 0; len < 64; len++) {
+      auto prefix = BuildBooleanArray(len, start);
+      auto expected_hash = HashDataBitmap(*prefix->data());
+
+      auto slice = block->Slice(0, len);
+      auto slice_hash = HashDataBitmap(*slice->data());
+      ASSERT_EQ(expected_hash, slice_hash);
+
+      for (int j = 1; j < len; j++) {
+        auto fragment = BuildBooleanArray(len - j, start ^ (j % 2 != 0));
+        expected_hash = HashDataBitmap(*fragment->data());
+
+        slice = block->Slice(j, len - j);
+        slice_hash = HashDataBitmap(*slice->data());
+        ASSERT_EQ(expected_hash, slice_hash);
+      }
+    }
+  }
+}
+
+TEST(BitmapHashTest, LongerInputs) {
+  BooleanBuilder builder;
+  std::shared_ptr<Array> block_of_bools;
+  {
+    ASSERT_OK(builder.AppendValues(2, true));
+    ASSERT_OK(builder.AppendValues(3, false));
+    ASSERT_OK(builder.AppendValues(5, true));
+    ASSERT_OK(builder.AppendValues(7, false));
+    ASSERT_OK(builder.AppendValues(11, true));
+    ASSERT_OK(builder.AppendValues(13, false));
+    ASSERT_OK(builder.AppendValues(17, true));
+    ASSERT_OK(builder.AppendValues(5, false));
+    ASSERT_OK(builder.AppendValues(1, true));
+    ASSERT_OK(builder.Finish(&block_of_bools));
+    ASSERT_EQ(block_of_bools->length(), 64);
+  }
+  const auto hash_of_block = HashDataBitmap(*block_of_bools->data());
+
+  const auto kStep
= 13; + constexpr auto kMaxPadding = 64 + 32 + kStep; + + for (int prefix_pad_len = 0; prefix_pad_len < kMaxPadding; prefix_pad_len += kStep) { + auto prefix_pad = BuildBooleanArray(prefix_pad_len, true); + for (int suffix_pad_len = 0; suffix_pad_len < kMaxPadding; suffix_pad_len += kStep) { + auto suffix_pad = BuildBooleanArray(suffix_pad_len, true); + + // A block of 64 bools in the middle + auto hash = + HashConcatenation({prefix_pad, block_of_bools, suffix_pad}, prefix_pad_len, 64); + ASSERT_EQ(hash, hash_of_block); + + // Trailing bits and leading bits around a block + for (int trailing_len = 1; trailing_len < kMaxPadding; trailing_len += kStep) { + auto trailing = BuildBooleanArray(trailing_len, true); + auto expected_hash = HashConcatenation({block_of_bools, trailing}); + auto hash = HashConcatenation({prefix_pad, block_of_bools, trailing, suffix_pad}, + prefix_pad_len, 64 + trailing_len); + ASSERT_EQ(hash, expected_hash); + + // Use the trailing bits as leading bits now + auto leading = trailing; + auto leading_len = trailing_len; + expected_hash = HashConcatenation({leading, block_of_bools}); + hash = HashConcatenation({prefix_pad, leading, block_of_bools, suffix_pad}, + prefix_pad_len, leading_len + 64); + ASSERT_EQ(hash, expected_hash); + + // Leading and trailing at the same time + expected_hash = HashConcatenation({leading, block_of_bools, trailing}); + hash = + HashConcatenation({prefix_pad, leading, block_of_bools, trailing, suffix_pad}, + prefix_pad_len, leading_len + 64 + trailing_len); + ASSERT_EQ(hash, expected_hash); + } + } + } +} + } // namespace internal } // namespace arrow diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index ca2d29e5dac2..1e6a3f29e0d7 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -143,6 +143,15 @@ def test_hashing(): assert len(set_from_array) == 500 +def test_hashing_struct_scalar(): + # GH-35360 + a = pa.array([[{'a': 5}, {'a': 
6}], [{'a': 7}, None]]) + b = pa.array([[{'a': 7}, None]]) + hash1 = hash(a[1]) + hash2 = hash(b[0]) + assert hash1 == hash2 + + def test_bool(): false = pa.scalar(False) true = pa.scalar(True)