Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,7 @@ set(ARROW_SRCS
util/delimiting.cc
util/formatting.cc
util/future.cc
util/hashing.cc
util/int_util.cc
util/io_util.cc
util/logging.cc
Expand Down
43 changes: 37 additions & 6 deletions cpp/src/arrow/scalar.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "arrow/compare.h"
#include "arrow/pretty_print.h"
#include "arrow/type.h"
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/decimal.h"
#include "arrow/util/formatting.h"
Expand Down Expand Up @@ -151,20 +152,50 @@ struct ScalarHashImpl {

Status ArrayHash(const Array& a) { return ArrayHash(*a.data()); }

Status ArrayHash(const ArrayData& a) {
RETURN_NOT_OK(StdHash(a.length) & StdHash(a.GetNullCount()));
if (a.GetNullCount() != 0 && a.buffers[0] != nullptr) {
Status ArrayHash(const ArraySpan& a, int64_t offset, int64_t length) {
// Calculate null count within the range
const auto* validity = a.buffers[0].data;
int64_t null_count = 0;
if (validity != NULLPTR) {
if (offset == a.offset && length == a.length) {
null_count = a.GetNullCount();
} else {
null_count = length - internal::CountSetBits(validity, offset, length);
}
}

RETURN_NOT_OK(StdHash(length) & StdHash(null_count));
if (null_count != 0) {
// We can't visit values without unboxing the whole array, so only hash
// the null bitmap for now. Only hash the null bitmap if the null count
// is not 0 to ensure hash consistency.
RETURN_NOT_OK(BufferHash(*a.buffers[0]));
hash_ = internal::ComputeBitmapHash(validity, /*seed=*/hash_,
/*bits_offset=*/offset, /*num_bits=*/length);
}
for (const auto& child : a.child_data) {
RETURN_NOT_OK(ArrayHash(*child));

// Hash the relevant child arrays for each type taking offset and length
// from the parent array into account if necessary.
switch (a.type->id()) {
case Type::STRUCT:
for (const auto& child : a.child_data) {
RETURN_NOT_OK(ArrayHash(child, offset, length));
}
break;
// TODO(GH-35830): Investigate what should be the correct behavior for
// each nested type.
default:
// By default, just hash the arrays without considering
// the offset and length of the parent.
for (const auto& child : a.child_data) {
RETURN_NOT_OK(ArrayHash(child));
}
break;
}
return Status::OK();
}

Status ArrayHash(const ArraySpan& a) { return ArrayHash(a, a.offset, a.length); }

explicit ScalarHashImpl(const Scalar& scalar) : hash_(scalar.type->Hash()) {
AccumulateHashFrom(scalar);
}
Expand Down
12 changes: 12 additions & 0 deletions cpp/src/arrow/scalar_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1121,6 +1121,18 @@ class TestListScalar : public ::testing::Test {
ASSERT_NE(set_bitmap_scalar->value->data()->buffers[0], nullptr);
// ... yet it's hashing equal to the other scalar
ASSERT_EQ(empty_bitmap_scalar.hash(), set_bitmap_scalar->hash());

// GH-35360: the hash value of a scalar from a list of structs should
// pay attention to the offset so it hashes the equivalent validity bitmap
auto list_struct_type = list(struct_({field("a", int64())}));
auto a =
ArrayFromJSON(list_struct_type, R"([[{"a": 5}, {"a": 6}], [{"a": 7}, null]])");
auto b = ArrayFromJSON(list_struct_type, R"([[{"a": 7}, null]])");
EXPECT_OK_AND_ASSIGN(auto a0, a->GetScalar(0));
EXPECT_OK_AND_ASSIGN(auto a1, a->GetScalar(1));
EXPECT_OK_AND_ASSIGN(auto b0, b->GetScalar(0));
ASSERT_EQ(a1->hash(), b0->hash());
Comment thread
felipecrv marked this conversation as resolved.
ASSERT_NE(a0->hash(), b0->hash());
}

protected:
Expand Down
86 changes: 86 additions & 0 deletions cpp/src/arrow/util/hashing.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "arrow/util/hashing.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_reader.h"
#include "arrow/util/macros.h"

namespace arrow {
namespace internal {

namespace {

/// \brief A hash function for bitmaps that can handle offsets and lengths in
/// terms of number of bits. The hash only depends on the bits actually hashed.
///
/// This implementation is based on 64-bit versions of MurmurHash2 by Austin Appleby.
///
/// It's the caller's responsibility to ensure that bits_offset + num_bits are
/// readable from the bitmap.
///
/// \param key The pointer to the bitmap.
/// \param seed The seed for the hash function (useful when chaining hash functions).
/// \param bits_offset The offset in bits relative to the start of the bitmap.
/// \param num_bits The number of bits after the offset to be hashed.
uint64_t MurmurHashBitmap64(const uint8_t* key, uint64_t seed, uint64_t bits_offset,
                            uint64_t num_bits) {
  // MurmurHash2 (64-bit variant, Austin Appleby) constants.
  constexpr uint64_t kMul = 0xc6a4a7935bd1e995LLU;
  constexpr int kShift = 47;

  // Mixing the bit length into the seed makes the hash length-dependent.
  uint64_t hash = seed ^ (num_bits * kMul);

  // The word reader presents the bitmap as aligned 64-bit words regardless of
  // `bits_offset`, so the hash only depends on the bits actually covered.
  BitmapWordReader<uint64_t> reader(key, bits_offset, num_bits);
  for (auto remaining = reader.words(); remaining > 0; --remaining) {
    uint64_t word = reader.NextWord();
    word *= kMul;
    word ^= word >> kShift;
    word *= kMul;

    hash ^= word;
    hash *= kMul;
  }

  // Fold any trailing partial bytes into a single word and mix it in.
  auto nbytes = reader.trailing_bytes();
  if (nbytes) {
    int valid_bits;
    uint64_t tail = 0;
    for (; nbytes > 0; --nbytes) {
      const auto byte = reader.NextTrailingByte(valid_bits);
      tail = (tail << 8) | static_cast<uint64_t>(byte);
    }
    hash ^= tail;
    hash *= kMul;
  }

  // Final avalanche.
  hash ^= hash >> kShift;
  hash *= kMul;
  hash ^= hash >> kShift;
  return hash;
}

} // namespace

hash_t ComputeBitmapHash(const uint8_t* bitmap, hash_t seed, int64_t bits_offset,
                         int64_t num_bits) {
  // Preconditions only; the caller guarantees that
  // (bits_offset + num_bits + 7) / 8 bytes are readable from `bitmap`.
  DCHECK_GE(num_bits, 0);
  DCHECK_GE(bits_offset, 0);
  const uint64_t h = MurmurHashBitmap64(bitmap, seed, static_cast<uint64_t>(bits_offset),
                                        static_cast<uint64_t>(num_bits));
  return h;
}

} // namespace internal
} // namespace arrow
17 changes: 17 additions & 0 deletions cpp/src/arrow/util/hashing.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,23 @@ typedef uint64_t hash_t;
template <uint64_t AlgNum>
inline hash_t ComputeStringHash(const void* data, int64_t length);

/// \brief A hash function for bitmaps that can handle offsets and lengths in
/// terms of number of bits. The hash only depends on the bits actually hashed.
///
/// It's the caller's responsibility to ensure that bits_offset + num_bits are
/// readable from the bitmap.
///
/// \pre bits_offset >= 0
/// \pre num_bits >= 0
/// \pre (bits_offset + num_bits + 7) / 8 <= readable length in bytes from bitmap
///
/// \param bitmap The pointer to the bitmap.
/// \param seed The seed for the hash function (useful when chaining hash functions).
/// \param bits_offset The offset in bits relative to the start of the bitmap.
/// \param num_bits The number of bits after the offset to be hashed.
ARROW_EXPORT hash_t ComputeBitmapHash(const uint8_t* bitmap, hash_t seed,
int64_t bits_offset, int64_t num_bits);

template <typename Scalar, uint64_t AlgNum>
struct ScalarHelperBase {
static bool CompareScalars(Scalar u, Scalar v) { return u == v; }
Expand Down
114 changes: 114 additions & 0 deletions cpp/src/arrow/util/hashing_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include "arrow/array/builder_primitive.h"
#include "arrow/array/concatenate.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/hashing.h"
Expand Down Expand Up @@ -486,5 +488,117 @@ TEST(BinaryMemoTable, Empty) {
EXPECT_EQ(offsets[0], 0);
}

// Hash the *values* bitmap (buffer 1) of a boolean array, honoring the
// array's own offset and length.
hash_t HashDataBitmap(const ArraySpan& array) {
  EXPECT_EQ(array.type->id(), Type::BOOL);
  const auto& bitmap = array.buffers[1];
  // NOTE: argument comment fixed to match the declared parameter name
  // `bits_offset` in ComputeBitmapHash (bugprone-argument-comment).
  return ComputeBitmapHash(bitmap.data,
                           /*seed=*/0,
                           /*bits_offset=*/array.offset,
                           /*num_bits=*/array.length);
}

// Builds a boolean array of `len` alternating values whose first element is
// `start` (e.g. start=true -> [true, false, true, ...]).
// This could be memoized in the future to speed up tests.
std::shared_ptr<BooleanArray> BuildBooleanArray(int len, bool start) {
  BooleanBuilder builder;
  bool next_value = start;
  for (int i = 0; i < len; ++i) {
    EXPECT_TRUE(builder.Append(next_value).ok());
    next_value = !next_value;
  }
  std::shared_ptr<BooleanArray> result;
  EXPECT_TRUE(builder.Finish(&result).ok());
  return result;
}

// Concatenates `arrays` (all BOOL) and hashes the data bitmap of the result.
// When both `bits_offset` and `num_bits` are given (!= -1), only that slice
// of the concatenation is hashed.
hash_t HashConcatenation(const ArrayVector& arrays, int64_t bits_offset = -1,
                         int64_t num_bits = -1) {
  EXPECT_OK_AND_ASSIGN(auto combined, Concatenate(arrays));
  EXPECT_EQ(combined->type()->id(), Type::BOOL);
  const bool hash_whole_array = (bits_offset == -1 || num_bits == -1);
  if (!hash_whole_array) {
    auto sliced = combined->Slice(bits_offset, num_bits);
    return HashDataBitmap(*sliced->data());
  }
  return HashDataBitmap(*combined->data());
}

TEST(BitmapHashTest, SmallInputs) {
  // Hashing a slice of a bitmap must produce the same value as hashing an
  // aligned (offset == 0) array containing the same bits.
  for (bool start : {false, true}) {
    // A 64-bit block of alternating booleans beginning with `start`.
    auto block = BuildBooleanArray(64, start);
    for (int len = 0; len < 64; len++) {
      // Aligned reference array equal to the first `len` bits of the block.
      auto prefix = BuildBooleanArray(len, start);
      auto expected_hash = HashDataBitmap(*prefix->data());

      auto slice = block->Slice(0, len);
      auto slice_hash = HashDataBitmap(*slice->data());
      ASSERT_EQ(expected_hash, slice_hash);

      // Unaligned slices: block[j, len) must hash like an aligned array of
      // the same bits. Skipping j bits of an alternating sequence flips the
      // starting value when j is odd, hence `start ^ (j % 2 != 0)`.
      for (int j = 1; j < len; j++) {
        auto fragment = BuildBooleanArray(len - j, start ^ (j % 2 != 0));
        expected_hash = HashDataBitmap(*fragment->data());

        slice = block->Slice(j, len - j);
        slice_hash = HashDataBitmap(*slice->data());
        ASSERT_EQ(expected_hash, slice_hash);
      }
    }
  }
}

TEST(BitmapHashTest, LongerInputs) {
BooleanBuilder builder;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test looks a bit complicated. Would it simplify to:

  • generate random boolean arrays
  • slice them and check hashing the slice vs. hashing an aligned copy of the buffer?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When one of these tests fail, it's easier to see which part of the function is not handling input correctly. Random arrays might not necessarily cover all the range checks.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And the 2,3,5... block was easy to spot in the debugger to see what was going on by printing the words in binary.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As you prefer, though it would be nice to find a way to make the test more compact and easier to read.

std::shared_ptr<BooleanArray> block_of_bools;
{
ASSERT_OK(builder.AppendValues(2, true));
ASSERT_OK(builder.AppendValues(3, false));
ASSERT_OK(builder.AppendValues(5, true));
ASSERT_OK(builder.AppendValues(7, false));
ASSERT_OK(builder.AppendValues(11, true));
ASSERT_OK(builder.AppendValues(13, false));
ASSERT_OK(builder.AppendValues(17, true));
ASSERT_OK(builder.AppendValues(5, false));
ASSERT_OK(builder.AppendValues(1, true));
ASSERT_OK(builder.Finish(&block_of_bools));
ASSERT_EQ(block_of_bools->length(), 64);
}
const auto hash_of_block = HashDataBitmap(*block_of_bools->data());

const auto kStep = 13;
constexpr auto kMaxPadding = 64 + 32 + kStep;

for (int prefix_pad_len = 0; prefix_pad_len < kMaxPadding; prefix_pad_len += kStep) {
auto prefix_pad = BuildBooleanArray(prefix_pad_len, true);
for (int suffix_pad_len = 0; suffix_pad_len < kMaxPadding; suffix_pad_len += kStep) {
auto suffix_pad = BuildBooleanArray(suffix_pad_len, true);

// A block of 64 bools in the middle
auto hash =
HashConcatenation({prefix_pad, block_of_bools, suffix_pad}, prefix_pad_len, 64);
ASSERT_EQ(hash, hash_of_block);

// Trailing bits and leading bits around a block
for (int trailing_len = 1; trailing_len < kMaxPadding; trailing_len += kStep) {
auto trailing = BuildBooleanArray(trailing_len, true);
auto expected_hash = HashConcatenation({block_of_bools, trailing});
auto hash = HashConcatenation({prefix_pad, block_of_bools, trailing, suffix_pad},
prefix_pad_len, 64 + trailing_len);
ASSERT_EQ(hash, expected_hash);

// Use the trailing bits as leading bits now
auto leading = trailing;
auto leading_len = trailing_len;
expected_hash = HashConcatenation({leading, block_of_bools});
hash = HashConcatenation({prefix_pad, leading, block_of_bools, suffix_pad},
prefix_pad_len, leading_len + 64);
ASSERT_EQ(hash, expected_hash);

// Leading and trailing at the same time
expected_hash = HashConcatenation({leading, block_of_bools, trailing});
hash =
HashConcatenation({prefix_pad, leading, block_of_bools, trailing, suffix_pad},
prefix_pad_len, leading_len + 64 + trailing_len);
ASSERT_EQ(hash, expected_hash);
}
}
}
}

} // namespace internal
} // namespace arrow
9 changes: 9 additions & 0 deletions python/pyarrow/tests/test_scalars.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,15 @@ def test_hashing():
assert len(set_from_array) == 500


def test_hashing_struct_scalar():
    # GH-35360: the hash of a scalar taken from a list-of-structs array must
    # account for the parent array's offset, so equivalent values sliced from
    # different positions hash equally.
    a = pa.array([[{'a': 5}, {'a': 6}], [{'a': 7}, None]])
    b = pa.array([[{'a': 7}, None]])
    hash1 = hash(a[1])
    hash2 = hash(b[0])
    assert hash1 == hash2
    # Mirrors the C++ scalar test: distinct values should hash differently.
    assert hash(a[0]) != hash(b[0])


def test_bool():
false = pa.scalar(False)
true = pa.scalar(True)
Expand Down