-
Notifications
You must be signed in to change notification settings - Fork 4.1k
GH-35360: [C++] Take offset into account in ScalarHashImpl::ArrayHash() #35814
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
29be7f2
9f5758f
20626cf
b18c09e
b6d6671
b7c9216
b797450
0e7546c
bee4183
1ea600f
6b29a35
1f852e3
9ee3d1f
d2c4a07
7e84649
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,86 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| #include "arrow/util/hashing.h" | ||
| #include "arrow/util/bit_util.h" | ||
| #include "arrow/util/bitmap_reader.h" | ||
| #include "arrow/util/macros.h" | ||
|
|
||
| namespace arrow { | ||
| namespace internal { | ||
|
|
||
| namespace { | ||
|
|
||
| /// \brief A hash function for bitmaps that can handle offsets and lengths in | ||
| /// terms of number of bits. The hash only depends on the bits actually hashed. | ||
| /// | ||
| /// This implementation is based on 64-bit versions of MurmurHash2 by Austin Appleby. | ||
| /// | ||
| /// It's the caller's responsibility to ensure that bits_offset + num_bits are | ||
| /// readable from the bitmap. | ||
| /// | ||
| /// \param key The pointer to the bitmap. | ||
| /// \param seed The seed for the hash function (useful when chaining hash functions). | ||
| /// \param bits_offset The offset in bits relative to the start of the bitmap. | ||
| /// \param num_bits The number of bits after the offset to be hashed. | ||
| uint64_t MurmurHashBitmap64(const uint8_t* key, uint64_t seed, uint64_t bits_offset, | ||
| uint64_t num_bits) { | ||
| const uint64_t m = 0xc6a4a7935bd1e995LLU; | ||
| const int r = 47; | ||
|
|
||
| uint64_t h = seed ^ (num_bits * m); | ||
|
|
||
| BitmapWordReader<uint64_t> reader(key, bits_offset, num_bits); | ||
| auto nwords = reader.words(); | ||
| while (nwords--) { | ||
| auto k = reader.NextWord(); | ||
| k *= m; | ||
| k ^= k >> r; | ||
| k *= m; | ||
|
|
||
| h ^= k; | ||
| h *= m; | ||
| } | ||
| int valid_bits; | ||
| auto nbytes = reader.trailing_bytes(); | ||
| if (nbytes) { | ||
| uint64_t k = 0; | ||
| do { | ||
| auto byte = reader.NextTrailingByte(valid_bits); | ||
| k = (k << 8) | static_cast<uint64_t>(byte); | ||
| } while (--nbytes); | ||
| h ^= k; | ||
| h *= m; | ||
| } | ||
|
|
||
| h ^= h >> r; | ||
| h *= m; | ||
| h ^= h >> r; | ||
| return h; | ||
| } | ||
|
|
||
| } // namespace | ||
|
|
||
| hash_t ComputeBitmapHash(const uint8_t* bitmap, hash_t seed, int64_t bits_offset, | ||
| int64_t num_bits) { | ||
| DCHECK_GE(bits_offset, 0); | ||
| DCHECK_GE(num_bits, 0); | ||
| return MurmurHashBitmap64(bitmap, seed, bits_offset, num_bits); | ||
| } | ||
|
|
||
| } // namespace internal | ||
| } // namespace arrow |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,6 +28,8 @@ | |
| #include <gmock/gmock.h> | ||
| #include <gtest/gtest.h> | ||
|
|
||
| #include "arrow/array/builder_primitive.h" | ||
| #include "arrow/array/concatenate.h" | ||
| #include "arrow/testing/gtest_util.h" | ||
| #include "arrow/util/bit_util.h" | ||
| #include "arrow/util/hashing.h" | ||
|
|
@@ -486,5 +488,117 @@ TEST(BinaryMemoTable, Empty) { | |
| EXPECT_EQ(offsets[0], 0); | ||
| } | ||
|
|
||
| hash_t HashDataBitmap(const ArraySpan& array) { | ||
| EXPECT_EQ(array.type->id(), Type::BOOL); | ||
| const auto& bitmap = array.buffers[1]; | ||
| return ComputeBitmapHash(bitmap.data, | ||
| /*seed=*/0, | ||
| /*bit_offset=*/array.offset, | ||
| /*num_bits=*/array.length); | ||
| } | ||
|
|
||
| std::shared_ptr<BooleanArray> BuildBooleanArray(int len, bool start) { | ||
| // This could be memoized in the future to speed up tests. | ||
| BooleanBuilder builder; | ||
| for (int i = 0; i < len; ++i) { | ||
| EXPECT_TRUE(builder.Append(((i % 2 != 0) ^ start) == 1).ok()); | ||
| } | ||
| std::shared_ptr<BooleanArray> array; | ||
| EXPECT_TRUE(builder.Finish(&array).ok()); | ||
| return array; | ||
| } | ||
|
|
||
| hash_t HashConcatenation(const ArrayVector& arrays, int64_t bits_offset = -1, | ||
| int64_t num_bits = -1) { | ||
| EXPECT_OK_AND_ASSIGN(auto concat, Concatenate(arrays)); | ||
| EXPECT_EQ(concat->type()->id(), Type::BOOL); | ||
| if (bits_offset == -1 || num_bits == -1) { | ||
| return HashDataBitmap(*concat->data()); | ||
| } | ||
| auto slice = concat->Slice(bits_offset, num_bits); | ||
| return HashDataBitmap(*slice->data()); | ||
| } | ||
|
|
||
| TEST(BitmapHashTest, SmallInputs) { | ||
| for (bool start : {false, true}) { | ||
| auto block = BuildBooleanArray(64, start); | ||
| for (int len = 0; len < 64; len++) { | ||
| auto prefix = BuildBooleanArray(len, start); | ||
| auto expected_hash = HashDataBitmap(*prefix->data()); | ||
|
|
||
| auto slice = block->Slice(0, len); | ||
| auto slice_hash = HashDataBitmap(*slice->data()); | ||
| ASSERT_EQ(expected_hash, slice_hash); | ||
|
|
||
| for (int j = 1; j < len; j++) { | ||
| auto fragment = BuildBooleanArray(len - j, start ^ (j % 2 != 0)); | ||
| expected_hash = HashDataBitmap(*fragment->data()); | ||
|
|
||
| slice = block->Slice(j, len - j); | ||
| slice_hash = HashDataBitmap(*slice->data()); | ||
| ASSERT_EQ(expected_hash, slice_hash); | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| TEST(BitmapHashTest, LongerInputs) { | ||
| BooleanBuilder builder; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This test looks a bit complicated. Would it simplify to:
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When one of these tests fail, it's easier to see which part of the function is not handling input correctly. Random arrays might not necessarily cover all the range checks.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And the 2,3,5... block was easy to spot in the debugger to see what was going on by printing the words in binary.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As you prefer, though it would be nice to find a way to make the test more compact and easier to read. |
||
| std::shared_ptr<BooleanArray> block_of_bools; | ||
| { | ||
| ASSERT_OK(builder.AppendValues(2, true)); | ||
| ASSERT_OK(builder.AppendValues(3, false)); | ||
| ASSERT_OK(builder.AppendValues(5, true)); | ||
| ASSERT_OK(builder.AppendValues(7, false)); | ||
| ASSERT_OK(builder.AppendValues(11, true)); | ||
| ASSERT_OK(builder.AppendValues(13, false)); | ||
| ASSERT_OK(builder.AppendValues(17, true)); | ||
| ASSERT_OK(builder.AppendValues(5, false)); | ||
| ASSERT_OK(builder.AppendValues(1, true)); | ||
| ASSERT_OK(builder.Finish(&block_of_bools)); | ||
| ASSERT_EQ(block_of_bools->length(), 64); | ||
| } | ||
| const auto hash_of_block = HashDataBitmap(*block_of_bools->data()); | ||
|
|
||
| const auto kStep = 13; | ||
| constexpr auto kMaxPadding = 64 + 32 + kStep; | ||
|
|
||
| for (int prefix_pad_len = 0; prefix_pad_len < kMaxPadding; prefix_pad_len += kStep) { | ||
| auto prefix_pad = BuildBooleanArray(prefix_pad_len, true); | ||
| for (int suffix_pad_len = 0; suffix_pad_len < kMaxPadding; suffix_pad_len += kStep) { | ||
| auto suffix_pad = BuildBooleanArray(suffix_pad_len, true); | ||
|
|
||
| // A block of 64 bools in the middle | ||
| auto hash = | ||
| HashConcatenation({prefix_pad, block_of_bools, suffix_pad}, prefix_pad_len, 64); | ||
| ASSERT_EQ(hash, hash_of_block); | ||
|
|
||
| // Trailing bits and leading bits around a block | ||
| for (int trailing_len = 1; trailing_len < kMaxPadding; trailing_len += kStep) { | ||
| auto trailing = BuildBooleanArray(trailing_len, true); | ||
| auto expected_hash = HashConcatenation({block_of_bools, trailing}); | ||
| auto hash = HashConcatenation({prefix_pad, block_of_bools, trailing, suffix_pad}, | ||
| prefix_pad_len, 64 + trailing_len); | ||
| ASSERT_EQ(hash, expected_hash); | ||
|
|
||
| // Use the trailing bits as leading bits now | ||
| auto leading = trailing; | ||
| auto leading_len = trailing_len; | ||
| expected_hash = HashConcatenation({leading, block_of_bools}); | ||
| hash = HashConcatenation({prefix_pad, leading, block_of_bools, suffix_pad}, | ||
| prefix_pad_len, leading_len + 64); | ||
| ASSERT_EQ(hash, expected_hash); | ||
|
|
||
| // Leading and trailing at the same time | ||
| expected_hash = HashConcatenation({leading, block_of_bools, trailing}); | ||
| hash = | ||
| HashConcatenation({prefix_pad, leading, block_of_bools, trailing, suffix_pad}, | ||
| prefix_pad_len, leading_len + 64 + trailing_len); | ||
| ASSERT_EQ(hash, expected_hash); | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| } // namespace internal | ||
| } // namespace arrow | ||
Uh oh!
There was an error while loading. Please reload this page.