Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,7 @@ set(ARROW_SRCS
util/delimiting.cc
util/formatting.cc
util/future.cc
util/hashing.cc
util/int_util.cc
util/io_util.cc
util/logging.cc
Expand Down
43 changes: 37 additions & 6 deletions cpp/src/arrow/scalar.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include "arrow/compare.h"
#include "arrow/pretty_print.h"
#include "arrow/type.h"
#include "arrow/util/bitmap_ops.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/decimal.h"
#include "arrow/util/formatting.h"
Expand Down Expand Up @@ -151,20 +152,50 @@ struct ScalarHashImpl {

Status ArrayHash(const Array& a) { return ArrayHash(*a.data()); }

Status ArrayHash(const ArrayData& a) {
RETURN_NOT_OK(StdHash(a.length) & StdHash(a.GetNullCount()));
if (a.GetNullCount() != 0 && a.buffers[0] != nullptr) {
Status ArrayHash(const ArraySpan& a, int64_t offset, int64_t length) {
// Calculate null count within the range
const auto* validity = a.buffers[0].data;
int64_t null_count = 0;
if (validity != NULLPTR) {
if (offset == a.offset && length == a.length) {
null_count = a.GetNullCount();
} else {
null_count = length - internal::CountSetBits(validity, offset, length);
}
}

RETURN_NOT_OK(StdHash(length) & StdHash(null_count));
if (null_count != 0) {
// We can't visit values without unboxing the whole array, so only hash
// the null bitmap for now. Only hash the null bitmap if the null count
// is not 0 to ensure hash consistency.
RETURN_NOT_OK(BufferHash(*a.buffers[0]));
hash_ = internal::ComputeBitmapHash(validity, /*seed=*/hash_,
/*bits_offset=*/offset, /*num_bits=*/length);
}
for (const auto& child : a.child_data) {
RETURN_NOT_OK(ArrayHash(*child));

// Hash the relevant child arrays for each type taking offset and length
// from the parent array into account if necessary.
switch (a.type->id()) {
case Type::STRUCT:
for (const auto& child : a.child_data) {
RETURN_NOT_OK(ArrayHash(child, offset, length));
}
break;
// TODO(GH-35830): Investigate what should be the correct behavior for
// each nested type.
default:
// By default, just hash the arrays without considering
// the offset and length of the parent.
for (const auto& child : a.child_data) {
RETURN_NOT_OK(ArrayHash(child));
}
break;
}
return Status::OK();
}

Status ArrayHash(const ArraySpan& a) { return ArrayHash(a, a.offset, a.length); }

explicit ScalarHashImpl(const Scalar& scalar) : hash_(scalar.type->Hash()) {
AccumulateHashFrom(scalar);
}
Expand Down
12 changes: 12 additions & 0 deletions cpp/src/arrow/scalar_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1121,6 +1121,18 @@ class TestListScalar : public ::testing::Test {
ASSERT_NE(set_bitmap_scalar->value->data()->buffers[0], nullptr);
// ... yet it's hashing equal to the other scalar
ASSERT_EQ(empty_bitmap_scalar.hash(), set_bitmap_scalar->hash());

// GH-35360: the hash value of a scalar from a list of structs should
// pay attention to the offset so it hashes the equivalent validity bitmap
auto list_struct_type = list(struct_({field("a", int64())}));
auto a =
ArrayFromJSON(list_struct_type, R"([[{"a": 5}, {"a": 6}], [{"a": 7}, null]])");
auto b = ArrayFromJSON(list_struct_type, R"([[{"a": 7}, null]])");
EXPECT_OK_AND_ASSIGN(auto a0, a->GetScalar(0));
EXPECT_OK_AND_ASSIGN(auto a1, a->GetScalar(1));
EXPECT_OK_AND_ASSIGN(auto b0, b->GetScalar(0));
ASSERT_EQ(a1->hash(), b0->hash());
Comment thread
felipecrv marked this conversation as resolved.
ASSERT_NE(a0->hash(), b0->hash());
}

protected:
Expand Down
86 changes: 86 additions & 0 deletions cpp/src/arrow/util/hashing.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "arrow/util/hashing.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/bitmap_reader.h"
#include "arrow/util/macros.h"

namespace arrow {
namespace internal {

namespace {

/// \brief A hash function for bitmaps that can handle offsets and lengths in
/// terms of number of bits. The hash only depends on the bits actually hashed.
///
/// This implementation is based on 64-bit versions of MurmurHash2 by Austin Appleby.
///
/// It's the caller's responsibility to ensure that bits_offset + num_bits are
/// readable from the bitmap.
///
/// \param key The pointer to the bitmap.
/// \param seed The seed for the hash function (useful when chaining hash functions).
/// \param bits_offset The offset in bits relative to the start of the bitmap.
/// \param num_bits The number of bits after the offset to be hashed.
uint64_t MurmurHashBitmap64(const uint8_t* key, uint64_t seed, uint64_t bits_offset,
                            uint64_t num_bits) {
  // MurmurHash2 (64-bit variant, Austin Appleby) constants.
  constexpr uint64_t kMul = 0xc6a4a7935bd1e995LLU;
  constexpr int kShift = 47;

  // Mixing the bit length into the seed makes the hash length-dependent.
  uint64_t hash = seed ^ (num_bits * kMul);

  // The word reader presents the bitmap as aligned 64-bit words regardless of
  // `bits_offset`, so the hash only depends on the bits actually covered.
  BitmapWordReader<uint64_t> reader(key, bits_offset, num_bits);
  for (auto remaining = reader.words(); remaining > 0; --remaining) {
    uint64_t word = reader.NextWord();
    word *= kMul;
    word ^= word >> kShift;
    word *= kMul;

    hash ^= word;
    hash *= kMul;
  }

  // Fold any trailing partial bytes into a single word and mix it in.
  auto nbytes = reader.trailing_bytes();
  if (nbytes) {
    int valid_bits;
    uint64_t tail = 0;
    for (; nbytes > 0; --nbytes) {
      const auto byte = reader.NextTrailingByte(valid_bits);
      tail = (tail << 8) | static_cast<uint64_t>(byte);
    }
    hash ^= tail;
    hash *= kMul;
  }

  // Final avalanche.
  hash ^= hash >> kShift;
  hash *= kMul;
  hash ^= hash >> kShift;
  return hash;
}

} // namespace

hash_t ComputeBitmapHash(const uint8_t* bitmap, hash_t seed, int64_t bits_offset,
                         int64_t num_bits) {
  // Preconditions only; the caller guarantees that
  // (bits_offset + num_bits + 7) / 8 bytes are readable from `bitmap`.
  DCHECK_GE(num_bits, 0);
  DCHECK_GE(bits_offset, 0);
  const uint64_t h = MurmurHashBitmap64(bitmap, seed, static_cast<uint64_t>(bits_offset),
                                        static_cast<uint64_t>(num_bits));
  return h;
}

} // namespace internal
} // namespace arrow
17 changes: 17 additions & 0 deletions cpp/src/arrow/util/hashing.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,23 @@ typedef uint64_t hash_t;
template <uint64_t AlgNum>
inline hash_t ComputeStringHash(const void* data, int64_t length);

/// \brief A hash function for bitmaps that can handle offsets and lengths in
/// terms of number of bits. The hash only depends on the bits actually hashed.
///
/// It's the caller's responsibility to ensure that bits_offset + num_bits are
/// readable from the bitmap.
///
/// \pre bits_offset >= 0
/// \pre num_bits >= 0
/// \pre (bits_offset + num_bits + 7) / 8 <= readable length in bytes from bitmap
///
/// \param bitmap The pointer to the bitmap.
/// \param seed The seed for the hash function (useful when chaining hash functions).
/// \param bits_offset The offset in bits relative to the start of the bitmap.
/// \param num_bits The number of bits after the offset to be hashed.
ARROW_EXPORT hash_t ComputeBitmapHash(const uint8_t* bitmap, hash_t seed,
int64_t bits_offset, int64_t num_bits);

template <typename Scalar, uint64_t AlgNum>
struct ScalarHelperBase {
static bool CompareScalars(Scalar u, Scalar v) { return u == v; }
Expand Down
114 changes: 114 additions & 0 deletions cpp/src/arrow/util/hashing_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include "arrow/array/builder_primitive.h"
#include "arrow/array/concatenate.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/util/bit_util.h"
#include "arrow/util/hashing.h"
Expand Down Expand Up @@ -486,5 +488,117 @@ TEST(BinaryMemoTable, Empty) {
EXPECT_EQ(offsets[0], 0);
}

// Hash the *values* bitmap (buffer 1) of a boolean array, honoring the
// array's own offset and length.
hash_t HashDataBitmap(const ArraySpan& array) {
  EXPECT_EQ(array.type->id(), Type::BOOL);
  const auto& bitmap = array.buffers[1];
  // NOTE: argument comment fixed to match the declared parameter name
  // `bits_offset` in ComputeBitmapHash (bugprone-argument-comment).
  return ComputeBitmapHash(bitmap.data,
                           /*seed=*/0,
                           /*bits_offset=*/array.offset,
                           /*num_bits=*/array.length);
}

// Builds a boolean array of `len` alternating values whose first element is
// `start` (e.g. start=true -> [true, false, true, ...]).
// This could be memoized in the future to speed up tests.
std::shared_ptr<BooleanArray> BuildBooleanArray(int len, bool start) {
  BooleanBuilder builder;
  bool next_value = start;
  for (int i = 0; i < len; ++i) {
    EXPECT_TRUE(builder.Append(next_value).ok());
    next_value = !next_value;
  }
  std::shared_ptr<BooleanArray> result;
  EXPECT_TRUE(builder.Finish(&result).ok());
  return result;
}

// Concatenates `arrays` (all BOOL) and hashes the data bitmap of the result.
// When both `bits_offset` and `num_bits` are given (!= -1), only that slice
// of the concatenation is hashed.
hash_t HashConcatenation(const ArrayVector& arrays, int64_t bits_offset = -1,
                         int64_t num_bits = -1) {
  EXPECT_OK_AND_ASSIGN(auto combined, Concatenate(arrays));
  EXPECT_EQ(combined->type()->id(), Type::BOOL);
  const bool hash_whole_array = (bits_offset == -1 || num_bits == -1);
  if (!hash_whole_array) {
    auto sliced = combined->Slice(bits_offset, num_bits);
    return HashDataBitmap(*sliced->data());
  }
  return HashDataBitmap(*combined->data());
}

TEST(BitmapHashTest, SmallInputs) {
  // Hashing a slice of a bitmap must produce the same value as hashing an
  // aligned (offset == 0) array containing the same bits.
  for (bool start : {false, true}) {
    // A 64-bit block of alternating booleans beginning with `start`.
    auto block = BuildBooleanArray(64, start);
    for (int len = 0; len < 64; len++) {
      // Aligned reference array equal to the first `len` bits of the block.
      auto prefix = BuildBooleanArray(len, start);
      auto expected_hash = HashDataBitmap(*prefix->data());

      auto slice = block->Slice(0, len);
      auto slice_hash = HashDataBitmap(*slice->data());
      ASSERT_EQ(expected_hash, slice_hash);

      // Unaligned slices: block[j, len) must hash like an aligned array of
      // the same bits. Skipping j bits of an alternating sequence flips the
      // starting value when j is odd, hence `start ^ (j % 2 != 0)`.
      for (int j = 1; j < len; j++) {
        auto fragment = BuildBooleanArray(len - j, start ^ (j % 2 != 0));
        expected_hash = HashDataBitmap(*fragment->data());

        slice = block->Slice(j, len - j);
        slice_hash = HashDataBitmap(*slice->data());
        ASSERT_EQ(expected_hash, slice_hash);
      }
    }
  }
}

TEST(BitmapHashTest, LongerInputs) {
BooleanBuilder builder;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test looks a bit complicated. Would it simplify to:

  • generate random boolean arrays
  • slice them and check hashing the slice vs. hashing an aligned copy of the buffer?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When one of these tests fail, it's easier to see which part of the function is not handling input correctly. Random arrays might not necessarily cover all the range checks.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And the 2,3,5... block was easy to spot in the debugger to see what was going on by printing the words in binary.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As you prefer, though it would be nice to find a way to make the test more compact and easier to read.

std::shared_ptr<BooleanArray> block_of_bools;
{
ASSERT_OK(builder.AppendValues(2, true));
ASSERT_OK(builder.AppendValues(3, false));
ASSERT_OK(builder.AppendValues(5, true));
ASSERT_OK(builder.AppendValues(7, false));
ASSERT_OK(builder.AppendValues(11, true));
ASSERT_OK(builder.AppendValues(13, false));
ASSERT_OK(builder.AppendValues(17, true));
ASSERT_OK(builder.AppendValues(5, false));
ASSERT_OK(builder.AppendValues(1, true));
ASSERT_OK(builder.Finish(&block_of_bools));
ASSERT_EQ(block_of_bools->length(), 64);
}
const auto hash_of_block = HashDataBitmap(*block_of_bools->data());

const auto kStep = 13;
constexpr auto kMaxPadding = 64 + 32 + kStep;

for (int prefix_pad_len = 0; prefix_pad_len < kMaxPadding; prefix_pad_len += kStep) {
auto prefix_pad = BuildBooleanArray(prefix_pad_len, true);
for (int suffix_pad_len = 0; suffix_pad_len < kMaxPadding; suffix_pad_len += kStep) {
auto suffix_pad = BuildBooleanArray(suffix_pad_len, true);

// A block of 64 bools in the middle
auto hash =
HashConcatenation({prefix_pad, block_of_bools, suffix_pad}, prefix_pad_len, 64);
ASSERT_EQ(hash, hash_of_block);

// Trailing bits and leading bits around a block
for (int trailing_len = 1; trailing_len < kMaxPadding; trailing_len += kStep) {
auto trailing = BuildBooleanArray(trailing_len, true);
auto expected_hash = HashConcatenation({block_of_bools, trailing});
auto hash = HashConcatenation({prefix_pad, block_of_bools, trailing, suffix_pad},
prefix_pad_len, 64 + trailing_len);
ASSERT_EQ(hash, expected_hash);

// Use the trailing bits as leading bits now
auto leading = trailing;
auto leading_len = trailing_len;
expected_hash = HashConcatenation({leading, block_of_bools});
hash = HashConcatenation({prefix_pad, leading, block_of_bools, suffix_pad},
prefix_pad_len, leading_len + 64);
ASSERT_EQ(hash, expected_hash);

// Leading and trailing at the same time
expected_hash = HashConcatenation({leading, block_of_bools, trailing});
hash =
HashConcatenation({prefix_pad, leading, block_of_bools, trailing, suffix_pad},
prefix_pad_len, leading_len + 64 + trailing_len);
ASSERT_EQ(hash, expected_hash);
}
}
}
}

} // namespace internal
} // namespace arrow
9 changes: 9 additions & 0 deletions python/pyarrow/tests/test_scalars.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,15 @@ def test_hashing():
assert len(set_from_array) == 500


def test_hashing_struct_scalar():
    # GH-35360: the hash of a scalar taken from a list-of-structs array must
    # account for the parent array's offset, so equivalent values sliced from
    # different positions hash equally.
    a = pa.array([[{'a': 5}, {'a': 6}], [{'a': 7}, None]])
    b = pa.array([[{'a': 7}, None]])
    hash1 = hash(a[1])
    hash2 = hash(b[0])
    assert hash1 == hash2
    # Mirrors the C++ scalar test: distinct values should hash differently.
    assert hash(a[0]) != hash(b[0])


def test_bool():
false = pa.scalar(False)
true = pa.scalar(True)
Expand Down