diff --git a/cpp/CMakePresets.json b/cpp/CMakePresets.json index c9e2444389f4..0c3f85d0917f 100644 --- a/cpp/CMakePresets.json +++ b/cpp/CMakePresets.json @@ -444,7 +444,8 @@ "CMAKE_CXX_COMPILER": "clang++", "ARROW_IPC": "ON", "ARROW_PARQUET": "ON", - "ARROW_FUZZING": "ON" + "ARROW_FUZZING": "ON", + "ARROW_WITH_SNAPPY": "ON" } }, { diff --git a/cpp/src/arrow/util/rle_encoding_internal.h b/cpp/src/arrow/util/rle_encoding_internal.h index c231c9a63ebb..a7917483bbc9 100644 --- a/cpp/src/arrow/util/rle_encoding_internal.h +++ b/cpp/src/arrow/util/rle_encoding_internal.h @@ -657,13 +657,14 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const const auto header_bytes = bit_util::ParseLeadingLEB128(data_, kMaxSize, &run_len_type); if (ARROW_PREDICT_FALSE(header_bytes == 0)) { - // Malfomrmed LEB128 data + // Malformed LEB128 data return {0, ControlFlow::Break}; } const bool is_bit_packed = run_len_type & 1; const uint32_t count = run_len_type >> 1; if (is_bit_packed) { + // Bit-packed run constexpr auto kMaxCount = bit_util::CeilDiv(internal::max_size_for_v, 8); if (ARROW_PREDICT_FALSE(count == 0 || count > kMaxCount)) { // Illegal number of encoded values @@ -672,17 +673,21 @@ auto RleBitPackedParser::PeekImpl(Handler&& handler) const ARROW_DCHECK_LT(static_cast(count) * 8, internal::max_size_for_v); + // Count Already divided by 8 for byte size calculations + const auto bytes_read = header_bytes + static_cast(count) * value_bit_width_; + if (ARROW_PREDICT_FALSE(bytes_read > data_size_)) { + // Bit-packed run would overflow data buffer + return {0, ControlFlow::Break}; + } const auto values_count = static_cast(count * 8); - // Count Already divided by 8 - const auto bytes_read = - header_bytes + static_cast(count) * value_bit_width_; auto control = handler.OnBitPackedRun( BitPackedRun(data_ + header_bytes, values_count, value_bit_width_)); - return {bytes_read, control}; + return {static_cast(bytes_read), control}; } + // RLE run if (ARROW_PREDICT_FALSE(count == 0)) { // Illegal number of encoded values return {0, ControlFlow::Break}; @@ -1079,7 +1084,6 @@ auto RleBitPackedDecoder::GetSpaced(Converter converter, // There may be remaining null if they are not greedily filled by either decoder calls check_and_handle_fully_null_remaining(); - ARROW_DCHECK(batch.is_done() || exhausted()); return batch.total_read(); } diff --git a/cpp/src/parquet/decoder.cc b/cpp/src/parquet/decoder.cc index 46d1c201e927..b6d79665621f 100644 --- a/cpp/src/parquet/decoder.cc +++ b/cpp/src/parquet/decoder.cc @@ -2082,9 +2082,12 @@ class DeltaByteArrayDecoderImpl : public TypedDecoderImpl { int64_t valid_bits_offset, typename EncodingTraits::Accumulator* out, int* out_num_values) { - std::vector values(num_values); + std::vector values(num_values - null_count); const int num_valid_values = GetInternal(values.data(), num_values - null_count); - DCHECK_EQ(num_values - null_count, num_valid_values); + if (ARROW_PREDICT_FALSE(num_values - null_count != num_valid_values)) { + throw ParquetException("Expected to decode ", num_values - null_count, + " values, but decoded ", num_valid_values, " values."); + } auto visit_binary_helper = [&](auto* helper) { auto values_ptr = reinterpret_cast(values.data()); diff --git a/testing b/testing index 6a7b02fac93d..abf6d7ebde7a 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit 6a7b02fac93d8addbcdbb213264e58bfdc3068e4 +Subproject commit abf6d7ebde7ab70b541c51859dad2bef71a0151e