Skip to content

Commit 9d545fb

Browse files
authored
GH-49918: [C++][Parquet] Catch std::vector allocation errors in encoding fuzzer (#49919)
### Rationale for this change

The Parquet encoding fuzzer can allocate a `std::vector` of arbitrary size. This can produce OOMs in the fuzzer.

Issue found by OSS-Fuzz: https://issues.oss-fuzz.com/issues/506741109

### What changes are included in this PR?

1. Use `arrow::stl::allocator` to delegate `std::vector` allocations to the fuzzing memory pool
2. Catch any `std::vector` allocation exceptions and convert them to regular Status errors

### Are these changes tested?

Yes, by a new regression file.

### Are there any user-facing changes?

No.

* GitHub Issue: #49918

Authored-by: Antoine Pitrou <antoine@python.org>
Signed-off-by: Antoine Pitrou <antoine@python.org>
1 parent f69ccb0 commit 9d545fb

3 files changed

Lines changed: 47 additions & 18 deletions

File tree

cpp/src/arrow/stl_allocator.h

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,16 @@
3030
namespace arrow {
3131
namespace stl {
3232

33+
class BadAlloc : public std::bad_alloc {
34+
public:
35+
explicit BadAlloc(Status st) noexcept : st_(std::move(st)) {}
36+
37+
const char* what() const noexcept override { return st_.message().c_str(); }
38+
39+
protected:
40+
Status st_;
41+
};
42+
3343
/// \brief An STL allocator delegating allocations to an Arrow MemoryPool
3444
template <class T>
3545
class allocator {
@@ -50,7 +60,7 @@ class allocator {
5060
/// \brief Construct an allocator from the default MemoryPool
5161
allocator() noexcept : pool_(default_memory_pool()) {}
5262
/// \brief Construct an allocator from the given MemoryPool
53-
explicit allocator(MemoryPool* pool) noexcept : pool_(pool) {}
63+
allocator(MemoryPool* pool) noexcept : pool_(pool) {} // NOLINT: runtime/explicit
5464

5565
template <class U>
5666
allocator(const allocator<U>& rhs) noexcept : pool_(rhs.pool()) {}
@@ -64,7 +74,7 @@ class allocator {
6474
pointer allocate(size_type n, const void* /*hint*/ = NULLPTR) {
6575
uint8_t* data;
6676
Status s = pool_->Allocate(n * sizeof(T), &data);
67-
if (!s.ok()) throw std::bad_alloc();
77+
if (!s.ok()) throw BadAlloc(std::move(s));
6878
return reinterpret_cast<pointer>(data);
6979
}
7080

cpp/src/parquet/arrow/fuzz_encoding_internal.cc

Lines changed: 34 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <cstring>
2323
#include <functional>
2424
#include <limits>
25+
#include <new>
2526
#include <sstream>
2627
#include <string_view>
2728

@@ -32,6 +33,7 @@
3233
#include "arrow/compare.h"
3334
#include "arrow/io/memory.h"
3435
#include "arrow/pretty_print.h"
36+
#include "arrow/stl_allocator.h"
3537
#include "arrow/type.h"
3638
#include "arrow/util/fuzz_internal.h"
3739
#include "arrow/util/logging.h"
@@ -153,6 +155,19 @@ namespace {
153155
// Just to use std::vector<T> while avoiding std::vector<bool>
154156
using BooleanSlot = std::array<uint8_t, sizeof(bool)>;
155157

158+
template <typename T>
159+
using PoolAllocator = ::arrow::stl::allocator<T>;
160+
161+
template <typename T>
162+
using PoolVector = std::vector<T, PoolAllocator<T>>;
163+
164+
#define BEGIN_CATCH_BAD_ALLOC try {
165+
#define END_CATCH_BAD_ALLOC \
166+
} \
167+
catch (const std::bad_alloc& e) { \
168+
return Status::OutOfMemory(e.what()); \
169+
}
170+
156171
template <typename DType>
157172
struct TypedFuzzEncoding {
158173
static constexpr Type::type kType = DType::type_num;
@@ -176,9 +191,9 @@ struct TypedFuzzEncoding {
176191
// decoder's internal scratch space, which get invalidated on the
177192
// following decoder call. We circumvent the issue by executing a
178193
// functor on each decoded chunk before moving to the next one.
179-
Status RunOnDecodedChunks(Encoding::type encoding,
180-
std::span<const uint8_t> encoded_data, int chunk_size,
181-
std::function<Status(int offset, std::vector<c_type>)> func) {
194+
Status RunOnDecodedChunks(
195+
Encoding::type encoding, std::span<const uint8_t> encoded_data, int chunk_size,
196+
std::function<Status(int offset, std::span<const c_type>)> func) {
182197
BEGIN_PARQUET_CATCH_EXCEPTIONS
183198
int total_values = 0;
184199
auto decoder = MakeDecoder(encoding);
@@ -191,8 +206,10 @@ struct TypedFuzzEncoding {
191206
static_cast<int>(encoded_data.size()));
192207
while (total_values < num_values_) {
193208
const int read_size = std::min(num_values_ - total_values, chunk_size);
194-
// ARROW_ASSIGN_OR_RAISE(auto chunk_values, DecodeChunk(read_size));
195-
std::vector<c_type> chunk_values(read_size);
209+
PoolVector<c_type> chunk_values(pool());
210+
BEGIN_CATCH_BAD_ALLOC
211+
chunk_values.resize(read_size);
212+
END_CATCH_BAD_ALLOC
196213
int values_read;
197214
if constexpr (kType == Type::BOOLEAN) {
198215
values_read =
@@ -201,8 +218,7 @@ struct TypedFuzzEncoding {
201218
values_read = decoder->Decode(chunk_values.data(), read_size);
202219
}
203220
ARROW_CHECK_LE(values_read, read_size);
204-
chunk_values.resize(values_read);
205-
RETURN_NOT_OK(func(total_values, std::move(chunk_values)));
221+
RETURN_NOT_OK(func(total_values, std::span(chunk_values).first(values_read)));
206222
total_values += values_read;
207223
if (values_read < chunk_size) {
208224
break;
@@ -215,14 +231,16 @@ struct TypedFuzzEncoding {
215231
return Status::OK();
216232
}
217233

218-
Result<std::vector<c_type>> Decode(Encoding::type encoding,
219-
std::span<const uint8_t> encoded_data,
220-
int chunk_size) {
234+
Result<PoolVector<c_type>> Decode(Encoding::type encoding,
235+
std::span<const uint8_t> encoded_data,
236+
int chunk_size) {
221237
// Decoded chunk values shouldn't embed pointers to decoder scratch space.
222238
static_assert(decoded_values_can_be_persisted());
223-
std::vector<c_type> values;
224-
auto accumulate_chunk = [&](int offset, std::vector<c_type> chunk_values) {
239+
PoolVector<c_type> values(pool());
240+
auto accumulate_chunk = [&](int offset, std::span<const c_type> chunk_values) {
241+
BEGIN_CATCH_BAD_ALLOC
225242
values.insert(values.end(), chunk_values.begin(), chunk_values.end());
243+
END_CATCH_BAD_ALLOC
226244
return Status::OK();
227245
};
228246
RETURN_NOT_OK(
@@ -273,7 +291,7 @@ struct TypedFuzzEncoding {
273291

274292
// Re-encode and re-decode using roundtrip encoding
275293
{
276-
auto compare_chunk = [&](int offset, std::vector<c_type> chunk_values) {
294+
auto compare_chunk = [&](int offset, std::span<const c_type> chunk_values) {
277295
return CompareChunkAgainstReference(offset, chunk_values);
278296
};
279297
auto encoder = MakeEncoder(roundtrip_encoding_);
@@ -291,7 +309,8 @@ struct TypedFuzzEncoding {
291309
chunk_size, compare_chunk));
292310
}
293311
} else {
294-
encoder->Put(reference_values_);
312+
encoder->Put(reference_values_.data(),
313+
static_cast<int>(reference_values_.size()));
295314
auto reencoded_buffer = encoder->FlushValues();
296315
auto reencoded_data = reencoded_buffer->template span_as<uint8_t>();
297316
// Vary chunk sizes
@@ -462,7 +481,7 @@ struct TypedFuzzEncoding {
462481

463482
std::shared_ptr<Array> reference_array_;
464483
// Only for INT96 as there is no strictly equivalent Arrow type
465-
std::vector<c_type> reference_values_;
484+
PoolVector<c_type> reference_values_;
466485
};
467486

468487
} // namespace

0 commit comments

Comments
 (0)