-
Notifications
You must be signed in to change notification settings - Fork 1k
[BUG] cudf::conditional_inner_join Fails with cudaErrorIllegalAddress #16115
Description
Describe the bug
The cudf::conditional_inner_join function fails with a cudaErrorIllegalAddress error. The issue arises because the function returns a pair of index vectors whose sizes are not bounded by the int32_t limit. While the indices are valid in the case of cudf::inner_join, they probably contain garbage data in the case of cudf::conditional_inner_join.
- Note that column_view stores its size as an int32_t. Reference: column_view.cpp#L47.
Steps/Code to reproduce bug
// Reproduction for the reported failure: runs cudf::conditional_inner_join on
// two single-column tables, checks that more than INT32_MAX index pairs come
// back, then walks the left indices in int32-sized chunks calling cudf::minmax
// and cudf::gather on each chunk. Any exception is printed rather than
// propagated, so the failure shows up in the captured stderr output.
TEST_F(ConditionalJoinTest, OutOfBoundJoinIndicesResult) {
  // Builds a one-column table holding the int32 sequence start, start+1, ...
  // with `size` rows.
  auto make_table = [](int32_t size, int32_t start) -> std::unique_ptr<cudf::table> {
    auto sequence_column = cudf::sequence(size, cudf::numeric_scalar<int32_t>(start));
    std::vector<std::unique_ptr<cudf::column>> columns;
    columns.push_back(std::move(sequence_column));
    return std::make_unique<cudf::table>(std::move(columns));
  };

  // Splits `value` into consecutive chunk lengths, each at most INT32_MAX - 1,
  // so every chunk can be wrapped in a view whose size is a 32-bit integer.
  auto split_into_int32_chunks = [](std::int64_t value) {
    std::vector<std::int32_t> result;
    auto chunk_size = static_cast<std::int64_t>(std::numeric_limits<std::int32_t>::max() - 1);
    while (value > static_cast<std::int64_t>(0)) {
      std::int64_t chunk = std::min(chunk_size, static_cast<std::int64_t>(value));
      // chunk <= INT32_MAX - 1, so narrowing to the vector's element type is
      // lossless; cast to the type actually stored instead of int64_t.
      result.push_back(static_cast<std::int32_t>(chunk));
      value -= chunk;
    }
    return result;
  };

  try {
    auto left_table = make_table(121125, 0);
    auto right_table = make_table(121125, 121125/10);
    auto left_view = left_table->view();
    auto right_view = right_table->view();
    std::cerr << "Left size: " << left_view.num_rows()
              << ", Right size: " << right_view.num_rows() << "\n";

    std::vector<int> join_column_indices = {0};
    cudf::table_view left_join_view = left_view.select(join_column_indices);
    cudf::table_view right_join_view = right_view.select(join_column_indices);

    // Join predicate: left.col0 < right.col0.
    auto left_column_ref =
      cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT);
    auto right_column_ref =
      cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
    auto join_predicate = cudf::ast::operation(cudf::ast::ast_operator::LESS,
                                               left_column_ref, right_column_ref);

    auto [left_indices, right_indices] =
      cudf::conditional_inner_join(left_join_view, right_join_view, join_predicate);
    std::cerr << "Left indices: " << left_indices->size()
              << ", Right indices: " << right_indices->size() << "\n";

    // The scenario under test requires more result rows than fit in int32_t.
    ASSERT_TRUE(left_indices->size() > std::numeric_limits<std::int32_t>::max());
    ASSERT_TRUE(right_indices->size() > std::numeric_limits<std::int32_t>::max());

    auto chunks = split_into_int32_chunks(left_indices->size());
    std::size_t offset = 0;
    for (auto chunk_size : chunks) {
      auto left_indices_span = cudf::device_span<cudf::size_type const>(
        left_indices->data() + offset, chunk_size);
      // NOTE(review): right_indices_span is built but never read below.
      auto right_indices_span = cudf::device_span<cudf::size_type const>(
        right_indices->data() + offset, chunk_size);
      cudf::column_view left_column{left_indices_span};

      // The ">>"/"<<" markers bracket the minmax call in the log output.
      std::cerr << ">> : cudf::minmax"
                << "\n";
      auto [min_val, max_val] = cudf::minmax(left_column);
      std::cerr << "<< : cudf::minmax"
                << "\n";

      cudf::column_metadata const metadata{""};
      auto arrow_min = cudf::to_arrow(*min_val, metadata);
      auto arrow_max = cudf::to_arrow(*max_val, metadata);
      std::cerr << "Min value: " << arrow_min->ToString()
                << ", Max value: " << arrow_max->ToString() << "\n";

      // The spans above already captured the old offset, so advancing here
      // does not affect the gather below.
      offset += chunk_size;
      auto left_joined =
        cudf::gather(left_view, cudf::column_view{left_indices_span});
    }
  } catch (const std::exception& e) {
    // Report the failure instead of propagating it.
    std::cerr << "Caught exception: " << e.what() << "\n";
  }
}
Left size: 121125, Right size: 121125
Left indices: 8729294034, Right indices: 8729294034
>> : cudf::minmax
Caught exception: CUDA error at: /home/alexander/envs/theseus_dev/include/rmm/cuda_device.hpp:115: cudaErrorIllegalAddress an illegal memory access was encountered
Running this test produces a cudaErrorIllegalAddress failure.
Expected behavior
The cudf::conditional_inner_join function should return valid indices, akin to the behavior of cudf::inner_join, without triggering a cudaErrorIllegalAddress error. Alternatively, it could return a specific error indicating that the operation is not supported.
Note: The test above works correctly with cudf::inner_join, even when it returns more indices than the int32_t maximum.
**Environment details**
Method of cuDF install: source code
v24.02.00, v24.04.00, v24.06.00 branch release