Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions cpp/benchmarks/transform/transform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,3 +112,10 @@ AST_TRANSFORM_BENCHMARK_DEFINE(
transform_int32_imbalanced_reuse, int32_t, TreeType::IMBALANCED_LEFT, true, false);
AST_TRANSFORM_BENCHMARK_DEFINE(
transform_double_imbalanced_unique, double, TreeType::IMBALANCED_LEFT, false, false);

AST_TRANSFORM_BENCHMARK_DEFINE(
transform_int32_imbalanced_unique_nulls, int32_t, TreeType::IMBALANCED_LEFT, false, true);
AST_TRANSFORM_BENCHMARK_DEFINE(
transform_int32_imbalanced_reuse_nulls, int32_t, TreeType::IMBALANCED_LEFT, true, true);
AST_TRANSFORM_BENCHMARK_DEFINE(
transform_double_imbalanced_unique_nulls, double, TreeType::IMBALANCED_LEFT, false, true);
66 changes: 66 additions & 0 deletions cpp/include/cudf/strings/detail/strings_column_factories.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,72 @@ std::unique_ptr<column> make_strings_column(CharIterator chars_begin,
std::move(null_mask));
}

namespace {
__device__ size_type masked_size(size_type idx,
string_view const* string_views,
bitmask_type const* null_mask,
size_type alt)
{
if (null_mask != nullptr) {
return bit_is_set(null_mask, idx) ? string_views[idx].size_bytes() : alt;
} else {
return string_views[idx].size_bytes();
}
}
} // namespace

/**
* @brief Create a strings-type column from string_views and a null mask.
*
* @param string_views The string_views representing the string data
* @param null_mask The validity bitmask in Arrow format
* @param null_count Number of null rows
* @param stream CUDA stream used for device memory operations
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column
*/
inline std::unique_ptr<column> make_strings_column_with_null_mask(
device_span<string_view const> string_views,
rmm::device_buffer null_mask,
size_type null_count,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
if (string_views.empty()) { return make_empty_column(type_id::STRING); }

// construct offsets column from the string sizes, using the null mask to set null string sizes to
// 0
auto string_views_ptr = string_views.data();
auto null_mask_ptr =
null_mask.is_empty() ? nullptr : static_cast<bitmask_type const*>(null_mask.data());

auto size_transformer = [string_views_ptr, null_mask_ptr] __device__(size_type idx) -> size_type {
return masked_size(idx, string_views_ptr, null_mask_ptr, 0);
};

auto offsets_iter =
cudf::detail::make_counting_transform_iterator(size_type{0}, size_transformer);

auto [offsets_column, total_bytes] =
make_offsets_child_column(offsets_iter, offsets_iter + string_views.size(), stream, mr);

// build chars column, using the null mask to set null string sizes to 0
auto chars_iter = cudf::detail::make_counting_transform_iterator(
size_type{0}, [string_views_ptr, null_mask_ptr] __device__(size_type idx) -> string_index_pair {
return string_index_pair{string_views_ptr[idx].data(),
masked_size(idx, string_views_ptr, null_mask_ptr, 0)};
});

auto chars_data = make_chars_buffer(
offsets_column->view(), total_bytes, chars_iter, string_views.size(), stream, mr);

return make_strings_column(string_views.size(),
std::move(offsets_column),
chars_data.release(),
null_count,
std::move(null_mask));
}

} // namespace detail
} // namespace strings
} // namespace cudf
2 changes: 2 additions & 0 deletions cpp/src/jit/span.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,8 @@ struct device_optional_span : device_span<T> {
/// @copydoc column_device_view::nullable
[[nodiscard]] CUDF_HOST_DEVICE bool nullable() const { return _null_mask != nullptr; }

[[nodiscard]] CUDF_HOST_DEVICE bitmask_type const* null_mask() const { return _null_mask; }

#ifdef __CUDACC__

/// @copydoc column_device_view::is_valid_nocheck
Expand Down
85 changes: 85 additions & 0 deletions cpp/src/jit/transform_bit_utils.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/

#pragma once
#include <cudf/types.hpp>

#include <cuda/std/algorithm>

namespace CUDF_EXPORT cudf {
namespace transformation {
namespace jit {

/// @param total Pointer to global memory to accumulate the total. Must be initialized to zero.
/// @param thread_total The per-thread total to add to the global total.
__device__ void device_reduce_sum(cudf::size_type* total, cudf::size_type thread_total)
{
static_assert(sizeof(cudf::size_type) <= sizeof(unsigned int));

cudf::size_type warp_total = thread_total;

auto participation_mask = __activemask();

for (int num_warp_sums = 16; num_warp_sums > 0; num_warp_sums /= 2) {
warp_total += __shfl_down_sync(participation_mask, warp_total, num_warp_sums);
}

if (threadIdx.x == 0) { atomicAdd(total, warp_total); }
}

/// @brief Compute the null bitmask of chunks (word-sized) from a boolean source array.
/// @param src The source boolean array
/// @param word_chunk_start The starting word chunk index to process
/// @param num_word_chunks The number of word chunks to process
/// @param dst_word The output bitmask word
/// @return The number of valid (set) bits written to the null mask
__device__ cudf::size_type bools_to_bits_chunk(bool const* __restrict__ src,
cudf::size_type src_size,
cudf::size_type word_chunk,
cudf::bitmask_type* __restrict__ dst_word)
{
static_assert(sizeof(cudf::bitmask_type) <= sizeof(unsigned int));

static constexpr auto num_word_bits =
static_cast<cudf::size_type>(sizeof(cudf::bitmask_type) * 8);

auto bit_start = word_chunk * num_word_bits;
auto bit_end = cuda::std::min(bit_start + num_word_bits, src_size);

cudf::bitmask_type out_word = 0;
for (auto b = bit_start; b < bit_end; b++) {
auto bit_pos = (b % num_word_bits);
auto bits = (src[b] ? cudf::bitmask_type{1} : cudf::bitmask_type{0}) << bit_pos;
out_word |= bits;
}

*dst_word = out_word;

return __popc(out_word);
}

__device__ void boolean_mask_to_nullmask_subkernel(bool const* __restrict__ src,
cudf::size_type src_size,
cudf::bitmask_type* __restrict__ dst,
cudf::size_type* __restrict__ valid_count)
{
constexpr auto num_word_bits = static_cast<cudf::size_type>(sizeof(cudf::bitmask_type) * 8);
auto num_chunks = (src_size + (num_word_bits - 1)) / num_word_bits;
cudf::size_type thread_valid_count = 0;

auto i = static_cast<int64_t>(threadIdx.x) +
static_cast<int64_t>(blockIdx.x) * static_cast<int64_t>(blockDim.x);
auto stride = static_cast<int64_t>(blockDim.x) * static_cast<int64_t>(gridDim.x);

for (; i < num_chunks; i += stride) {
thread_valid_count += bools_to_bits_chunk(src, src_size, i, &dst[i]);
}

device_reduce_sum(valid_count, thread_valid_count);
}

} // namespace jit
} // namespace transformation
} // namespace CUDF_EXPORT cudf
49 changes: 46 additions & 3 deletions cpp/src/transform/jit/kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

#include <jit/accessors.cuh>
#include <jit/span.cuh>
#include <jit/transform_bit_utils.cuh>

#pragma nv_hdrstop // The above headers are used by the kernel below and need to be included before
// it. Each UDF will have a different operation-udf.hpp generated for it, so we
Expand All @@ -38,6 +39,7 @@ template <null_aware is_null_aware,
CUDF_KERNEL void kernel(cudf::mutable_column_device_view_core const* outputs,
cudf::column_device_view_core const* inputs,
bool* intermediate_null_mask,
cudf::size_type* valid_count,
void* user_data)
{
// inputs to JITIFY kernels have to be either sized-integral types or pointers. Structs or
Expand All @@ -50,7 +52,12 @@ CUDF_KERNEL void kernel(cudf::mutable_column_device_view_core const* outputs,
for (auto i = start; i < size; i += stride) {
if constexpr (is_null_aware == null_aware::NO) {
if constexpr (may_evaluate_null) {
if (Out::is_null(outputs, i)) { continue; }
if ((true && ... && In::is_valid(inputs, i))) {
intermediate_null_mask[i] = true;
} else {
intermediate_null_mask[i] = false;
continue;
}
}

if constexpr (has_user_data) {
Expand All @@ -73,6 +80,14 @@ CUDF_KERNEL void kernel(cudf::mutable_column_device_view_core const* outputs,
if constexpr (may_evaluate_null) { intermediate_null_mask[i] = result.has_value(); }
}
}

if constexpr (may_evaluate_null) {
__threadfence_system(); // ensure intermediate null mask is visible to other threads before
// reduction

boolean_mask_to_nullmask_subkernel(
intermediate_null_mask, size, outputs[0].null_mask(), valid_count);
}
}

template <null_aware is_null_aware,
Expand All @@ -83,6 +98,7 @@ template <null_aware is_null_aware,
CUDF_KERNEL void fixed_point_kernel(cudf::mutable_column_device_view_core const* outputs,
cudf::column_device_view_core const* inputs,
bool* intermediate_null_mask,
cudf::size_type* valid_count,
void* user_data)
{
auto const start = cudf::detail::grid_1d::global_thread_id();
Expand All @@ -93,7 +109,12 @@ CUDF_KERNEL void fixed_point_kernel(cudf::mutable_column_device_view_core const*
for (auto i = start; i < size; i += stride) {
if constexpr (is_null_aware == null_aware::NO) {
if constexpr (may_evaluate_null) {
if (Out::is_null(outputs, i)) { continue; }
if ((true && ... && In::is_valid(inputs, i))) {
intermediate_null_mask[i] = true;
} else {
intermediate_null_mask[i] = false;
continue;
}
}

typename Out::type result{numeric::scaled_integer<typename Out::type::rep>{0, output_scale}};
Expand Down Expand Up @@ -121,6 +142,14 @@ CUDF_KERNEL void fixed_point_kernel(cudf::mutable_column_device_view_core const*
if constexpr (may_evaluate_null) { intermediate_null_mask[i] = result.has_value(); }
}
}

if constexpr (may_evaluate_null) {
__threadfence_system(); // ensure intermediate null mask is visible to other threads before
// reduction

boolean_mask_to_nullmask_subkernel(
intermediate_null_mask, size, outputs[0].null_mask(), valid_count);
}
}

template <null_aware is_null_aware,
Expand All @@ -131,6 +160,7 @@ template <null_aware is_null_aware,
CUDF_KERNEL void span_kernel(cudf::jit::device_optional_span<typename Out::type> const* outputs,
cudf::column_device_view_core const* inputs,
bool* intermediate_null_mask,
cudf::size_type* valid_count,
void* user_data)
{
auto const start = cudf::detail::grid_1d::global_thread_id();
Expand All @@ -140,7 +170,12 @@ CUDF_KERNEL void span_kernel(cudf::jit::device_optional_span<typename Out::type>
for (auto i = start; i < size; i += stride) {
if constexpr (is_null_aware == null_aware::NO) {
if constexpr (may_evaluate_null) {
if (Out::is_null(outputs, i)) { continue; }
if ((true && ... && In::is_valid(inputs, i))) {
intermediate_null_mask[i] = true;
} else {
intermediate_null_mask[i] = false;
continue;
}
}

if constexpr (has_user_data) {
Expand All @@ -162,6 +197,14 @@ CUDF_KERNEL void span_kernel(cudf::jit::device_optional_span<typename Out::type>
if constexpr (may_evaluate_null) { intermediate_null_mask[i] = result.has_value(); }
}
}

if constexpr (may_evaluate_null) {
__threadfence_system(); // ensure intermediate null mask is visible to other threads before
// reduction

boolean_mask_to_nullmask_subkernel(
intermediate_null_mask, size, outputs[0].null_mask(), valid_count);
}
}

} // namespace jit
Expand Down
Loading