Skip to content

Commit d0f847f

Browse files
authored
Remove device_memory_resource inheritance from all resources and adaptors (#2301)
## Summary - Remove `device_memory_resource` inheritance from all memory resources (stateless, stateful, and adaptors) - Remove `do_allocate` / `do_deallocate` / `do_is_equal` virtual overrides from all resources - Rewrite benchmark factory functions from `shared_ptr<device_memory_resource>` to `any_device_resource` - Convert `simulated_memory_resource` from DMR inheritance to CCCL concepts - Change copy/move from `= delete` to `= default` on `cuda_async_memory_resource`, `cuda_async_managed_memory_resource`, `sam_headroom_memory_resource`, and `simulated_memory_resource` (required for CCCL `resource_ref` copyability via `shared_resource` base) - Remove NullUpstream tests and DEVICE_MEMORY_RESOURCE_VIEW_TEST (no longer needed without DMR) Closes #2295 Part of #2011
1 parent 091a079 commit d0f847f

76 files changed

Lines changed: 641 additions & 2126 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

cpp/benchmarks/async_priming/async_priming_bench.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,10 @@
11
/*
2-
* SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
2+
* SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55

66
#include <rmm/cuda_device.hpp>
77
#include <rmm/mr/cuda_async_memory_resource.hpp>
8-
#include <rmm/mr/device_memory_resource.hpp>
98

109
#include <benchmark/benchmark.h>
1110
#include <benchmarks/utilities/cxxopts.hpp>

cpp/benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu

Lines changed: 18 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@ __global__ void compute_bound_kernel(int64_t* out)
3737
*out = static_cast<int64_t>(clock_current);
3838
}
3939

40-
using MRFactoryFunc = std::function<std::shared_ptr<rmm::mr::device_memory_resource>()>;
40+
using any_device_resource = cuda::mr::any_resource<cuda::mr::device_accessible>;
41+
using MRFactoryFunc = std::function<any_device_resource()>;
4142

4243
static void run_prewarm(rmm::cuda_stream_pool& stream_pool, rmm::device_async_resource_ref mr)
4344
{
@@ -63,18 +64,18 @@ static void BM_MultiStreamAllocations(benchmark::State& state, MRFactoryFunc con
6364
{
6465
auto mr = factory();
6566

66-
rmm::mr::set_current_device_resource_ref(mr.get());
67+
rmm::mr::set_current_device_resource_ref(mr);
6768

6869
auto num_streams = state.range(0);
6970
auto num_kernels = state.range(1);
7071
bool do_prewarm = state.range(2) != 0;
7172

7273
auto stream_pool = rmm::cuda_stream_pool(static_cast<std::size_t>(num_streams));
7374

74-
if (do_prewarm) { run_prewarm(stream_pool, mr.get()); }
75+
if (do_prewarm) { run_prewarm(stream_pool, mr); }
7576

7677
for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores)
77-
run_test(static_cast<std::size_t>(num_kernels), stream_pool, mr.get());
78+
run_test(static_cast<std::size_t>(num_kernels), stream_pool, mr);
7879
cudaDeviceSynchronize();
7980
}
8081

@@ -83,31 +84,29 @@ static void BM_MultiStreamAllocations(benchmark::State& state, MRFactoryFunc con
8384
rmm::mr::reset_current_device_resource_ref();
8485
}
8586

86-
inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
87+
inline any_device_resource make_cuda() { return rmm::mr::cuda_memory_resource{}; }
8788

88-
inline auto make_cuda_async() { return std::make_shared<rmm::mr::cuda_async_memory_resource>(); }
89+
inline any_device_resource make_cuda_async() { return rmm::mr::cuda_async_memory_resource{}; }
8990

90-
inline auto make_pool()
91+
inline any_device_resource make_pool()
9192
{
92-
return std::make_shared<rmm::mr::pool_memory_resource>(*make_cuda(),
93-
rmm::percent_of_free_device_memory(50));
93+
rmm::mr::cuda_memory_resource cuda{};
94+
return rmm::mr::pool_memory_resource{cuda, rmm::percent_of_free_device_memory(50)};
9495
}
9596

96-
inline auto make_arena()
97+
inline any_device_resource make_arena()
9798
{
98-
return std::make_shared<rmm::mr::arena_memory_resource>(
99-
rmm::mr::get_current_device_resource_ref());
99+
return rmm::mr::arena_memory_resource{rmm::mr::get_current_device_resource_ref()};
100100
}
101101

102-
inline auto make_binning()
102+
inline any_device_resource make_binning()
103103
{
104104
// Add a binning_memory_resource with fixed-size bins of sizes 256, 512, 1024, 2048 and 4096KiB
105105
// Larger allocations will use the pool resource
106106
constexpr auto min_bin_pow2{18};
107107
constexpr auto max_bin_pow2{22};
108-
auto mr =
109-
std::make_shared<rmm::mr::binning_memory_resource>(*make_pool(), min_bin_pow2, max_bin_pow2);
110-
return mr;
108+
auto pool = make_pool();
109+
return rmm::mr::binning_memory_resource{pool, min_bin_pow2, max_bin_pow2};
111110
}
112111

113112
static void benchmark_range(benchmark::internal::Benchmark* bench)
@@ -171,9 +170,9 @@ void run_profile(std::string const& resource_name, int kernel_count, int stream_
171170
auto mr = mr_factory();
172171
auto stream_pool = rmm::cuda_stream_pool(static_cast<std::size_t>(stream_count));
173172

174-
if (prewarm) { run_prewarm(stream_pool, mr.get()); }
173+
if (prewarm) { run_prewarm(stream_pool, mr); }
175174

176-
run_test(static_cast<std::size_t>(kernel_count), stream_pool, mr.get());
175+
run_test(static_cast<std::size_t>(kernel_count), stream_pool, mr);
177176
}
178177

179178
int main(int argc, char** argv)
@@ -193,7 +192,7 @@ int main(int argc, char** argv)
193192

194193
options.add_options()( //
195194
"r,resource",
196-
"Type of device_memory_resource",
195+
"Type of memory resource",
197196
cxxopts::value<std::string>()->default_value("pool"));
198197

199198
options.add_options()( //

cpp/benchmarks/random_allocations/random_allocations.cpp

Lines changed: 21 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@
88
#include <rmm/mr/binning_memory_resource.hpp>
99
#include <rmm/mr/cuda_async_memory_resource.hpp>
1010
#include <rmm/mr/cuda_memory_resource.hpp>
11-
#include <rmm/mr/device_memory_resource.hpp>
1211
#include <rmm/mr/per_device_resource.hpp>
1312
#include <rmm/mr/pool_memory_resource.hpp>
13+
#include <rmm/resource_ref.hpp>
1414

1515
#include <benchmark/benchmark.h>
1616
#include <benchmarks/utilities/cxxopts.hpp>
@@ -49,7 +49,7 @@ allocation remove_at(allocation_vector& allocs, std::size_t index)
4949
}
5050

5151
template <typename SizeDistribution>
52-
void random_allocation_free(rmm::mr::device_memory_resource& mr,
52+
void random_allocation_free(rmm::device_async_resource_ref mr,
5353
SizeDistribution size_distribution,
5454
std::size_t num_allocations,
5555
std::size_t max_usage, // in MiB
@@ -127,7 +127,7 @@ void random_allocation_free(rmm::mr::device_memory_resource& mr,
127127
} // namespace
128128

129129
void uniform_random_allocations(
130-
rmm::mr::device_memory_resource& mr,
130+
rmm::device_async_resource_ref mr,
131131
std::size_t num_allocations, // NOLINT(bugprone-easily-swappable-parameters)
132132
std::size_t max_allocation_size, // size in MiB
133133
std::size_t max_usage,
@@ -138,7 +138,7 @@ void uniform_random_allocations(
138138
}
139139

140140
// TODO figure out how to map a normal distribution to integers between 1 and max_allocation_size
141-
/*void normal_random_allocations(rmm::mr::device_memory_resource& mr,
141+
/*void normal_random_allocations(rmm::device_async_resource_ref mr,
142142
std::size_t num_allocations = 1000,
143143
std::size_t mean_allocation_size = 500, // in MiB
144144
std::size_t stddev_allocation_size = 500, // in MiB
@@ -148,36 +148,36 @@ void uniform_random_allocations(
148148
}*/
149149

150150
/// MR factory functions
151-
inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
151+
using any_device_resource = cuda::mr::any_resource<cuda::mr::device_accessible>;
152152

153-
inline auto make_cuda_async() { return std::make_shared<rmm::mr::cuda_async_memory_resource>(); }
153+
inline any_device_resource make_cuda() { return rmm::mr::cuda_memory_resource{}; }
154154

155-
inline auto make_pool()
155+
inline any_device_resource make_cuda_async() { return rmm::mr::cuda_async_memory_resource{}; }
156+
157+
inline any_device_resource make_pool()
156158
{
157-
return std::make_shared<rmm::mr::pool_memory_resource>(*make_cuda(),
158-
rmm::percent_of_free_device_memory(50));
159+
rmm::mr::cuda_memory_resource cuda{};
160+
return rmm::mr::pool_memory_resource{cuda, rmm::percent_of_free_device_memory(50)};
159161
}
160162

161-
inline auto make_arena()
163+
inline any_device_resource make_arena()
162164
{
163165
auto free = rmm::available_device_memory().first;
164166
constexpr auto reserve{64UL << 20}; // Leave some space for CUDA overhead.
165-
return std::make_shared<rmm::mr::arena_memory_resource>(
166-
rmm::mr::get_current_device_resource_ref(), free - reserve);
167+
return rmm::mr::arena_memory_resource{rmm::mr::get_current_device_resource_ref(), free - reserve};
167168
}
168169

169-
inline auto make_binning()
170+
inline any_device_resource make_binning()
170171
{
171172
// Add a binning_memory_resource with fixed-size bins of sizes 256, 512, 1024, 2048 and 4096KiB
172173
// Larger allocations will use the pool resource
173174
constexpr auto min_bin_pow2{18};
174175
constexpr auto max_bin_pow2{22};
175-
auto mr =
176-
std::make_shared<rmm::mr::binning_memory_resource>(*make_pool(), min_bin_pow2, max_bin_pow2);
177-
return mr;
176+
auto pool = make_pool();
177+
return rmm::mr::binning_memory_resource{pool, min_bin_pow2, max_bin_pow2};
178178
}
179179

180-
using MRFactoryFunc = std::function<std::shared_ptr<rmm::mr::device_memory_resource>()>;
180+
using MRFactoryFunc = std::function<any_device_resource()>;
181181

182182
constexpr std::size_t max_usage = 16000;
183183

@@ -190,7 +190,7 @@ static void BM_RandomAllocations(benchmark::State& state, MRFactoryFunc const& f
190190

191191
try {
192192
for (auto _ : state) { // NOLINT(clang-analyzer-deadcode.DeadStores)
193-
uniform_random_allocations(*mr, num_allocations, max_size, max_usage);
193+
uniform_random_allocations(mr, num_allocations, max_size, max_usage);
194194
}
195195
} catch (std::exception const& e) {
196196
std::cout << "Error: " << e.what() << "\n";
@@ -243,8 +243,7 @@ void declare_benchmark(std::string const& name)
243243
if (name == "cuda") {
244244
BENCHMARK_CAPTURE(BM_RandomAllocations, cuda_mr, &make_cuda) // NOLINT
245245
->Apply(benchmark_range);
246-
}
247-
if (name == "cuda_async") {
246+
} else if (name == "cuda_async") {
248247
BENCHMARK_CAPTURE(BM_RandomAllocations, cuda_async_mr, &make_cuda_async) // NOLINT
249248
->Apply(benchmark_range);
250249
} else if (name == "binning") {
@@ -268,7 +267,7 @@ static void profile_random_allocations(MRFactoryFunc const& factory,
268267
auto mr = factory();
269268

270269
try {
271-
uniform_random_allocations(*mr, num_allocations, max_size, max_usage);
270+
uniform_random_allocations(mr, num_allocations, max_size, max_usage);
272271
} catch (std::exception const& e) {
273272
std::cout << "Error: " << e.what() << "\n";
274273
}
@@ -288,7 +287,7 @@ int main(int argc, char** argv)
288287
options.add_options()(
289288
"p,profile", "Profiling mode: run once", cxxopts::value<bool>()->default_value("false"));
290289
options.add_options()("r,resource",
291-
"Type of device_memory_resource",
290+
"Type of memory resource",
292291
cxxopts::value<std::string>()->default_value("pool"));
293292
options.add_options()("n,numallocs",
294293
"Number of allocations (default of 0 tests a range)",

cpp/benchmarks/replay/replay.cpp

Lines changed: 29 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,17 @@
33
* SPDX-License-Identifier: Apache-2.0
44
*/
55

6+
#include <rmm/aligned.hpp>
67
#include <rmm/cuda_stream_view.hpp>
78
#include <rmm/detail/error.hpp>
89
#include <rmm/logger.hpp>
910
#include <rmm/mr/arena_memory_resource.hpp>
1011
#include <rmm/mr/binning_memory_resource.hpp>
1112
#include <rmm/mr/cuda_memory_resource.hpp>
12-
#include <rmm/mr/device_memory_resource.hpp>
1313
#include <rmm/mr/managed_memory_resource.hpp>
1414
#include <rmm/mr/per_device_resource.hpp>
1515
#include <rmm/mr/pool_memory_resource.hpp>
16+
#include <rmm/resource_ref.hpp>
1617

1718
#include <cuda/iterator>
1819
#include <thrust/execution_policy.h>
@@ -31,56 +32,49 @@
3132
#include <iterator>
3233
#include <memory>
3334
#include <numeric>
35+
#include <optional>
3436
#include <string>
3537
#include <thread>
3638

37-
/// MR factory functions
38-
std::shared_ptr<rmm::mr::device_memory_resource> make_cuda(std::size_t = 0)
39-
{
40-
return std::make_shared<rmm::mr::cuda_memory_resource>();
41-
}
39+
using any_device_resource = cuda::mr::any_resource<cuda::mr::device_accessible>;
4240

43-
std::shared_ptr<rmm::mr::device_memory_resource> make_managed(std::size_t = 0)
44-
{
45-
return std::make_shared<rmm::mr::managed_memory_resource>();
46-
}
41+
/// MR factory functions
42+
any_device_resource make_cuda(std::size_t = 0) { return rmm::mr::cuda_memory_resource{}; }
4743

48-
std::shared_ptr<rmm::mr::device_memory_resource> make_simulated(std::size_t simulated_size)
49-
{
50-
return std::make_shared<rmm::mr::simulated_memory_resource>(simulated_size);
51-
}
44+
any_device_resource make_managed(std::size_t = 0) { return rmm::mr::managed_memory_resource{}; }
5245

53-
inline auto make_pool(std::size_t simulated_size)
46+
inline any_device_resource make_pool(std::size_t simulated_size)
5447
{
5548
if (simulated_size > 0) {
56-
return std::make_shared<rmm::mr::pool_memory_resource>(
57-
*make_simulated(simulated_size), simulated_size, simulated_size);
49+
rmm::mr::simulated_memory_resource sim{simulated_size};
50+
return rmm::mr::pool_memory_resource{sim, simulated_size, simulated_size};
5851
}
59-
return std::make_shared<rmm::mr::pool_memory_resource>(*make_cuda(), 0);
52+
rmm::mr::cuda_memory_resource cuda{};
53+
return rmm::mr::pool_memory_resource{cuda, 0};
6054
}
6155

62-
inline auto make_arena(std::size_t simulated_size)
56+
inline any_device_resource make_arena(std::size_t simulated_size)
6357
{
6458
if (simulated_size > 0) {
65-
return std::make_shared<rmm::mr::arena_memory_resource>(
66-
rmm::mr::get_current_device_resource_ref(), simulated_size);
59+
return rmm::mr::arena_memory_resource{rmm::mr::get_current_device_resource_ref(),
60+
simulated_size};
6761
}
68-
return std::make_shared<rmm::mr::arena_memory_resource>(
69-
rmm::mr::get_current_device_resource_ref());
62+
return rmm::mr::arena_memory_resource{rmm::mr::get_current_device_resource_ref()};
7063
}
7164

72-
inline auto make_binning(std::size_t simulated_size)
65+
inline any_device_resource make_binning(std::size_t simulated_size)
7366
{
74-
auto mr = std::make_shared<rmm::mr::binning_memory_resource>(*make_pool(simulated_size));
67+
auto pool = make_pool(simulated_size);
68+
auto mr = rmm::mr::binning_memory_resource{pool};
7569
const auto min_size_exp{18};
7670
const auto max_size_exp{22};
7771
for (std::size_t i = min_size_exp; i <= max_size_exp; i++) {
78-
mr->add_bin(1 << i);
72+
mr.add_bin(1 << i);
7973
}
8074
return mr;
8175
}
8276

83-
using MRFactoryFunc = std::function<std::shared_ptr<rmm::mr::device_memory_resource>(std::size_t)>;
77+
using MRFactoryFunc = std::function<any_device_resource(std::size_t)>;
8478

8579
/**
8680
* @brief Represents an allocation made during the replay
@@ -95,15 +89,15 @@ struct allocation {
9589

9690
/**
9791
* @brief Function object for running a replay benchmark with the specified
98-
* `device_memory_resource`.
92+
* memory resource.
9993
*
100-
* @tparam MR The type of the `device_memory_resource` to use for allocation
94+
* @tparam MR The type of the memory resource to use for allocation
10195
* replay
10296
*/
10397
struct replay_benchmark {
10498
MRFactoryFunc factory_;
10599
std::size_t simulated_size_;
106-
std::shared_ptr<rmm::mr::device_memory_resource> mr_{};
100+
std::optional<any_device_resource> mr_{};
107101
std::vector<std::vector<rmm::detail::event>> const& events_{};
108102

109103
// Maps a pointer from the event log to an active allocation
@@ -173,7 +167,7 @@ struct replay_benchmark {
173167
{
174168
if (state.thread_index() == 0) {
175169
RMM_LOG_INFO("------ Start of Benchmark -----");
176-
mr_ = factory_(simulated_size_);
170+
mr_.emplace(factory_(simulated_size_));
177171
}
178172
// Can't release threads until MR is set up.
179173
barrier_.arrive_and_wait();
@@ -193,7 +187,7 @@ struct replay_benchmark {
193187
auto alloc = ptr_alloc.second;
194188
num_leaked++;
195189
total_leaked += alloc.size;
196-
mr_->deallocate_sync(alloc.ptr, alloc.size);
190+
mr_->deallocate_sync(alloc.ptr, alloc.size, rmm::CUDA_ALLOCATION_ALIGNMENT);
197191
}
198192
if (num_leaked > 0) {
199193
std::cout << "LOG shows leak of " << num_leaked << " allocations of " << total_leaked
@@ -225,11 +219,11 @@ struct replay_benchmark {
225219

226220
// rmm::detail::action::ALLOCATE_FAILURE is ignored.
227221
if (rmm::detail::action::ALLOCATE == event.act) {
228-
auto ptr = mr_->allocate_sync(event.size);
222+
auto ptr = mr_->allocate_sync(event.size, rmm::CUDA_ALLOCATION_ALIGNMENT);
229223
set_allocation(event.pointer, allocation{ptr, event.size});
230224
} else if (rmm::detail::action::FREE == event.act) {
231225
auto alloc = remove_allocation(event.pointer);
232-
mr_->deallocate_sync(alloc.ptr, event.size);
226+
mr_->deallocate_sync(alloc.ptr, event.size, rmm::CUDA_ALLOCATION_ALIGNMENT);
233227
}
234228

235229
event_index++;
@@ -355,7 +349,7 @@ int main(int argc, char** argv)
355349

356350
options.add_options()("f,file", "Name of RMM log file.", cxxopts::value<std::string>());
357351
options.add_options()("r,resource",
358-
"Type of device_memory_resource",
352+
"Type of memory resource",
359353
cxxopts::value<std::string>()->default_value("pool"));
360354
options.add_options()(
361355
"s,size",

cpp/benchmarks/utilities/log_parser.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
#include "rapidcsv.h"
99

1010
#include <rmm/detail/error.hpp>
11-
#include <rmm/mr/device_memory_resource.hpp>
1211

1312
#include <chrono>
1413
#include <cstdint>

0 commit comments

Comments
 (0)