Skip to content

Commit 74373bb

Browse files
aekulardier
authored and committed
Fix handling of thread features for scalars in Anderson2021 (halide#7726)
* Fix handling of thread features for scalars
* Remove unneeded change
1 parent 84c84c7 commit 74373bb

File tree

5 files changed

+100
-48
lines changed

5 files changed

+100
-48
lines changed

src/autoschedulers/anderson2021/LoopNest.cpp

Lines changed: 79 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -718,7 +718,7 @@ double LoopNest::compute_local_mem_stride(double stride, double bytes) const {
718718

719719
// Get the stride over "node's" storage and its element-wise stride for a unit
720720
// increment in the given thread loops
721-
Strides LoopNest::compute_strides(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const ThreadInfo &thread_info, bool verbose) const {
721+
Strides LoopNest::compute_strides(const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const ThreadInfo *thread_info, bool verbose) const {
722722
internal_assert(innermost_storage_dim >= 0);
723723

724724
if (verbose) {
@@ -756,7 +756,7 @@ Strides LoopNest::compute_strides(const LoadJacobian &jac, int innermost_storage
756756
}
757757

758758
Strides strides{storage_strides};
759-
for (const auto &thread_loop_var : thread_info.loop_vars) {
759+
for (const auto &thread_loop_var : thread_info->loop_vars) {
760760
int loop_index = stage->get_loop_index_from_var(thread_loop_var);
761761
bool loop_index_exists = loop_index >= 0;
762762

@@ -843,7 +843,8 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_
843843
return;
844844
}
845845

846-
const ThreadInfo &thread_info = *gpu_loop_info.thread_info;
846+
internal_assert(gpu_loop_info.thread_info != nullptr);
847+
const ThreadInfo *thread_info = gpu_loop_info.thread_info;
847848
bool is_shared_mem = consumer_site.gpu_store_memory_type == GPUMemoryType::Shared;
848849

849850
size_t actual_vector_dim = get_actual_vector_dim(consumer_store_bounds);
@@ -967,18 +968,29 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_
967968
}
968969

969970
template<typename T>
970-
void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<T> &mem_info, bool verbose) const {
971+
void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<T> &mem_info, bool verbose) const {
972+
int bytes_per_access = node->bytes_per_point;
973+
974+
// If the consumer is a scalar and is compute_root, then it will not be
975+
// surrounded by a gpu_threads loop, in which case thread_info will be null.
976+
// In this case, there is no need to compute the below thread/warp-related
977+
// details because only a single point is being computed
978+
if (!thread_info && is_scalar()) {
979+
mem_info.add_access_info(num_requests_per_warp, 1, bytes_per_access);
980+
return;
981+
}
982+
983+
internal_assert(thread_info != nullptr);
984+
971985
Strides strides = compute_strides(jac, innermost_dim, node, store_bounds, thread_info, verbose);
972986

973-
size_t dimensions = thread_info.loop_indices.size();
987+
size_t dimensions = thread_info->loop_indices.size();
974988
strides.dump(verbose);
975989

976-
int bytes_per_access = node->bytes_per_point;
977-
978990
{
979-
int num_requests = thread_info.num_regular_active_warps_per_block * num_requests_per_warp;
991+
int num_requests = thread_info->num_regular_active_warps_per_block * num_requests_per_warp;
980992
Accumulator<T> accumulator(bytes_per_access, dimensions, strides, verbose);
981-
thread_info.for_each_thread_id_in_first_warp(accumulator);
993+
thread_info->for_each_thread_id_in_first_warp(accumulator);
982994

983995
accumulator.add_access_info(
984996
num_requests,
@@ -987,21 +999,21 @@ void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const
987999

9881000
if (verbose) {
9891001
aslog(2) << "num_requests_per_warp = " << num_requests_per_warp << "\n";
990-
aslog(2) << "num_regular_warps = " << thread_info.num_regular_active_warps_per_block << "\n";
1002+
aslog(2) << "num_regular_warps = " << thread_info->num_regular_active_warps_per_block << "\n";
9911003
}
9921004
}
9931005

994-
if (!thread_info.has_tail_warp) {
1006+
if (!thread_info->has_tail_warp) {
9951007
return;
9961008
}
9971009

9981010
if (verbose) {
9991011
aslog(2) << "\nBEGIN tail warp\n";
1000-
aslog(2) << "# threads in tail warp: " << thread_info.num_threads_in_final_warp << "\n";
1012+
aslog(2) << "# threads in tail warp: " << thread_info->num_threads_in_final_warp << "\n";
10011013
}
10021014

10031015
Accumulator<T> accumulator(bytes_per_access, dimensions, strides, verbose);
1004-
thread_info.for_each_thread_id_in_tail_warp(accumulator);
1016+
thread_info->for_each_thread_id_in_tail_warp(accumulator);
10051017

10061018
accumulator.add_access_info(
10071019
num_requests_per_warp,
@@ -1013,18 +1025,27 @@ void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const
10131025
}
10141026
}
10151027

1016-
template void LoopNest::compute_num_mem_accesses_per_block<GlobalMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<GlobalMem> &mem_info, bool verbose) const;
1028+
template void LoopNest::compute_num_mem_accesses_per_block<GlobalMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<GlobalMem> &mem_info, bool verbose) const;
10171029

1018-
template void LoopNest::compute_num_mem_accesses_per_block<SharedMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<SharedMem> &mem_info, bool verbose) const;
1030+
template void LoopNest::compute_num_mem_accesses_per_block<SharedMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<SharedMem> &mem_info, bool verbose) const;
10191031

10201032
template<>
1021-
void LoopNest::compute_num_mem_accesses_per_block<LocalMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<LocalMem> &mem_info, bool verbose) const {
1033+
void LoopNest::compute_num_mem_accesses_per_block<LocalMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<LocalMem> &mem_info, bool verbose) const {
10221034
int bytes_per_access = node->bytes_per_point;
10231035

1036+
// If the consumer is a scalar and is compute_root, then it will not be
1037+
// surrounded by a gpu_threads loop, in which case thread_info will be null.
1038+
// In this case, there is no need to compute the below thread/warp-related
1039+
// details because only a single point is being computed
1040+
if (!thread_info && is_scalar()) {
1041+
mem_info.add_access_info(num_requests_per_warp, 1, bytes_per_access);
1042+
return;
1043+
}
1044+
10241045
{
1025-
int num_requests = thread_info.num_regular_active_warps_per_block * num_requests_per_warp;
1046+
int num_requests = thread_info->num_regular_active_warps_per_block * num_requests_per_warp;
10261047
LocalAccessAccumulator accumulator(bytes_per_access, verbose);
1027-
thread_info.for_each_thread_id_in_first_warp(accumulator);
1048+
thread_info->for_each_thread_id_in_first_warp(accumulator);
10281049

10291050
accumulator.add_access_info(
10301051
num_requests,
@@ -1033,21 +1054,21 @@ void LoopNest::compute_num_mem_accesses_per_block<LocalMem>(const LoadJacobian &
10331054

10341055
if (verbose) {
10351056
aslog(2) << "num_requests_per_warp = " << num_requests_per_warp << "\n";
1036-
aslog(2) << "num_regular_warps = " << thread_info.num_regular_active_warps_per_block << "\n";
1057+
aslog(2) << "num_regular_warps = " << thread_info->num_regular_active_warps_per_block << "\n";
10371058
}
10381059
}
10391060

1040-
if (!thread_info.has_tail_warp) {
1061+
if (!thread_info->has_tail_warp) {
10411062
return;
10421063
}
10431064

10441065
if (verbose) {
10451066
aslog(2) << "\nBEGIN tail warp\n";
1046-
aslog(2) << "# threads in tail warp: " << thread_info.num_threads_in_final_warp << "\n";
1067+
aslog(2) << "# threads in tail warp: " << thread_info->num_threads_in_final_warp << "\n";
10471068
}
10481069

10491070
LocalAccessAccumulator accumulator(bytes_per_access, verbose);
1050-
thread_info.for_each_thread_id_in_tail_warp(accumulator);
1071+
thread_info->for_each_thread_id_in_tail_warp(accumulator);
10511072

10521073
accumulator.add_access_info(
10531074
num_requests_per_warp,
@@ -1074,19 +1095,19 @@ std::pair<double, double> LoopNest::compute_local_mem_store_features(const LoadJ
10741095
}
10751096

10761097
template<typename T>
1077-
MemInfoType<T> LoopNest::compute_mem_store_info(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo &thread_info, double serial_loop_extents, bool verbose) const {
1098+
MemInfoType<T> LoopNest::compute_mem_store_info(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo *thread_info, double serial_loop_extents, bool verbose) const {
10781099
MemInfoType<T> mem_info;
10791100

10801101
compute_num_mem_accesses_per_block<T>(jac, node, consumer_store_bounds, thread_info, consumer_innermost_dim, serial_loop_extents, mem_info, verbose);
10811102
return mem_info;
10821103
}
10831104

1084-
template MemInfoType<GlobalMem> LoopNest::compute_mem_store_info<GlobalMem>(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo &thread_info, double serial_loop_extents, bool verbose) const;
1105+
template MemInfoType<GlobalMem> LoopNest::compute_mem_store_info<GlobalMem>(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo *thread_info, double serial_loop_extents, bool verbose) const;
10851106

1086-
template MemInfoType<SharedMem> LoopNest::compute_mem_store_info<SharedMem>(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo &thread_info, double serial_loop_extents, bool verbose) const;
1107+
template MemInfoType<SharedMem> LoopNest::compute_mem_store_info<SharedMem>(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo *thread_info, double serial_loop_extents, bool verbose) const;
10871108

10881109
template<typename T>
1089-
void LoopNest::compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo &thread_info, MemInfoType<T> &mem_info, double points_accessed_per_thread, bool verbose) const {
1110+
void LoopNest::compute_mem_load_features(const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo *thread_info, MemInfoType<T> &mem_info, double points_accessed_per_thread, bool verbose) const {
10901111
if (producer_has_been_scheduled) {
10911112
compute_num_mem_accesses_per_block<T>(jac, node, producer_store_bounds, thread_info, producer_innermost_dim, points_accessed_per_thread, mem_info, verbose);
10921113

@@ -1115,7 +1136,7 @@ template void LoopNest::compute_mem_load_features<GlobalMem>(const LoadJacobian
11151136
const FunctionDAG::Node *node,
11161137
const Bound &producer_store_bounds,
11171138
bool producer_has_been_scheduled,
1118-
const ThreadInfo &thread_info,
1139+
const ThreadInfo *thread_info,
11191140
MemInfoType<GlobalMem> &mem_info,
11201141
double points_accessed_per_thread,
11211142
bool verbose) const;
@@ -1125,7 +1146,7 @@ template void LoopNest::compute_mem_load_features<SharedMem>(const LoadJacobian
11251146
const FunctionDAG::Node *node,
11261147
const Bound &producer_store_bounds,
11271148
bool producer_has_been_scheduled,
1128-
const ThreadInfo &thread_info,
1149+
const ThreadInfo *thread_info,
11291150
MemInfoType<SharedMem> &mem_info,
11301151
double points_accessed_per_thread,
11311152
bool verbose) const;
@@ -1136,7 +1157,7 @@ void LoopNest::compute_mem_load_features<LocalMem>(const LoadJacobian &jac,
11361157
const FunctionDAG::Node *node,
11371158
const Bound &producer_store_bounds,
11381159
bool producer_has_been_scheduled,
1139-
const ThreadInfo &thread_info,
1160+
const ThreadInfo *thread_info,
11401161
MemInfoType<LocalMem> &mem_info,
11411162
double points_accessed_per_thread,
11421163
bool verbose) const {
@@ -2163,11 +2184,13 @@ void LoopNest::compute_features(const FunctionDAG &dag,
21632184
// The store_at location of the consumer
21642185
const auto *consumer_store_site = innermost ? parent : consumer_site.store;
21652186

2187+
bool inner_serial_loop_extents_computed = false;
21662188
std::vector<int64_t> inner_serial_loop_extents;
21672189

21682190
if (innermost && !stage->store_jacobian->empty()) {
21692191
const auto &bounds = consumer_site.store->get_bounds(stage->node);
21702192
inner_serial_loop_extents = gpu_loop_info.get_inner_serial_loop_extents(this);
2193+
inner_serial_loop_extents_computed = true;
21712194
auto store_jac = *stage->store_jacobian;
21722195

21732196
compute_gpu_store_features(
@@ -2223,10 +2246,16 @@ void LoopNest::compute_features(const FunctionDAG &dag,
22232246
for (const auto &j : e->load_jacobians) {
22242247
jacobians.emplace_back(j, e->producer);
22252248

2249+
if (!inner_serial_loop_extents_computed && !is_scalar()) {
2250+
inner_serial_loop_extents = gpu_loop_info.get_inner_serial_loop_extents(this);
2251+
inner_serial_loop_extents_computed = true;
2252+
}
2253+
22262254
// Thread loops may not be innermost so in the
22272255
// Jacobians we need to account for the stride
2228-
// of the inner loops
2229-
thread_jacobians.emplace_back(j * inner_serial_loop_extents, e->producer);
2256+
// of the inner loops (but only for non-scalars,
2257+
// since scalars never have inner serial loops)
2258+
thread_jacobians.emplace_back(is_scalar() ? j : j * inner_serial_loop_extents, e->producer);
22302259
}
22312260
} else {
22322261
// Consumer was inlined. Multiply the Jacobians to look through it.
@@ -2334,7 +2363,7 @@ void LoopNest::compute_features(const FunctionDAG &dag,
23342363
e->producer,
23352364
producer_store_bounds,
23362365
producer_has_been_scheduled,
2337-
*gpu_loop_info.thread_info,
2366+
gpu_loop_info.thread_info,
23382367
shared_mem_loads,
23392368
points_accessed,
23402369
verbose);
@@ -2365,7 +2394,7 @@ void LoopNest::compute_features(const FunctionDAG &dag,
23652394
e->producer,
23662395
producer_store_bounds,
23672396
producer_has_been_scheduled,
2368-
*gpu_loop_info.thread_info,
2397+
gpu_loop_info.thread_info,
23692398
global_mem_loads,
23702399
points_accessed,
23712400
verbose);
@@ -2405,7 +2434,7 @@ void LoopNest::compute_features(const FunctionDAG &dag,
24052434
e->producer,
24062435
producer_store_bounds,
24072436
producer_has_been_scheduled,
2408-
*gpu_loop_info.thread_info,
2437+
gpu_loop_info.thread_info,
24092438
local_mem_loads,
24102439
points_accessed,
24112440
verbose);
@@ -2678,10 +2707,20 @@ void LoopNest::compute_features(const FunctionDAG &dag,
26782707
inlined_feat.outer_parallelism = parallelism;
26792708
inlined_feat.num_blocks = parallelism;
26802709

2681-
internal_assert(gpu_loop_info.thread_info);
2682-
auto num_warps = it.value() * gpu_loop_info.total_serial_extents() * gpu_loop_info.thread_info->num_warps_per_block * inlined_feat.num_blocks;
2683-
inlined_feat.num_warps_per_block += num_warps;
2684-
inlined_feat.num_threads_per_block += gpu_loop_info.thread_info->num_threads;
2710+
internal_assert(is_scalar() || gpu_loop_info.thread_info);
2711+
2712+
auto num_warps_per_block = it.value();
2713+
auto num_threads_per_block = 1;
2714+
2715+
// If the func is being inlined into a scalar, then the scalar will not
2716+
// be surrounded by block/thread/serial loops so there's no need to take
2717+
// them into account when computing these features
2718+
if (!is_scalar()) {
2719+
num_warps_per_block *= gpu_loop_info.total_serial_extents() * gpu_loop_info.thread_info->num_warps_per_block * inlined_feat.num_blocks;
2720+
num_threads_per_block = gpu_loop_info.thread_info->num_threads;
2721+
}
2722+
inlined_feat.num_warps_per_block += num_warps_per_block;
2723+
inlined_feat.num_threads_per_block += num_threads_per_block;
26852724
double points_computed_per_thread = it.value() * feat.points_computed_per_thread;
26862725
inlined_feat.points_computed_per_thread += points_computed_per_thread;
26872726

@@ -2695,9 +2734,9 @@ void LoopNest::compute_features(const FunctionDAG &dag,
26952734

26962735
intermediate.innermost_pure_loop_extent = feat.innermost_pure_loop_extent;
26972736
intermediate.outer_parallelism = parallelism;
2698-
intermediate.num_warps_per_block = num_warps;
2737+
intermediate.num_warps_per_block = num_warps_per_block;
26992738

2700-
intermediate.num_threads_per_block = gpu_loop_info.thread_info->num_threads;
2739+
intermediate.num_threads_per_block = num_threads_per_block;
27012740
intermediate.points_computed_per_thread = points_computed_per_thread;
27022741
}
27032742
}

0 commit comments

Comments (0)