@@ -718,7 +718,7 @@ double LoopNest::compute_local_mem_stride(double stride, double bytes) const {
718718
719719// Get the stride over "node's" storage and its element-wise stride for a unit
720720// increment in the given thread loops
// Patch note: the only change to this signature is that `thread_info` becomes a
// `const ThreadInfo *` instead of a `const ThreadInfo &`.
// NOTE(review): no null check on thread_info is visible in this hunk, yet the
// body later reads thread_info->loop_vars; callers presumably must pass a
// non-null pointer -- confirm against the full function.
721- Strides LoopNest::compute_strides (const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const ThreadInfo & thread_info, bool verbose) const {
721+ Strides LoopNest::compute_strides (const LoadJacobian &jac, int innermost_storage_dim, const FunctionDAG::Node *storage_node, const Bound &store_bounds, const ThreadInfo * thread_info, bool verbose) const {
722722 internal_assert (innermost_storage_dim >= 0 );
723723
724724 if (verbose) {
@@ -756,7 +756,7 @@ Strides LoopNest::compute_strides(const LoadJacobian &jac, int innermost_storage
756756 }
757757
758758 Strides strides{storage_strides};
759- for (const auto &thread_loop_var : thread_info. loop_vars ) {
759+ for (const auto &thread_loop_var : thread_info-> loop_vars ) {
760760 int loop_index = stage->get_loop_index_from_var (thread_loop_var);
761761 bool loop_index_exists = loop_index >= 0 ;
762762
@@ -843,7 +843,8 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_
843843 return ;
844844 }
845845
// Patch note: the removed line dereferenced gpu_loop_info.thread_info
// unconditionally. The replacement asserts that it is non-null first and keeps
// it as a pointer, matching the pointer-taking helper signatures in this patch.
846- const ThreadInfo &thread_info = *gpu_loop_info.thread_info ;
846+ internal_assert (gpu_loop_info.thread_info != nullptr );
847+ const ThreadInfo *thread_info = gpu_loop_info.thread_info ;
847848 bool is_shared_mem = consumer_site.gpu_store_memory_type == GPUMemoryType::Shared;
848849
849850 size_t actual_vector_dim = get_actual_vector_dim (consumer_store_bounds);
@@ -967,18 +968,29 @@ void LoopNest::compute_gpu_store_features(const LoadJacobian &jac, int consumer_
967968}
968969
// Generic (non-LocalMem) version of compute_num_mem_accesses_per_block.
// Patch summary for this hunk:
//  - `thread_info` is now passed by pointer so that a null value is expressible.
//  - `bytes_per_access` is read from node->bytes_per_point at the top of the
//    function (it used to be read further down) so the early-out can use it.
//  - When thread_info is null AND is_scalar() is true, a single access-info
//    record (num_requests_per_warp, 1, bytes_per_access) is added to mem_info
//    and the function returns without any per-warp work.
//  - In every other case thread_info is asserted non-null before first use.
969970template <typename T>
970- void LoopNest::compute_num_mem_accesses_per_block (const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo &thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<T> &mem_info, bool verbose) const {
971+ void LoopNest::compute_num_mem_accesses_per_block (const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo *thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<T> &mem_info, bool verbose) const {
972+ int bytes_per_access = node->bytes_per_point ;
973+
974+ // If the consumer is a scalar and is compute_root, then it will not be
975+ // surrounded by a gpu_threads loop, in which case thread_info will be null.
976+ // In this case, there is no need to compute the below thread/warp-related
977+ // details because only a single point is being computed
978+ if (!thread_info && is_scalar ()) {
979+ mem_info.add_access_info (num_requests_per_warp, 1 , bytes_per_access);
980+ return ;
981+ }
982+
// A null thread_info for a non-scalar loop nest is treated as an internal error.
983+ internal_assert (thread_info != nullptr );
984+
971985 Strides strides = compute_strides (jac, innermost_dim, node, store_bounds, thread_info, verbose);
972986
973- size_t dimensions = thread_info. loop_indices .size ();
987+ size_t dimensions = thread_info-> loop_indices .size ();
974988 strides.dump (verbose);
975989
976- int bytes_per_access = node->bytes_per_point ;
977-
// First pass: the accumulator is driven over the thread ids of the first warp,
// and the request count is scaled by num_regular_active_warps_per_block.
// NOTE(review): num_requests_per_warp is a double, so the product below is
// narrowed to int on assignment (unchanged by this patch) -- confirm the
// truncation is intended.
978990 {
979- int num_requests = thread_info. num_regular_active_warps_per_block * num_requests_per_warp;
991+ int num_requests = thread_info-> num_regular_active_warps_per_block * num_requests_per_warp;
980992 Accumulator<T> accumulator (bytes_per_access, dimensions, strides, verbose);
981- thread_info. for_each_thread_id_in_first_warp (accumulator);
993+ thread_info-> for_each_thread_id_in_first_warp (accumulator);
982994
983995 accumulator.add_access_info (
984996 num_requests,
@@ -987,21 +999,21 @@ void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const
987999
9881000 if (verbose) {
9891001 aslog (2 ) << " num_requests_per_warp = " << num_requests_per_warp << " \n " ;
990- aslog (2 ) << " num_regular_warps = " << thread_info. num_regular_active_warps_per_block << " \n " ;
1002+ aslog (2 ) << " num_regular_warps = " << thread_info-> num_regular_active_warps_per_block << " \n " ;
9911003 }
9921004 }
9931005
994- if (!thread_info. has_tail_warp ) {
1006+ if (!thread_info-> has_tail_warp ) {
9951007 return ;
9961008 }
9971009
9981010 if (verbose) {
9991011 aslog (2 ) << " \n BEGIN tail warp\n " ;
1000- aslog (2 ) << " # threads in tail warp: " << thread_info. num_threads_in_final_warp << " \n " ;
1012+ aslog (2 ) << " # threads in tail warp: " << thread_info-> num_threads_in_final_warp << " \n " ;
10011013 }
10021014
10031015 Accumulator<T> accumulator (bytes_per_access, dimensions, strides, verbose);
1004- thread_info. for_each_thread_id_in_tail_warp (accumulator);
1016+ thread_info-> for_each_thread_id_in_tail_warp (accumulator);
10051017
10061018 accumulator.add_access_info (
10071019 num_requests_per_warp,
@@ -1013,18 +1025,27 @@ void LoopNest::compute_num_mem_accesses_per_block(const LoadJacobian &jac, const
10131025 }
10141026}
10151027
// Explicit instantiations of the generic template for GlobalMem and SharedMem.
// The patch only updates the `thread_info` parameter from a reference to a
// pointer so that these declarations keep matching the template definition.
1016- template void LoopNest::compute_num_mem_accesses_per_block<GlobalMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo & thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<GlobalMem> &mem_info, bool verbose) const ;
1028+ template void LoopNest::compute_num_mem_accesses_per_block<GlobalMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo * thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<GlobalMem> &mem_info, bool verbose) const ;
10171029
1018- template void LoopNest::compute_num_mem_accesses_per_block<SharedMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo & thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<SharedMem> &mem_info, bool verbose) const ;
1030+ template void LoopNest::compute_num_mem_accesses_per_block<SharedMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo * thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<SharedMem> &mem_info, bool verbose) const ;
10191031
// LocalMem specialization of compute_num_mem_accesses_per_block. It uses a
// LocalAccessAccumulator and does not call compute_strides in the lines shown.
// Patch summary: `thread_info` becomes a pointer, and the same null-and-scalar
// early-out as in the generic template is added.
10201032template <>
1021- void LoopNest::compute_num_mem_accesses_per_block<LocalMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo & thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<LocalMem> &mem_info, bool verbose) const {
1033+ void LoopNest::compute_num_mem_accesses_per_block<LocalMem>(const LoadJacobian &jac, const FunctionDAG::Node *node, const Bound &store_bounds, const ThreadInfo * thread_info, int innermost_dim, double num_requests_per_warp, MemInfoType<LocalMem> &mem_info, bool verbose) const {
10221034 int bytes_per_access = node->bytes_per_point ;
10231035
1036+ // If the consumer is a scalar and is compute_root, then it will not be
1037+ // surrounded by a gpu_threads loop, in which case thread_info will be null.
1038+ // In this case, there is no need to compute the below thread/warp-related
1039+ // details because only a single point is being computed
1040+ if (!thread_info && is_scalar ()) {
1041+ mem_info.add_access_info (num_requests_per_warp, 1 , bytes_per_access);
1042+ return ;
1043+ }
1044+
// NOTE(review): the generic template asserts `thread_info != nullptr` at this
// point, but this specialization does not. If thread_info is null while
// is_scalar() is false, the dereference below is reached with a null pointer.
// Consider adding the same internal_assert here for consistency.
10241045 {
1025- int num_requests = thread_info. num_regular_active_warps_per_block * num_requests_per_warp;
1046+ int num_requests = thread_info-> num_regular_active_warps_per_block * num_requests_per_warp;
10261047 LocalAccessAccumulator accumulator (bytes_per_access, verbose);
1027- thread_info. for_each_thread_id_in_first_warp (accumulator);
1048+ thread_info-> for_each_thread_id_in_first_warp (accumulator);
10281049
10291050 accumulator.add_access_info (
10301051 num_requests,
@@ -1033,21 +1054,21 @@ void LoopNest::compute_num_mem_accesses_per_block<LocalMem>(const LoadJacobian &
10331054
10341055 if (verbose) {
10351056 aslog (2 ) << " num_requests_per_warp = " << num_requests_per_warp << " \n " ;
1036- aslog (2 ) << " num_regular_warps = " << thread_info. num_regular_active_warps_per_block << " \n " ;
1057+ aslog (2 ) << " num_regular_warps = " << thread_info-> num_regular_active_warps_per_block << " \n " ;
10371058 }
10381059 }
10391060
1040- if (!thread_info. has_tail_warp ) {
1061+ if (!thread_info-> has_tail_warp ) {
10411062 return ;
10421063 }
10431064
10441065 if (verbose) {
10451066 aslog (2 ) << " \n BEGIN tail warp\n " ;
1046- aslog (2 ) << " # threads in tail warp: " << thread_info. num_threads_in_final_warp << " \n " ;
1067+ aslog (2 ) << " # threads in tail warp: " << thread_info-> num_threads_in_final_warp << " \n " ;
10471068 }
10481069
10491070 LocalAccessAccumulator accumulator (bytes_per_access, verbose);
1050- thread_info. for_each_thread_id_in_tail_warp (accumulator);
1071+ thread_info-> for_each_thread_id_in_tail_warp (accumulator);
10511072
10521073 accumulator.add_access_info (
10531074 num_requests_per_warp,
@@ -1074,19 +1095,19 @@ std::pair<double, double> LoopNest::compute_local_mem_store_features(const LoadJ
10741095}
10751096
// Builds a default-constructed MemInfoType<T>, fills it by forwarding to
// compute_num_mem_accesses_per_block<T>, and returns it by value.
// `serial_loop_extents` is passed in the position of that helper's
// `num_requests_per_warp` parameter.
// Patch note: `thread_info` is now a pointer and is forwarded unchanged; this
// wrapper performs no null check of its own.
10761097template <typename T>
1077- MemInfoType<T> LoopNest::compute_mem_store_info (const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo & thread_info, double serial_loop_extents, bool verbose) const {
1098+ MemInfoType<T> LoopNest::compute_mem_store_info (const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo * thread_info, double serial_loop_extents, bool verbose) const {
10781099 MemInfoType<T> mem_info;
10791100
10801101 compute_num_mem_accesses_per_block<T>(jac, node, consumer_store_bounds, thread_info, consumer_innermost_dim, serial_loop_extents, mem_info, verbose);
10811102 return mem_info;
10821103}
10831104
// Explicit instantiations of compute_mem_store_info for GlobalMem and
// SharedMem; only the `thread_info` parameter type changes (reference to
// pointer).
1084- template MemInfoType<GlobalMem> LoopNest::compute_mem_store_info<GlobalMem>(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo & thread_info, double serial_loop_extents, bool verbose) const ;
1105+ template MemInfoType<GlobalMem> LoopNest::compute_mem_store_info<GlobalMem>(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo * thread_info, double serial_loop_extents, bool verbose) const ;
10851106
1086- template MemInfoType<SharedMem> LoopNest::compute_mem_store_info<SharedMem>(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo & thread_info, double serial_loop_extents, bool verbose) const ;
1107+ template MemInfoType<SharedMem> LoopNest::compute_mem_store_info<SharedMem>(const LoadJacobian &jac, int consumer_innermost_dim, const FunctionDAG::Node *node, const Bound &consumer_store_bounds, const ThreadInfo * thread_info, double serial_loop_extents, bool verbose) const ;
10871108
// Generic compute_mem_load_features. When the producer has already been
// scheduled, it forwards to compute_num_mem_accesses_per_block<T>, passing
// `points_accessed_per_thread` in the `num_requests_per_warp` position.
// The rest of the body is outside this hunk.
// Patch note: `thread_info` is now a pointer and is forwarded as-is.
10881109template <typename T>
1089- void LoopNest::compute_mem_load_features (const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo & thread_info, MemInfoType<T> &mem_info, double points_accessed_per_thread, bool verbose) const {
1110+ void LoopNest::compute_mem_load_features (const LoadJacobian &jac, int producer_innermost_dim, const FunctionDAG::Node *node, const Bound &producer_store_bounds, bool producer_has_been_scheduled, const ThreadInfo * thread_info, MemInfoType<T> &mem_info, double points_accessed_per_thread, bool verbose) const {
10901111 if (producer_has_been_scheduled) {
10911112 compute_num_mem_accesses_per_block<T>(jac, node, producer_store_bounds, thread_info, producer_innermost_dim, points_accessed_per_thread, mem_info, verbose);
10921113
@@ -1115,7 +1136,7 @@ template void LoopNest::compute_mem_load_features<GlobalMem>(const LoadJacobian
11151136 const FunctionDAG::Node *node,
11161137 const Bound &producer_store_bounds,
11171138 bool producer_has_been_scheduled,
1118- const ThreadInfo & thread_info,
1139+ const ThreadInfo * thread_info,
11191140 MemInfoType<GlobalMem> &mem_info,
11201141 double points_accessed_per_thread,
11211142 bool verbose) const ;
@@ -1125,7 +1146,7 @@ template void LoopNest::compute_mem_load_features<SharedMem>(const LoadJacobian
11251146 const FunctionDAG::Node *node,
11261147 const Bound &producer_store_bounds,
11271148 bool producer_has_been_scheduled,
1128- const ThreadInfo & thread_info,
1149+ const ThreadInfo * thread_info,
11291150 MemInfoType<SharedMem> &mem_info,
11301151 double points_accessed_per_thread,
11311152 bool verbose) const ;
@@ -1136,7 +1157,7 @@ void LoopNest::compute_mem_load_features<LocalMem>(const LoadJacobian &jac,
11361157 const FunctionDAG::Node *node,
11371158 const Bound &producer_store_bounds,
11381159 bool producer_has_been_scheduled,
1139- const ThreadInfo & thread_info,
1160+ const ThreadInfo * thread_info,
11401161 MemInfoType<LocalMem> &mem_info,
11411162 double points_accessed_per_thread,
11421163 bool verbose) const {
@@ -2163,11 +2184,13 @@ void LoopNest::compute_features(const FunctionDAG &dag,
21632184 // The store_at location of the consumer
21642185 const auto *consumer_store_site = innermost ? parent : consumer_site.store ;
21652186
// Patch note: this flag records whether inner_serial_loop_extents has been
// filled in. It is set here only on the store path (innermost loop with a
// non-empty store Jacobian); otherwise the vector starts out empty.
2187+ bool inner_serial_loop_extents_computed = false ;
21662188 std::vector<int64_t > inner_serial_loop_extents;
21672189
21682190 if (innermost && !stage->store_jacobian ->empty ()) {
21692191 const auto &bounds = consumer_site.store ->get_bounds (stage->node );
21702192 inner_serial_loop_extents = gpu_loop_info.get_inner_serial_loop_extents (this );
2193+ inner_serial_loop_extents_computed = true ;
21712194 auto store_jac = *stage->store_jacobian ;
21722195
21732196 compute_gpu_store_features (
@@ -2223,10 +2246,16 @@ void LoopNest::compute_features(const FunctionDAG &dag,
// For each load Jacobian of edge `e`, record it as-is in `jacobians` and record
// a thread-loop version in `thread_jacobians`.
22232246 for (const auto &j : e->load_jacobians ) {
22242247 jacobians.emplace_back (j, e->producer );
22252248
// Patch note: if the store path did not already compute the inner serial loop
// extents, compute them here on first need. The flag ensures this runs at most
// once even though the check sits inside the loop. Scalars skip it entirely.
2249+ if (!inner_serial_loop_extents_computed && !is_scalar ()) {
2250+ inner_serial_loop_extents = gpu_loop_info.get_inner_serial_loop_extents (this );
2251+ inner_serial_loop_extents_computed = true ;
2252+ }
2253+
22262254 // Thread loops may not be innermost so in the
22272255 // Jacobians we need to account for the stride
2228- // of the inner loops
2229- thread_jacobians.emplace_back (j * inner_serial_loop_extents, e->producer );
2256+ // of the inner loops (but only for non-scalars,
2257+ // since scalars never have inner serial loops)
2258+ thread_jacobians.emplace_back (is_scalar () ? j : j * inner_serial_loop_extents, e->producer );
22302259 }
22312260 } else {
22322261 // Consumer was inlined. Multiply the Jacobians to look through it.
@@ -2334,7 +2363,7 @@ void LoopNest::compute_features(const FunctionDAG &dag,
23342363 e->producer ,
23352364 producer_store_bounds,
23362365 producer_has_been_scheduled,
2337- * gpu_loop_info.thread_info ,
2366+ gpu_loop_info.thread_info ,
23382367 shared_mem_loads,
23392368 points_accessed,
23402369 verbose);
@@ -2365,7 +2394,7 @@ void LoopNest::compute_features(const FunctionDAG &dag,
23652394 e->producer ,
23662395 producer_store_bounds,
23672396 producer_has_been_scheduled,
2368- * gpu_loop_info.thread_info ,
2397+ gpu_loop_info.thread_info ,
23692398 global_mem_loads,
23702399 points_accessed,
23712400 verbose);
@@ -2405,7 +2434,7 @@ void LoopNest::compute_features(const FunctionDAG &dag,
24052434 e->producer ,
24062435 producer_store_bounds,
24072436 producer_has_been_scheduled,
2408- * gpu_loop_info.thread_info ,
2437+ gpu_loop_info.thread_info ,
24092438 local_mem_loads,
24102439 points_accessed,
24112440 verbose);
@@ -2678,10 +2707,20 @@ void LoopNest::compute_features(const FunctionDAG &dag,
26782707 inlined_feat.outer_parallelism = parallelism;
26792708 inlined_feat.num_blocks = parallelism;
26802709
2681- internal_assert (gpu_loop_info.thread_info );
2682- auto num_warps = it.value () * gpu_loop_info.total_serial_extents () * gpu_loop_info.thread_info ->num_warps_per_block * inlined_feat.num_blocks ;
2683- inlined_feat.num_warps_per_block += num_warps;
2684- inlined_feat.num_threads_per_block += gpu_loop_info.thread_info ->num_threads ;
// Patch note: thread_info is now required only when this loop nest is not a
// scalar. The scalar defaults are it.value() warps and 1 thread.
2710+ internal_assert (is_scalar () || gpu_loop_info.thread_info );
2711+
// NOTE(review): the removed code deduced the type of `num_warps` from the whole
// product, whereas `num_warps_per_block` is deduced from it.value() alone. If
// it.value() is integral and any of the other factors is floating-point, the
// `*=` below converts the result back to the integral type -- TODO confirm the
// operand types. Likewise `num_threads_per_block` is deduced as int from the
// literal 1; confirm that thread_info->num_threads fits in an int.
2712+ auto num_warps_per_block = it.value ();
2713+ auto num_threads_per_block = 1 ;
2714+
2715+ // If the func is being inlined into a scalar, then the scalar will not
2716+ // be surrounded by block/thread/serial loops so there's no need to take
2717+ // them into account when computing these features
2718+ if (!is_scalar ()) {
2719+ num_warps_per_block *= gpu_loop_info.total_serial_extents () * gpu_loop_info.thread_info ->num_warps_per_block * inlined_feat.num_blocks ;
2720+ num_threads_per_block = gpu_loop_info.thread_info ->num_threads ;
2721+ }
2722+ inlined_feat.num_warps_per_block += num_warps_per_block;
2723+ inlined_feat.num_threads_per_block += num_threads_per_block;
26852724 double points_computed_per_thread = it.value () * feat.points_computed_per_thread ;
26862725 inlined_feat.points_computed_per_thread += points_computed_per_thread;
26872726
@@ -2695,9 +2734,9 @@ void LoopNest::compute_features(const FunctionDAG &dag,
26952734
// Patch note: the intermediate features now take their warp and thread counts
// from the locals num_warps_per_block / num_threads_per_block (presumably the
// ones introduced earlier in this patch -- confirm scope) instead of reading
// gpu_loop_info.thread_info directly, so this assignment no longer
// dereferences thread_info.
26962735 intermediate.innermost_pure_loop_extent = feat.innermost_pure_loop_extent ;
26972736 intermediate.outer_parallelism = parallelism;
2698- intermediate.num_warps_per_block = num_warps ;
2737+ intermediate.num_warps_per_block = num_warps_per_block ;
26992738
2700- intermediate.num_threads_per_block = gpu_loop_info. thread_info -> num_threads ;
2739+ intermediate.num_threads_per_block = num_threads_per_block ;
27012740 intermediate.points_computed_per_thread = points_computed_per_thread;
27022741 }
27032742 }
0 commit comments