Various cleanup and consolidation.

rchen20 · rchen20 · commit 31cd85cdf82a · 2025-03-28T12:37:36.000-07:00
diff --git a/src/apps/FEMSWEEP.cpp b/src/apps/FEMSWEEP.cpp
@@ -14,6 +14,7 @@
 #include "common/DataUtils.hpp"
 
 #include <algorithm>
+#include <cmath>
 
 namespace rajaperf
 {
@@ -24,9 +25,9 @@ namespace apps
 FEMSWEEP::FEMSWEEP(const RunParams& params)
   : KernelBase(rajaperf::Apps_FEMSWEEP, params)
 {
-  m_ne = params.getFemSweepNumE();
-  m_na = params.getFemSweepNumA();
-  m_ng = params.getFemSweepNumG();
+  m_ne = 15 * 15 * 15;
+  m_na = 72;
+  m_ng = 128;
 
   setDefaultProblemSize(ND * m_ne * m_ng * m_na);
   setDefaultReps(1);
@@ -54,15 +55,15 @@ FEMSWEEP::FEMSWEEP(const RunParams& params)
   setBytesAtomicModifyWrittenPerRep( 0 );
 
   // This is an estimate of the upper bound FLOPs.
-  setFLOPsPerRep( (ND * ND * (ND-1) * 3 * 2 + // L & U formation
-                  ND * (ND-1) * 3 +             // forward substitution
-                  ND * (ND-1) * 3) *            // backward substitution
-                  m_ne +                            // matrix solve performed per element
-                  m_ne * NLF * FDS);                // coupling between sides of faces
+  setFLOPsPerRep( (ND * ND * (ND-1) * 3 * 2 +       // L & U formation
+                  ND * (ND-1) * 3 +                 // forward substitution
+                  ND * (ND-1) * 3 +                 // backward substitution
+                  NLF * FDS - std::pow(m_ne, 2.0/3.0) * 6) * // face coupling; 2.0/3.0 because integer 2/3 == 0
+                  m_ne * m_na * m_ng );             // for all elements, angles, and groups
 
   checksum_scale_factor = 1.0;
 
-  setComplexity(Complexity::N_to_the_four);
+  setComplexity(Complexity::N);
 
   setUsesFeature(Launch);
   //setUsesFeature(View);
@@ -95,27 +96,16 @@ void FEMSWEEP::setUp(VariantID vid, size_t RAJAPERF_UNUSED_ARG(tune_idx))
 
   // Some of the constants are properties of the mesh.
   // Will need to derive these when mesh generator is available.
-  allocData(m_nhpaa_r, m_na       , vid);
-  allocData(m_ohpaa_r, m_na       , vid);
-  allocData(m_phpaa_r, m_na * 43  , vid);
-  allocData(m_order_r, m_na * m_ne, vid);
-
-  allocData(m_AngleElem2FaceType, NLF * m_ne * m_na , vid);
-  allocData(m_elem_to_faces     , NLF * m_ne        , vid);
-  allocData(m_F_g2l             , 10800             , vid);
-  allocData(m_idx1              , 37800             , vid);
-  allocData(m_idx2              , 37800             , vid);
-
-  copyDataH2Space(m_nhpaa_r, g_nhpaa_r, m_na        , vid);
-  copyDataH2Space(m_ohpaa_r, g_ohpaa_r, m_na        , vid);
-  copyDataH2Space(m_phpaa_r, g_phpaa_r, m_na * 43   , vid);
-  copyDataH2Space(m_order_r, g_order_r, m_na * m_ne , vid);
-
-  copyDataH2Space(m_AngleElem2FaceType, g_AngleElem2FaceType, NLF * m_ne * m_na , vid);
-  copyDataH2Space(m_elem_to_faces     , g_elem_to_faces     , NLF * m_ne        , vid);
-  copyDataH2Space(m_F_g2l             , g_F_g2l             , 10800             , vid);
-  copyDataH2Space(m_idx1              , g_idx1              , 37800             , vid);
-  copyDataH2Space(m_idx2              , g_idx2              , 37800             , vid);
+  allocAndCopyHostData(m_nhpaa_r, g_nhpaa_r, m_na       , vid);
+  allocAndCopyHostData(m_ohpaa_r, g_ohpaa_r, m_na       , vid);
+  allocAndCopyHostData(m_phpaa_r, g_phpaa_r, m_na * 43  , vid);
+  allocAndCopyHostData(m_order_r, g_order_r, m_na * m_ne, vid);
+
+  allocAndCopyHostData(m_AngleElem2FaceType, g_AngleElem2FaceType, NLF * m_ne * m_na , vid);
+  allocAndCopyHostData(m_elem_to_faces     , g_elem_to_faces     , NLF * m_ne        , vid);
+  allocAndCopyHostData(m_F_g2l             , g_F_g2l             , 10800             , vid);
+  allocAndCopyHostData(m_idx1              , g_idx1              , 37800             , vid);
+  allocAndCopyHostData(m_idx2              , g_idx2              , 37800             , vid);
 }
 
 void FEMSWEEP::updateChecksum(VariantID vid, size_t tune_idx)
diff --git a/src/apps/FEMSWEEP.hpp b/src/apps/FEMSWEEP.hpp
@@ -19,7 +19,7 @@
 ///   double A[ND * ND], b[ND];
 ///   // This factor helps maintain stability in the solution of the matrix solve
 ///   // by eliminating the perturbation of the right-hand side.
-///   double Ffactor = std::max(std::sin(Adat[order_r[a*ne]*ND*ND + a*ne*ND*ND]) - 2.0, 0.0); \
+///   double Ffactor = fmax(sin(Adat[order_r[a*ne]*ND*ND + a*ne*ND*ND]) - 2.0, 0.0); \
 ///   for (int hp = 0; hp < nhp; ++hp) // loop over hyperplanes
 ///   {
 ///      // number of element in this hyperplane
@@ -112,13 +112,14 @@ constexpr int FDS = 4;  // number of DOFs per face
   Index_ptr F_g2l              = m_F_g2l             ; \
   Index_ptr idx1               = m_idx1              ; \
   Index_ptr idx2               = m_idx2              ; \
-  
+
+ 
 #define FEMSWEEP_KERNEL \
   const int a = ag / ng, g = ag % ng; \
   const int nhp = nhpaa_r[a], ohp = ohpaa_r[a]; \
   int s_nehp_done = 0; \
   double A[ND * ND], b[ND]; \
-  double Ffactor = std::max(std::sin(Adat[order_r[a*ne]*ND*ND + a*ne*ND*ND]) - 2.0, 0.0); \
+  double Ffactor = fmax(sin(Adat[order_r[a*ne]*ND*ND + a*ne*ND*ND]) - 2.0, 0.0); \
   for (int hp = 0; hp < nhp; ++hp) \
   { \
      const int nehp = phpaa_r[ohp + hp]; \
@@ -293,7 +294,11 @@ class FEMSWEEP : public KernelBase
   void runHipVariantImpl(VariantID vid);
 
 private:
+#if defined(RAJA_ENABLE_HIP)
+  static const size_t default_gpu_block_size = 64;
+#else
   static const size_t default_gpu_block_size = 128;
+#endif
   using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size,
                                                          integer::MultipleOf<32>>;
 
@@ -336,6 +341,18 @@ class FEMSWEEP : public KernelBase
   Index_type m_F_g2llen;
   Index_type m_idx1len;
   Index_type m_idx2len;
+
+  // Mesh data: static tables sized for the fixed mesh (72 angles, 15*15*15 elements)
+  static Index_type g_nhpaa_r[72];
+  static Index_type g_ohpaa_r[72];
+  static Index_type g_phpaa_r[3096];
+  static Index_type g_order_r[243000];
+
+  static Index_type g_AngleElem2FaceType[1458000];
+  static Index_type g_elem_to_faces[20250]     ;
+  static Index_type g_F_g2l[10800]             ;
+  static Index_type g_idx1[37800]              ;
+  static Index_type g_idx2[37800]              ;
 };
 
 } // end namespace apps