AliceO2Group · davidrohr · Jan 26, 2021 · Jan 20, 2021
@@ -11,7 +11,6 @@
 #ifndef ALICEO2_TPC_CALARRAY_H_
 #define ALICEO2_TPC_CALARRAY_H_
 
-#include <Vc/Vc>
 #include <memory>
 #include <vector>
 #include <string>

@@ -355,7 +355,7 @@
 
 #define GPUCA_THREAD_COUNT_SCAN 512 // TODO: WARNING!!! Must not be GPUTYPE-dependent right now! // TODO: Fix!
 
-#define GPUCA_LB_GPUTPCCFCheckPadBaseline GPUCA_WARP_SIZE
+#define GPUCA_LB_GPUTPCCFCheckPadBaseline 64 // Fixed value, no longer GPUCA_WARP_SIZE; GPUTPCCFCheckPadBaseline::NumOfCachedTimebins is computed from this macro divided by PadsPerCacheline.
 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap GPUCA_LB_CLUSTER_FINDER
 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits GPUCA_LB_CLUSTER_FINDER
 #define GPUCA_LB_GPUTPCCFChargeMapFiller_findFragmentStart GPUCA_LB_CLUSTER_FINDER

@@ -551,7 +551,7 @@ int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
         checkForNoisyPads &= (rec()->GetParam().rec.noisyPadsQuickCheck ? fragment.index == 0 : true);
 
         if (checkForNoisyPads) {
-          int nBlocks = TPC_PADS_IN_SECTOR / GPUTPCCFCheckPadBaseline::getPadsPerBlock(doGPU);
+          int nBlocks = TPC_PADS_IN_SECTOR / GPUTPCCFCheckPadBaseline::PadsPerCacheline;
           runKernel<GPUTPCCFCheckPadBaseline>(GetGridBlk(nBlocks, lane), {iSlice}, {});
         }
 

@@ -96,7 +96,7 @@ struct GridSize<1> {
 template <>
 struct GridSize<2> {
   enum {
-    Width = 4,
+    Width = 8, // NOTE(review): numerically equals PadsPerCacheline (8), with Height equal to TimebinsPerCacheline (4) - presumably one tile per cache line; confirm.
     Height = 4,
   };
 };

@@ -16,33 +16,45 @@
 #include "PackedCharge.h"
 #include "clusterFinderDefs.h"
 
+#ifndef GPUCA_GPUCODE
+#ifndef GPUCA_NO_VC
+#include <Vc/Vc>
+#else
+#include <array>
+#endif
+#endif
+
 using namespace GPUCA_NAMESPACE::gpu;
 using namespace GPUCA_NAMESPACE::gpu::tpccf;
 
 template <>
 GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer)
 {
-  static_assert(TPC_MAX_FRAGMENT_LEN % NumOfCachedTimebins == 0);
-
+  const CfFragment& fragment = clusterer.mPmemory->fragment;
   Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));
 
+  int basePad = iBlock * PadsPerCacheline;
+  ChargePos basePos = padToChargePos(basePad, clusterer);
+
+  if (not basePos.valid()) {
+    return;
+  }
+
+#ifdef GPUCA_GPUCODE
+  static_assert(TPC_MAX_FRAGMENT_LEN % NumOfCachedTimebins == 0);
+
   int totalCharges = 0;
   int consecCharges = 0;
   int maxConsecCharges = 0;
 
-  int localPadId = iThread / NumOfCachedTimebins;
-  int localTimeBin = iThread % NumOfCachedTimebins;
+  short localPadId = iThread / NumOfCachedTimebins;
+  short localTimeBin = iThread % NumOfCachedTimebins;
   bool handlePad = localTimeBin == 0;
-  int basePad = iBlock * PadsPerBlock;
-
-  CfFragment& fragment = clusterer.mPmemory->fragment;
 
-  ChargePos basePos = padToChargePos(basePad + localPadId, clusterer);
-
-  for (tpccf::TPCFragmentTime t = localTimeBin + fragment.firstNonOverlapTimeBin(); t < fragment.lastNonOverlapTimeBin(); t += NumOfCachedTimebins) {
-    ChargePos pos = basePos.delta({0, t});
+  for (tpccf::TPCFragmentTime t = fragment.firstNonOverlapTimeBin(); t < fragment.lastNonOverlapTimeBin(); t += NumOfCachedTimebins) {
+    ChargePos pos = basePos.delta({localPadId, short(t + localTimeBin)});
     smem.charges[localPadId][localTimeBin] = (pos.valid()) ? chargeMap[pos].unpack() : 0;
-    GPUbarrierWarp();
+    GPUbarrier();
     if (handlePad) {
       for (int i = 0; i < NumOfCachedTimebins; i++) {
         Charge q = smem.charges[localPadId][i];
@@ -53,28 +65,93 @@ GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int nBlocks, int nThreads, int i
     }
   }
 
-  GPUbarrierWarp();
+  GPUbarrier();
 
   if (handlePad) {
-    int totalChargesBaseline = clusterer.Param().rec.maxTimeBinAboveThresholdIn1000Bin * fragment.lengthWithoutOverlap() / 1000;
-    int consecChargesBaseline = clusterer.Param().rec.maxConsecTimeBinAboveThreshold;
-    bool hasLostBaseline = (totalChargesBaseline > 0 && totalCharges >= totalChargesBaseline) || (consecChargesBaseline > 0 && maxConsecCharges >= consecChargesBaseline);
-    clusterer.mPpadHasLostBaseline[basePad + localPadId] |= hasLostBaseline;
+    updatePadBaseline(basePad + localPadId, clusterer, totalCharges, maxConsecCharges);
+  }
+
+#else // CPU CODE
+
+  constexpr size_t ElemsInTileRow = TilingLayout<GridSize<2>>::WidthInTiles * TimebinsPerCacheline * PadsPerCacheline;
+
+#ifndef GPUCA_NO_VC
+  using UShort8 = Vc::fixed_size_simd<unsigned short, PadsPerCacheline>;
+
+  UShort8 totalCharges{Vc::Zero};
+  UShort8 consecCharges{Vc::Zero};
+  UShort8 maxConsecCharges{Vc::Zero};
+#else
+  std::array<unsigned short, PadsPerCacheline> totalCharges{0};
+  std::array<unsigned short, PadsPerCacheline> consecCharges{0};
+  std::array<unsigned short, PadsPerCacheline> maxConsecCharges{0};
+#endif
+
+  tpccf::TPCFragmentTime t = fragment.firstNonOverlapTimeBin();
+  const unsigned short* charge = reinterpret_cast<unsigned short*>(&chargeMap[basePos.delta({0, t})]);
+
+  for (; t < fragment.lastNonOverlapTimeBin(); t += TimebinsPerCacheline) {
+    for (tpccf::TPCFragmentTime localtime = 0; localtime < TimebinsPerCacheline; localtime++) {
+#ifndef GPUCA_NO_VC
+      UShort8 charges{charge + PadsPerCacheline * localtime, Vc::Aligned};
+
+      UShort8::mask_type isCharge = charges != 0;
+
+      if (isCharge.isNotEmpty()) {
+        totalCharges(isCharge)++;
+        consecCharges += 1;
+        consecCharges(not isCharge) = 0;
+        maxConsecCharges = Vc::max(consecCharges, maxConsecCharges);
+      } else {
+        consecCharges = 0;
+      }
+#else // Vc not available
+      for (tpccf::Pad localpad = 0; localpad < PadsPerCacheline; localpad++) {
+        bool isCharge = charge[PadsPerCacheline * localtime + localpad] != 0;
+        if (isCharge) {
+          totalCharges[localpad]++;
+          consecCharges[localpad]++;
+          maxConsecCharges[localpad] = CAMath::Max(maxConsecCharges[localpad], consecCharges[localpad]);
+        } else {
+          consecCharges[localpad] = 0;
+        }
+      }
+#endif
+    }
+
+    charge += ElemsInTileRow;
+  }
+
+  for (tpccf::Pad localpad = 0; localpad < PadsPerCacheline; localpad++) {
+    updatePadBaseline(basePad + localpad, clusterer, totalCharges[localpad], maxConsecCharges[localpad]);
   }
+#endif
 }
 
-GPUd() ChargePos GPUTPCCFCheckPadBaseline::padToChargePos(int pad, const GPUTPCClusterFinder& clusterer)
+GPUd() ChargePos GPUTPCCFCheckPadBaseline::padToChargePos(int& pad, const GPUTPCClusterFinder& clusterer) // Maps a linear pad index (counted across rows) to the ChargePos at time 0 of the pad group, aligned down to a multiple of PadsPerCacheline within its row. `pad` is in/out: it is moved back to the first pad of that group. Returns {0, 0, INVALID_TIME_BIN} if no row matches.
 {
   const GPUTPCGeometry& geo = clusterer.Param().tpcGeometry;
 
   int padOffset = 0;
   for (Row r = 0; r < TPC_NUM_OF_ROWS; r++) {
+    int npads = geo.NPads(r); // Queried once per row and reused for both the range test and the offset update.
     int padInRow = pad - padOffset;
-    if (0 <= padInRow && padInRow < geo.NPads(r)) {
-      return ChargePos{r, Pad(padInRow), 0};
+    if (0 <= padInRow && padInRow < CAMath::nextMultipleOf<PadsPerCacheline, int>(npads)) { // Accepts indices up to the row's pad count rounded up to a multiple of PadsPerCacheline (going by the helper's name).
+      int cachelineOffset = padInRow % PadsPerCacheline; // Distance of the pad from the start of its group; non-negative because padInRow >= 0 here.
+      pad -= cachelineOffset; // Written back to the caller so its pad index refers to the first pad of the group.
+      return ChargePos{r, Pad(padInRow - cachelineOffset), 0};
     }
-    padOffset += geo.NPads(r);
+    padOffset += npads; // NOTE(review): advances by the unrounded pad count while the test above uses the rounded-up count, so the first pads of the next row match this row first - confirm this is intended.
   }
 
   return ChargePos{0, 0, INVALID_TIME_BIN};
 }
+
+GPUd() void GPUTPCCFCheckPadBaseline::updatePadBaseline(int pad, const GPUTPCClusterFinder& clusterer, int totalCharges, int consecCharges) // Sets the lost-baseline flag of `pad` when totalCharges or consecCharges reaches its configured limit.
+{
+  const CfFragment& fragment = clusterer.mPmemory->fragment;
+  int totalChargesBaseline = clusterer.Param().rec.maxTimeBinAboveThresholdIn1000Bin * fragment.lengthWithoutOverlap() / 1000; // Limit given per 1000 timebins, scaled to this fragment's length without overlap.
+  int consecChargesBaseline = clusterer.Param().rec.maxConsecTimeBinAboveThreshold;
+  bool hasLostBaseline = (totalChargesBaseline > 0 && totalCharges >= totalChargesBaseline) || (consecChargesBaseline > 0 && consecCharges >= consecChargesBaseline); // A limit <= 0 disables the corresponding check.
+  clusterer.mPpadHasLostBaseline[pad] |= hasLostBaseline; // OR-accumulates: a flag that is already set is never cleared here.
+}
@@ -25,23 +25,15 @@ namespace GPUCA_NAMESPACE::gpu
 class GPUTPCCFCheckPadBaseline : public GPUKernelTemplate
 {
 
- private:
-  // Only use these constants on device side...
-  // Use getPadsPerBlock() for host side
+ public:
   enum {
-    PadsPerBlockGPU = 4, // Number of pads in a single cache line
-    PadsPerBlockCPU = 1,
-#ifdef GPUCA_GPUCODE
-    PadsPerBlock = PadsPerBlockGPU,
-#else
-    PadsPerBlock = PadsPerBlockCPU,
-#endif
-    NumOfCachedTimebins = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFCheckPadBaseline) / PadsPerBlock,
+    PadsPerCacheline = 8, // Pads handled by one block; the kernel computes basePad as iBlock * PadsPerCacheline and the host divides TPC_PADS_IN_SECTOR by it to get the block count.
+    TimebinsPerCacheline = 4, // Number of timebins the CPU path consumes per outer-loop step.
+    NumOfCachedTimebins = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFCheckPadBaseline) / PadsPerCacheline, // Second dimension of GPUSharedMemory::charges; presumably threads per block divided by pads per block - confirm GPUCA_GET_THREAD_COUNT.
   };
 
- public:
   struct GPUSharedMemory {
-    tpccf::Charge charges[PadsPerBlock][NumOfCachedTimebins];
+    tpccf::Charge charges[PadsPerCacheline][NumOfCachedTimebins];
   };
 
 #ifdef HAVE_O2HEADERS
@@ -57,17 +49,12 @@ class GPUTPCCFCheckPadBaseline : public GPUKernelTemplate
     return GPUDataTypes::RecoStep::TPCClusterFinding;
   }
 
-  // Use this to get num of pads per block on host side. Can't use constant there.
-  static int getPadsPerBlock(bool isGPU)
-  {
-    return (isGPU) ? PadsPerBlockGPU : PadsPerBlockCPU;
-  }
-
   template <int iKernel = defaultKernel>
   GPUd() static void Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer);
 
  private:
-  GPUd() static ChargePos padToChargePos(int pad, const GPUTPCClusterFinder&);
+  GPUd() static ChargePos padToChargePos(int& pad, const GPUTPCClusterFinder&);
+  GPUd() static void updatePadBaseline(int pad, const GPUTPCClusterFinder&, int totalCharges, int consecCharges);
 };
 
 } // namespace GPUCA_NAMESPACE::gpu

@@ -50,6 +50,7 @@ class PackedCharge
   GPUdi() tpccf::Charge unpack() const { return tpccf::Charge(mVal & ChargeMask) / tpccf::Charge(1 << DecimalBits); }
   GPUdi() bool has3x3Peak() const { return mVal & Has3x3PeakMask; }
   GPUdi() bool isSplit() const { return mVal & IsSplitMask; }
+  GPUdi() bool isZero() const { return mVal == 0; } // True only when the whole packed value is 0, i.e. the charge bits and the flag bits are all clear.
 
  private:
   BasicType mVal;

@@ -39,12 +39,15 @@ using ulong = unsigned long;
 #define SCRATCH_PAD_NOISE_N 16
 #endif
 
-#define PADDING_PAD 2
-#define PADDING_TIME 3
+// A padding of 2 (pad) and 3 (time) would be enough. The larger values ensure that rows
+// are always aligned along cache lines; TPC_PADS_PER_ROW is raised from 138 to 144 for the same reason.
+#define PADDING_PAD 8
+#define PADDING_TIME 4
+#define TPC_PADS_PER_ROW 144
+
 #define TPC_SECTORS 36
 #define TPC_ROWS_PER_CRU 18
 #define TPC_NUM_OF_ROWS 152
-#define TPC_PADS_PER_ROW 138
 #define TPC_PADS_PER_ROW_PADDED (TPC_PADS_PER_ROW + PADDING_PAD)
 #define TPC_NUM_OF_PADS (TPC_NUM_OF_ROWS * TPC_PADS_PER_ROW_PADDED + PADDING_PAD)
 #define TPC_PADS_IN_SECTOR 14560