diff --git a/Detectors/TPC/base/include/TPCBase/CalArray.h b/Detectors/TPC/base/include/TPCBase/CalArray.h
index 6cab619159606..af71fb503a1f1 100644
--- a/Detectors/TPC/base/include/TPCBase/CalArray.h
+++ b/Detectors/TPC/base/include/TPCBase/CalArray.h
@@ -11,7 +11,6 @@
 #ifndef ALICEO2_TPC_CALARRAY_H_
 #define ALICEO2_TPC_CALARRAY_H_

-#include <Vc/Vc>
 #include <memory>
 #include <vector>
 #include <string>
diff --git a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
index c7bc9281e1c29..022d4b48b0f49 100644
--- a/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
+++ b/GPU/GPUTracking/Definitions/GPUDefGPUParameters.h
@@ -355,7 +355,7 @@
 #define GPUCA_THREAD_COUNT_SCAN 512
 // TODO: WARNING!!! Must not be GPUTYPE-dependent right now!
 // TODO: Fix!
-#define GPUCA_LB_GPUTPCCFCheckPadBaseline GPUCA_WARP_SIZE
+#define GPUCA_LB_GPUTPCCFCheckPadBaseline 64
 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillIndexMap GPUCA_LB_CLUSTER_FINDER
 #define GPUCA_LB_GPUTPCCFChargeMapFiller_fillFromDigits GPUCA_LB_CLUSTER_FINDER
 #define GPUCA_LB_GPUTPCCFChargeMapFiller_findFragmentStart GPUCA_LB_CLUSTER_FINDER
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index 0aee2ee3287a1..310919ebde790 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -551,7 +551,7 @@ int GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
       checkForNoisyPads &= (rec()->GetParam().rec.noisyPadsQuickCheck ? fragment.index == 0 : true);

       if (checkForNoisyPads) {
-        int nBlocks = TPC_PADS_IN_SECTOR / GPUTPCCFCheckPadBaseline::getPadsPerBlock(doGPU);
+        int nBlocks = TPC_PADS_IN_SECTOR / GPUTPCCFCheckPadBaseline::PadsPerCacheline;
         runKernel<GPUTPCCFCheckPadBaseline>(GetGridBlk(nBlocks, lane), {iSlice}, {});
       }
diff --git a/GPU/GPUTracking/TPCClusterFinder/Array2D.h b/GPU/GPUTracking/TPCClusterFinder/Array2D.h
index 07964d4031c63..662393f5f72ac 100644
--- a/GPU/GPUTracking/TPCClusterFinder/Array2D.h
+++ b/GPU/GPUTracking/TPCClusterFinder/Array2D.h
@@ -96,7 +96,7 @@ struct GridSize<1> {
 template <>
 struct GridSize<2> {
   enum {
-    Width = 4,
+    Width = 8,
     Height = 4,
   };
 };
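To see why GridSize<2>::Width grows from 4 to 8: the charge map stores 16-bit PackedCharge entries in tiles of Width x Height, so an 8x4 tile covers exactly one 64-byte cache line. A minimal sketch of that arithmetic; the constant names are illustrative and the 64-byte line size is an assumption about the target CPUs, not something the patch itself asserts:

// Sketch only: one 8x4 tile of 16-bit packed charges spans one cache line.
constexpr int TileWidth = 8;      // pads, GridSize<2>::Width after this patch
constexpr int TileHeight = 4;     // time bins, GridSize<2>::Height
constexpr int BytesPerCharge = 2; // PackedCharge wraps an unsigned short
static_assert(TileWidth * TileHeight * BytesPerCharge == 64, "tile == cache line");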
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx
index d78ddf4a5e083..b4c3656841042 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.cxx
@@ -16,33 +16,45 @@
 #include "PackedCharge.h"
 #include "clusterFinderDefs.h"

+#ifndef GPUCA_GPUCODE
+#ifndef GPUCA_NO_VC
+#include <Vc/Vc>
+#else
+#include <array>
+#endif
+#endif
+
 using namespace GPUCA_NAMESPACE::gpu;
 using namespace GPUCA_NAMESPACE::gpu::tpccf;

 template <>
 GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer)
 {
-  static_assert(TPC_MAX_FRAGMENT_LEN % NumOfCachedTimebins == 0);
-
+  const CfFragment& fragment = clusterer.mPmemory->fragment;
   Array2D<PackedCharge> chargeMap(reinterpret_cast<PackedCharge*>(clusterer.mPchargeMap));

+  int basePad = iBlock * PadsPerCacheline;
+  ChargePos basePos = padToChargePos(basePad, clusterer);
+
+  if (not basePos.valid()) {
+    return;
+  }
+
+#ifdef GPUCA_GPUCODE
+  static_assert(TPC_MAX_FRAGMENT_LEN % NumOfCachedTimebins == 0);
+
   int totalCharges = 0;
   int consecCharges = 0;
   int maxConsecCharges = 0;

-  int localPadId = iThread / NumOfCachedTimebins;
-  int localTimeBin = iThread % NumOfCachedTimebins;
+  short localPadId = iThread / NumOfCachedTimebins;
+  short localTimeBin = iThread % NumOfCachedTimebins;
   bool handlePad = localTimeBin == 0;

-  int basePad = iBlock * PadsPerBlock;
-
-  CfFragment& fragment = clusterer.mPmemory->fragment;
-  ChargePos basePos = padToChargePos(basePad + localPadId, clusterer);
-
-  for (tpccf::TPCFragmentTime t = localTimeBin + fragment.firstNonOverlapTimeBin(); t < fragment.lastNonOverlapTimeBin(); t += NumOfCachedTimebins) {
-    ChargePos pos = basePos.delta({0, t});
+  for (tpccf::TPCFragmentTime t = fragment.firstNonOverlapTimeBin(); t < fragment.lastNonOverlapTimeBin(); t += NumOfCachedTimebins) {
+    ChargePos pos = basePos.delta({localPadId, short(t + localTimeBin)});
     smem.charges[localPadId][localTimeBin] = (pos.valid()) ? chargeMap[pos].unpack() : 0;
-    GPUbarrierWarp();
+    GPUbarrier();
     if (handlePad) {
       for (int i = 0; i < NumOfCachedTimebins; i++) {
         Charge q = smem.charges[localPadId][i];
@@ -53,28 +65,93 @@ GPUd() void GPUTPCCFCheckPadBaseline::Thread<0>(int nBlocks, int nThreads, int i
     }
   }

-  GPUbarrierWarp();
+  GPUbarrier();

   if (handlePad) {
-    int totalChargesBaseline = clusterer.Param().rec.maxTimeBinAboveThresholdIn1000Bin * fragment.lengthWithoutOverlap() / 1000;
-    int consecChargesBaseline = clusterer.Param().rec.maxConsecTimeBinAboveThreshold;
-    bool hasLostBaseline = (totalChargesBaseline > 0 && totalCharges >= totalChargesBaseline) || (consecChargesBaseline > 0 && maxConsecCharges >= consecChargesBaseline);
-    clusterer.mPpadHasLostBaseline[basePad + localPadId] |= hasLostBaseline;
+    updatePadBaseline(basePad + localPadId, clusterer, totalCharges, maxConsecCharges);
+  }
+
+#else // CPU CODE
+
+  constexpr size_t ElemsInTileRow = TilingLayout<GridSize<2>>::WidthInTiles * TimebinsPerCacheline * PadsPerCacheline;
+
+#ifndef GPUCA_NO_VC
+  using UShort8 = Vc::fixed_size_simd<unsigned short, PadsPerCacheline>;
+
+  UShort8 totalCharges{Vc::Zero};
+  UShort8 consecCharges{Vc::Zero};
+  UShort8 maxConsecCharges{Vc::Zero};
+#else
+  std::array<unsigned short, PadsPerCacheline> totalCharges{0};
+  std::array<unsigned short, PadsPerCacheline> consecCharges{0};
+  std::array<unsigned short, PadsPerCacheline> maxConsecCharges{0};
+#endif
+
+  tpccf::TPCFragmentTime t = fragment.firstNonOverlapTimeBin();
+  const unsigned short* charge = reinterpret_cast<const unsigned short*>(&chargeMap[basePos.delta({0, t})]);
+
+  for (; t < fragment.lastNonOverlapTimeBin(); t += TimebinsPerCacheline) {
+    for (tpccf::TPCFragmentTime localtime = 0; localtime < TimebinsPerCacheline; localtime++) {
+#ifndef GPUCA_NO_VC
+      UShort8 charges{charge + PadsPerCacheline * localtime, Vc::Aligned};
+
+      UShort8::mask_type isCharge = charges != 0;
+
+      if (isCharge.isNotEmpty()) {
+        totalCharges(isCharge)++;
+        consecCharges += 1;
+        consecCharges(not isCharge) = 0;
+        maxConsecCharges = Vc::max(consecCharges, maxConsecCharges);
+      } else {
+        consecCharges = 0;
+      }
+#else // Vc not available
+      for (tpccf::Pad localpad = 0; localpad < PadsPerCacheline; localpad++) {
+        bool isCharge = charge[PadsPerCacheline * localtime + localpad] != 0;
+        if (isCharge) {
+          totalCharges[localpad]++;
+          consecCharges[localpad]++;
+          maxConsecCharges[localpad] = CAMath::Max(maxConsecCharges[localpad], consecCharges[localpad]);
+        } else {
+          consecCharges[localpad] = 0;
+        }
+      }
+#endif
+    }

+    charge += ElemsInTileRow;
+  }
+
+  for (tpccf::Pad localpad = 0; localpad < PadsPerCacheline; localpad++) {
+    updatePadBaseline(basePad + localpad, clusterer, totalCharges[localpad], maxConsecCharges[localpad]);
   }
+#endif
 }

-GPUd() ChargePos GPUTPCCFCheckPadBaseline::padToChargePos(int pad, const GPUTPCClusterFinder& clusterer)
+GPUd() ChargePos GPUTPCCFCheckPadBaseline::padToChargePos(int& pad, const GPUTPCClusterFinder& clusterer)
 {
   const GPUTPCGeometry& geo = clusterer.Param().tpcGeometry;

   int padOffset = 0;
   for (Row r = 0; r < TPC_NUM_OF_ROWS; r++) {
+    int npads = geo.NPads(r);
     int padInRow = pad - padOffset;
-    if (0 <= padInRow && padInRow < geo.NPads(r)) {
-      return ChargePos{r, Pad(padInRow), 0};
+    if (0 <= padInRow && padInRow < CAMath::nextMultipleOf<PadsPerCacheline>(npads)) {
+      int cachelineOffset = padInRow % PadsPerCacheline;
+      pad -= cachelineOffset;
+      return ChargePos{r, Pad(padInRow - cachelineOffset), 0};
     }
-    padOffset += geo.NPads(r);
+    padOffset += npads;
   }

   return ChargePos{0, 0, INVALID_TIME_BIN};
 }
+
+GPUd() void GPUTPCCFCheckPadBaseline::updatePadBaseline(int pad, const GPUTPCClusterFinder& clusterer, int totalCharges, int consecCharges)
+{
+  const CfFragment& fragment = clusterer.mPmemory->fragment;
+  int totalChargesBaseline = clusterer.Param().rec.maxTimeBinAboveThresholdIn1000Bin * fragment.lengthWithoutOverlap() / 1000;
+  int consecChargesBaseline = clusterer.Param().rec.maxConsecTimeBinAboveThreshold;
+  bool hasLostBaseline = (totalChargesBaseline > 0 && totalCharges >= totalChargesBaseline) || (consecChargesBaseline > 0 && consecCharges >= consecChargesBaseline);
+  clusterer.mPpadHasLostBaseline[pad] |= hasLostBaseline;
+}
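The CPU branch above relies on Vc's write-masked updates to keep the counters for all eight pads of a cache line in SIMD registers. Below is a self-contained sketch of that pattern for a single time bin, assuming Vc is available; the charge values are made up and the names only mirror the patch:

#include <Vc/Vc>
#include <cstdio>

using UShort8 = Vc::fixed_size_simd<unsigned short, 8>;

int main()
{
  // One cache line worth of charges: 8 pads at a single time bin (made-up values).
  alignas(64) unsigned short charge[8] = {0, 5, 0, 7, 9, 0, 1, 3};

  UShort8 totalCharges{Vc::Zero};
  UShort8 consecCharges{Vc::Zero};
  UShort8 maxConsecCharges{Vc::Zero};

  UShort8 charges{charge, Vc::Aligned};
  UShort8::mask_type isCharge = charges != 0;

  if (isCharge.isNotEmpty()) {
    totalCharges(isCharge)++;        // bump only lanes that saw a charge
    consecCharges += 1;
    consecCharges(not isCharge) = 0; // reset the streak on empty lanes
    maxConsecCharges = Vc::max(consecCharges, maxConsecCharges);
  } else {
    consecCharges = 0;               // no lane saw a charge at all
  }

  for (int pad = 0; pad < 8; pad++) {
    std::printf("pad %d: total=%d maxConsec=%d\n", pad, int(totalCharges[pad]), int(maxConsecCharges[pad]));
  }
  return 0;
}

Only a single iteration is shown; the kernel repeats these updates for every time bin while stepping the charge pointer through the map one cache line of pads at a time.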
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h
index 0a0efee0c957c..039e64738765a 100644
--- a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h
+++ b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h
@@ -25,23 +25,15 @@ namespace GPUCA_NAMESPACE::gpu

 class GPUTPCCFCheckPadBaseline : public GPUKernelTemplate
 {
- private:
-  // Only use these constants on device side...
-  // Use getPadsPerBlock() for host side
+ public:
   enum {
-    PadsPerBlockGPU = 4, // Number of pads in a single cache line
-    PadsPerBlockCPU = 1,
-#ifdef GPUCA_GPUCODE
-    PadsPerBlock = PadsPerBlockGPU,
-#else
-    PadsPerBlock = PadsPerBlockCPU,
-#endif
-    NumOfCachedTimebins = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFCheckPadBaseline) / PadsPerBlock,
+    PadsPerCacheline = 8,
+    TimebinsPerCacheline = 4,
+    NumOfCachedTimebins = GPUCA_GET_THREAD_COUNT(GPUCA_LB_GPUTPCCFCheckPadBaseline) / PadsPerCacheline,
   };

- public:
   struct GPUSharedMemory {
-    tpccf::Charge charges[PadsPerBlock][NumOfCachedTimebins];
+    tpccf::Charge charges[PadsPerCacheline][NumOfCachedTimebins];
   };

 #ifdef HAVE_O2HEADERS
@@ -57,17 +49,12 @@ class GPUTPCCFCheckPadBaseline : public GPUKernelTemplate
     return GPUDataTypes::RecoStep::TPCClusterFinding;
   }

-  // Use this to get num of pads per block on host side. Can't use constant there.
-  static int getPadsPerBlock(bool isGPU)
-  {
-    return (isGPU) ? PadsPerBlockGPU : PadsPerBlockCPU;
-  }
-
   template <int iKernel = defaultKernel>
   GPUd() static void Thread(int nBlocks, int nThreads, int iBlock, int iThread, GPUSharedMemory& smem, processorType& clusterer);

  private:
-  GPUd() static ChargePos padToChargePos(int pad, const GPUTPCClusterFinder&);
+  GPUd() static ChargePos padToChargePos(int& pad, const GPUTPCClusterFinder&);
+  GPUd() static void updatePadBaseline(int pad, const GPUTPCClusterFinder&, int totalCharges, int consecCharges);
 };

 } // namespace GPUCA_NAMESPACE::gpu
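With the launch bound fixed at 64 threads and PadsPerCacheline = 8, NumOfCachedTimebins works out to 8: on the GPU each block covers one cache line of pads, and each thread caches one time-bin slot for one pad. A plain host-side sketch of the resulting iThread mapping, for illustration only (the constants are copied from the defines above):

#include <cstdio>

constexpr int ThreadCount = 64;      // GPUCA_LB_GPUTPCCFCheckPadBaseline
constexpr int PadsPerCacheline = 8;
constexpr int NumOfCachedTimebins = ThreadCount / PadsPerCacheline; // == 8

int main()
{
  for (int iThread = 0; iThread < ThreadCount; iThread++) {
    int localPadId = iThread / NumOfCachedTimebins;   // 0..7: pad within the cache line
    int localTimeBin = iThread % NumOfCachedTimebins; // 0..7: slot in smem.charges[localPadId][]
    bool handlePad = localTimeBin == 0;               // one thread per pad reduces its cached charges
    std::printf("thread %2d -> pad %d, slot %d%s\n", iThread, localPadId, localTimeBin, handlePad ? " (reducer)" : "");
  }
  return 0;
}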
diff --git a/GPU/GPUTracking/TPCClusterFinder/PackedCharge.h b/GPU/GPUTracking/TPCClusterFinder/PackedCharge.h
index e610508b1864b..8068478c00879 100644
--- a/GPU/GPUTracking/TPCClusterFinder/PackedCharge.h
+++ b/GPU/GPUTracking/TPCClusterFinder/PackedCharge.h
@@ -50,6 +50,7 @@ class PackedCharge
   GPUdi() tpccf::Charge unpack() const { return tpccf::Charge(mVal & ChargeMask) / tpccf::Charge(1 << DecimalBits); }
   GPUdi() bool has3x3Peak() const { return mVal & Has3x3PeakMask; }
   GPUdi() bool isSplit() const { return mVal & IsSplitMask; }
+  GPUdi() bool isZero() const { return mVal == 0; }

  private:
   BasicType mVal;
diff --git a/GPU/GPUTracking/TPCClusterFinder/clusterFinderDefs.h b/GPU/GPUTracking/TPCClusterFinder/clusterFinderDefs.h
index f5b7c233b0b91..d963b8468e1d2 100644
--- a/GPU/GPUTracking/TPCClusterFinder/clusterFinderDefs.h
+++ b/GPU/GPUTracking/TPCClusterFinder/clusterFinderDefs.h
@@ -39,12 +39,15 @@ using ulong = unsigned long;
 #define SCRATCH_PAD_NOISE_N 16
 #endif

-#define PADDING_PAD 2
-#define PADDING_TIME 3
+// Padding of 2 and 3 respectively would be enough. But this ensures that
+// rows are always aligned along cache lines. Likewise for TPC_PADS_PER_ROW.
+#define PADDING_PAD 8
+#define PADDING_TIME 4
+#define TPC_PADS_PER_ROW 144
+
 #define TPC_SECTORS 36
 #define TPC_ROWS_PER_CRU 18
 #define TPC_NUM_OF_ROWS 152
-#define TPC_PADS_PER_ROW 138
 #define TPC_PADS_PER_ROW_PADDED (TPC_PADS_PER_ROW + PADDING_PAD)
 #define TPC_NUM_OF_PADS (TPC_NUM_OF_ROWS * TPC_PADS_PER_ROW_PADDED + PADDING_PAD)
 #define TPC_PADS_IN_SECTOR 14560
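The new values are not arbitrary: the padded row width and the time padding are multiples of the 8x4 tile from Array2D.h, and TPC_PADS_PER_ROW is the old 138 rounded up to the next multiple of 8. A sketch of the intended invariants, with illustrative constexpr mirrors of the macros (the static_asserts are not part of the patch):

// Sketch only: invariants behind PADDING_PAD = 8, PADDING_TIME = 4, TPC_PADS_PER_ROW = 144.
constexpr int PadsPerCacheline = 8;     // tile width from Array2D.h
constexpr int TimebinsPerCacheline = 4; // tile height from Array2D.h
constexpr int PadsPerRow = 144;         // TPC_PADS_PER_ROW, rounded up from 138
constexpr int PaddingPad = 8;           // PADDING_PAD
constexpr int PaddingTime = 4;          // PADDING_TIME

static_assert(PadsPerRow % PadsPerCacheline == 0, "rows cover whole 8-pad cache lines");
static_assert((PadsPerRow + PaddingPad) % PadsPerCacheline == 0, "padded rows stay tile-aligned");
static_assert(PaddingTime % TimebinsPerCacheline == 0, "time padding matches the tile height");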