llvm · fhahn · Jun 18, 2025 · Jul 7, 2025 · Jul 7, 2025 · Jul 9, 2025
diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -47,6 +47,9 @@ enum class RecurKind {
   FMul,     ///< Product of floats.
   FMin,     ///< FP min implemented in terms of select(cmp()).
   FMax,     ///< FP max implemented in terms of select(cmp()).
+  OrderedFCmpSelect, ///< FP max implemented in terms of select(cmp()), but
+                     /// without any fast-math flags. Users need to handle NaNs
+                     /// and signed zeros when generating code.
   FMinNum,  ///< FP min with llvm.minnum semantics including NaNs.
   FMaxNum,  ///< FP max with llvm.maxnum semantics including NaNs.
   FMinimum, ///< FP min with llvm.minimum semantics
@@ -252,9 +255,10 @@ class RecurrenceDescriptor {
   /// Returns true if the recurrence kind is a floating-point min/max kind.
   static bool isFPMinMaxRecurrenceKind(RecurKind Kind) {
     return Kind == RecurKind::FMin || Kind == RecurKind::FMax ||
-           Kind == RecurKind::FMinNum || Kind == RecurKind::FMaxNum ||
-           Kind == RecurKind::FMinimum || Kind == RecurKind::FMaximum ||
-           Kind == RecurKind::FMinimumNum || Kind == RecurKind::FMaximumNum;
+           Kind == RecurKind::OrderedFCmpSelect || Kind == RecurKind::FMinNum ||
+           Kind == RecurKind::FMaxNum || Kind == RecurKind::FMinimum ||
+           Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimumNum ||
+           Kind == RecurKind::FMaximumNum;
   }
 
   /// Returns true if the recurrence kind is any min/max kind.

diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -819,7 +819,8 @@ RecurrenceDescriptor::isMinMaxPattern(Instruction *I, RecurKind Kind,
   if (match(I, m_OrdOrUnordFMin(m_Value(), m_Value())))
     return InstDesc(Kind == RecurKind::FMin, I);
   if (match(I, m_OrdOrUnordFMax(m_Value(), m_Value())))
-    return InstDesc(Kind == RecurKind::FMax, I);
+    return InstDesc(
+        Kind == RecurKind::FMax || Kind == RecurKind::OrderedFCmpSelect, I);
   if (match(I, m_FMinNum(m_Value(), m_Value())))
     return InstDesc(Kind == RecurKind::FMin, I);
   if (match(I, m_FMaxNum(m_Value(), m_Value())))
@@ -962,6 +963,14 @@ RecurrenceDescriptor::InstDesc RecurrenceDescriptor::isRecurrenceInstr(
                "unexpected recurrence kind for minnum");
         return InstDesc(I, RecurKind::FMinNum);
       }
+      if (Kind == RecurKind::FMax || Kind == RecurKind::OrderedFCmpSelect) {
+        if (isa<SelectInst>(I))
+          return InstDesc(I, RecurKind::OrderedFCmpSelect);
+        auto *Cmp = dyn_cast<FCmpInst>(I);
+        if (Cmp && FCmpInst::isOrdered(Cmp->getPredicate()) &&
+            isMinMaxPattern(I, Kind, Prev).isRecurrence())
+          return InstDesc(I, RecurKind::OrderedFCmpSelect);
+      }
       return InstDesc(false, I);
     }
     if (isFMulAddIntrinsic(I))
@@ -1227,6 +1236,7 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) {
   case RecurKind::UMin:
     return Instruction::ICmp;
   case RecurKind::FMax:
+  case RecurKind::OrderedFCmpSelect:
   case RecurKind::FMin:
   case RecurKind::FMaximum:
   case RecurKind::FMinimum:

diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -937,6 +937,7 @@ constexpr Intrinsic::ID llvm::getReductionIntrinsicID(RecurKind RK) {
     return Intrinsic::vector_reduce_umax;
   case RecurKind::UMin:
     return Intrinsic::vector_reduce_umin;
+  case RecurKind::OrderedFCmpSelect:
   case RecurKind::FMax:
   case RecurKind::FMaxNum:
     return Intrinsic::vector_reduce_fmax;
@@ -1088,6 +1089,7 @@ CmpInst::Predicate llvm::getMinMaxReductionPredicate(RecurKind RK) {
     return CmpInst::ICMP_SGT;
   case RecurKind::FMin:
     return CmpInst::FCMP_OLT;
+  case RecurKind::OrderedFCmpSelect:
   case RecurKind::FMax:
     return CmpInst::FCMP_OGT;
   // We do not add FMinimum/FMaximum recurrence kind here since there is no
@@ -1310,6 +1312,7 @@ Value *llvm::createSimpleReduction(IRBuilderBase &Builder, Value *Src,
   case RecurKind::SMin:
   case RecurKind::UMax:
   case RecurKind::UMin:
+  case RecurKind::OrderedFCmpSelect:
   case RecurKind::FMax:
   case RecurKind::FMin:
   case RecurKind::FMinNum:

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4405,13 +4405,15 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
 
 bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
     ElementCount VF) const {
-  // Cross iteration phis such as fixed-order recurrences and FMaxNum/FMinNum
-  // reductions need special handling and are currently unsupported.
+  // Cross iteration phis such as fixed-order recurrences and
+  // OrderedFCmpSelect/FMaxNum/FMinNum reductions need special handling and are
+  // currently unsupported.
   if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) {
         if (!Legal->isReductionVariable(&Phi))
           return Legal->isFixedOrderRecurrence(&Phi);
         RecurKind RK = Legal->getRecurrenceDescriptor(&Phi).getRecurrenceKind();
-        return RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum;
+        return RK == RecurKind::OrderedFCmpSelect || RK == RecurKind::FMinNum ||
+               RK == RecurKind::FMaxNum;
       }))
     return false;
 
@@ -8847,11 +8849,11 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
 
   // Adjust the recipes for any inloop reductions.
   adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
-
-  // Apply mandatory transformation to handle FP maxnum/minnum reduction with
-  // NaNs if possible, bail out otherwise.
-  if (!VPlanTransforms::runPass(VPlanTransforms::handleMaxMinNumReductions,
-                                *Plan))
+  // Apply mandatory transformation to handle FP maxnum/minnum/OrderedFCmpSelect
+  // reduction with NaNs and signed-zeros if possible, bail out otherwise.
+  if (!VPlanTransforms::runPass(
+          VPlanTransforms::handleMaxMinNumAndOrderedFCmpSelectReductions,
+          *Plan))
     return nullptr;
 
   // Transform recipes to abstract recipes if it is legal and beneficial and

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -23705,6 +23705,7 @@ class HorizontalReduction {
         case RecurKind::FindFirstIVUMin:
         case RecurKind::FindLastIVSMax:
         case RecurKind::FindLastIVUMax:
+        case RecurKind::OrderedFCmpSelect:
         case RecurKind::FMaxNum:
         case RecurKind::FMinNum:
         case RecurKind::FMaximumNum:
@@ -23844,6 +23845,7 @@ class HorizontalReduction {
     case RecurKind::FindFirstIVUMin:
     case RecurKind::FindLastIVSMax:
     case RecurKind::FindLastIVUMax:
+    case RecurKind::OrderedFCmpSelect:
     case RecurKind::FMaxNum:
     case RecurKind::FMinNum:
     case RecurKind::FMaximumNum:
@@ -23948,6 +23950,7 @@ class HorizontalReduction {
     case RecurKind::FindFirstIVUMin:
     case RecurKind::FindLastIVSMax:
     case RecurKind::FindLastIVUMax:
+    case RecurKind::OrderedFCmpSelect:
     case RecurKind::FMaxNum:
     case RecurKind::FMinNum:
     case RecurKind::FMaximumNum:

diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -654,7 +654,105 @@ void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond,
   }
 }
 
-bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
+static bool handleOrderedFCmpSelect(VPlan &Plan,
+                                    VPReductionPHIRecipe *RedPhiR) {
+  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+  VPWidenIntOrFpInductionRecipe *WideIV = nullptr;
+
+  // MaxOp feeding the reduction phi must be a select (either wide or a
+  // replicate recipe), where the phi is the last operand, and the compare
+  // predicate is strict. This ensures NaNs won't get propagated unless the
+  // initial value is NaN.
+  auto *MaxOp = dyn_cast<VPRecipeWithIRFlags>(
+      RedPhiR->getBackedgeValue()->getDefiningRecipe());
+  if (!MaxOp)
+    return false;
+  auto *RepR = dyn_cast<VPReplicateRecipe>(MaxOp);
+  if (!isa<VPWidenSelectRecipe>(MaxOp) &&
+      !(RepR && (isa<SelectInst>(RepR->getUnderlyingInstr()))))
+    return false;
+
+  auto *Cmp = cast<VPRecipeWithIRFlags>(MaxOp->getOperand(0));
+  if (MaxOp->getOperand(1) == RedPhiR ||
+      !CmpInst::isStrictPredicate(Cmp->getPredicate()))
+    return false;
+
+  for (auto &R : LoopRegion->getEntryBasicBlock()->phis()) {
+    // We need a wide canonical IV; pick the first one in the entry block.
+    if (auto *CurIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
+      if (CurIV->isCanonical()) {
+        WideIV = CurIV;
+        break;
+      }
+    }
+  }
+
+  // A wide canonical IV is currently required.
+  // TODO: Create an induction if no suitable existing one is available.
+  if (!WideIV)
+    return false;
+
+  // Create a reduction that tracks the first indices where the latest maximum
+  // value has been selected. This is later used to select the max value from
+  // the partial reductions in a way that correctly handles signed zeros and
+  // NaNs in the input.
+  // Note that we do not need to check if the induction may hit the sentinel
+  // value. If the sentinel value gets hit, the final reduction value is at the
+  // last index or the maximum was never set and all lanes contain the start
+  // value. In either case, the correct value is selected.
+  unsigned IVWidth =
+      VPTypeAnalysis(Plan).inferScalarType(WideIV)->getScalarSizeInBits();
+  LLVMContext &Ctx = Plan.getScalarHeader()->getIRBasicBlock()->getContext();
+  VPValue *UMinSentinel =
+      Plan.getOrAddLiveIn(ConstantInt::get(Ctx, APInt::getMaxValue(IVWidth)));
+  auto *IdxPhi = new VPReductionPHIRecipe(nullptr, RecurKind::FindFirstIVUMin,
+                                          *UMinSentinel, false, false, 1);
+  IdxPhi->insertBefore(RedPhiR);
+  auto *MinIdxSel = new VPInstruction(Instruction::Select,
+                                      {MaxOp->getOperand(0), WideIV, IdxPhi});
+  MinIdxSel->insertAfter(MaxOp);
+  IdxPhi->addOperand(MinIdxSel);
+
+  // Locate the ComputeReductionResult user of the max phi; the first index
+  // with the max value is computed after it and used to extract the final max
+  // lane (handles signed zeros/NaNs). NOTE(review): assumed non-null - confirm.
+  auto *MaxResult = find_singleton<VPSingleDefRecipe>(
+      RedPhiR->users(), [](VPUser *U, bool) -> VPSingleDefRecipe * {
+        auto *VPI = dyn_cast<VPInstruction>(U);
+        if (VPI && VPI->getOpcode() == VPInstruction::ComputeReductionResult)
+          return VPI;
+        return nullptr;
+      });
+  VPBuilder Builder(MaxResult->getParent(),
+                    std::next(MaxResult->getIterator()));
+
+  // Create mask for lanes that have the max value and use it to mask out
+  // indices that don't contain maximum values.
+  auto *MaskFinalMaxValue = Builder.createNaryOp(
+      Instruction::FCmp, {MaxResult->getOperand(1), MaxResult},
+      VPIRFlags(CmpInst::FCMP_OEQ));
+  auto *IndicesWithMaxValue = Builder.createNaryOp(
+      Instruction::Select, {MaskFinalMaxValue, MinIdxSel, UMinSentinel});
+  auto *FirstMaxIdx = Builder.createNaryOp(
+      VPInstruction::ComputeFindIVResult,
+      {IdxPhi, WideIV->getStartValue(), UMinSentinel, IndicesWithMaxValue});
+  // Convert the index of the first max value to an index in the vector lanes of
+  // the partial reduction results. This ensures we select the first max value
+  // and acts as a tie-breaker if the partial reductions contain signed zeros.
+  auto *FirstMaxLane =
+      Builder.createNaryOp(Instruction::URem, {FirstMaxIdx, &Plan.getVFxUF()});
+
+  // Extract the final max value and update the users.
+  auto *Res = Builder.createNaryOp(VPInstruction::ExtractLane,
+                                   {FirstMaxLane, MaxResult->getOperand(1)});
+  MaxResult->replaceUsesWithIf(Res, [MaskFinalMaxValue](VPUser &U, unsigned) {
+    return &U != MaskFinalMaxValue;
+  });
+  return true;
+}
+
+bool VPlanTransforms::handleMaxMinNumAndOrderedFCmpSelectReductions(
+    VPlan &Plan) {
   auto GetMinMaxCompareValue = [](VPReductionPHIRecipe *RedPhiR) -> VPValue * {
     auto *MinMaxR = dyn_cast<VPRecipeWithIRFlags>(
         RedPhiR->getBackedgeValue()->getDefiningRecipe());
@@ -703,7 +801,8 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
     if (RedPhiR)
       return false;
     if (Cur->getRecurrenceKind() != RecurKind::FMaxNum &&
-        Cur->getRecurrenceKind() != RecurKind::FMinNum) {
+        Cur->getRecurrenceKind() != RecurKind::FMinNum &&
+        Cur->getRecurrenceKind() != RecurKind::OrderedFCmpSelect) {
       HasUnsupportedPhi = true;
       continue;
     }
@@ -713,6 +812,15 @@ bool VPlanTransforms::handleMaxMinNumReductions(VPlan &Plan) {
   if (!RedPhiR)
     return true;
 
+  if (HasUnsupportedPhi)
+    return false;
+
+  if (RedPhiR->getRecurrenceKind() == RecurKind::OrderedFCmpSelect)
+    return handleOrderedFCmpSelect(Plan, RedPhiR);
+
+  // Try to update the vector loop to exit early if any input is NaN and resume
+  // executing in the scalar loop to handle the NaNs there.
+
   // We won't be able to resume execution in the scalar tail, if there are
   // unsupported header phis or there is no scalar tail at all, due to
   // tail-folding.

diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -508,6 +508,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
     return true;
   switch (Opcode) {
   case Instruction::Freeze:
+  case Instruction::FCmp:
   case Instruction::ICmp:
   case Instruction::PHI:
   case Instruction::Select:
@@ -599,7 +600,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
     llvm_unreachable("should be handled by VPPhi::execute");
   }
   case Instruction::Select: {
-    bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
+    bool OnlyFirstLaneUsed =
+        State.VF.isScalar() || vputils::onlyFirstLaneUsed(this);
     Value *Cond = State.get(getOperand(0), OnlyFirstLaneUsed);
     Value *Op1 = State.get(getOperand(1), OnlyFirstLaneUsed);
     Value *Op2 = State.get(getOperand(2), OnlyFirstLaneUsed);
@@ -1015,7 +1017,8 @@ bool VPInstruction::isVectorToScalar() const {
          getOpcode() == VPInstruction::ComputeAnyOfResult ||
          getOpcode() == VPInstruction::ComputeFindIVResult ||
          getOpcode() == VPInstruction::ComputeReductionResult ||
-         getOpcode() == VPInstruction::AnyOf;
+         getOpcode() == VPInstruction::AnyOf ||
+         getOpcode() == VPInstruction::ExtractLane;
 }
 
 bool VPInstruction::isSingleScalar() const {

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -103,11 +103,10 @@ struct VPlanTransforms {
   /// not valid.
   static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder);
 
-  /// Check if \p Plan contains any FMaxNum or FMinNum reductions. If they do,
-  /// try to update the vector loop to exit early if any input is NaN and resume
-  /// executing in the scalar loop to handle the NaNs there. Return false if
-  /// this attempt was unsuccessful.
-  static bool handleMaxMinNumReductions(VPlan &Plan);
+  /// Check if \p Plan contains any FMaxNum, FMinNum or OrderedFCmpSelect
+  /// reductions. If they do, try to update the vector loop to account for NaNs
+  /// and signed zeros as needed. Return false if this attempt was unsuccessful.
+  static bool handleMaxMinNumAndOrderedFCmpSelectReductions(VPlan &Plan);
 
   /// Clear NSW/NUW flags from reduction instructions if necessary.
   static void clearReductionWrapFlags(VPlan &Plan);