Skip to content

[X86] Merge BT with a matching BTR/BTS/BTC#193612

Draft
chfast wants to merge 1 commit into llvm:main from
chfast:x86-merge-bt-with-btr-bts-btc
Draft

[X86] Merge BT with a matching BTR/BTS/BTC#193612
chfast wants to merge 1 commit into llvm:main from
chfast:x86-merge-bt-with-btr-bts-btc

Conversation

@chfast
Copy link
Copy Markdown
Member

@chfast chfast commented Apr 22, 2026

Fixes #165291.

BTR/BTS/BTC set CF from the pre-operation bit value, so a subsequent BT on the same source and bit index produces the same CF and is redundant. We were emitting both.

define i1 @btr_eq_i32(ptr %word, i32 %position) nounwind {
  %ofs = and i32 %position, 31
  %bit = shl nuw i32 1, %ofs
  %mask = xor i32 %bit, -1
  %ld = load i32, ptr %word
  %test = and i32 %ld, %bit
  %res = and i32 %ld, %mask
  %cmp = icmp eq i32 %test, 0
  store i32 %res, ptr %word
  ret i1 %cmp
}

Before:

movl (%rdi), %eax
movl %eax, %ecx
btrl %esi, %ecx
btl  %esi, %eax
setae %al
movl %ecx, (%rdi)
retq

After:

movl (%rdi), %ecx
btrl %esi, %ecx
setae %al
movl %ecx, (%rdi)
retq

Approach

  • Add three flag-producing DAG nodes X86ISD::{BTR,BTS,BTC}_FLAG that model the register-register BTR/BTS/BTC as (res, EFLAGS) = op src, bitno.
  • Pattern-match them to the existing BTR/BTS/BTC *rr encodings.
  • Add a DAG combine in combineBT that fuses an X86ISD::BT with a sibling AND(Src, (rotl -2, X)) / OR(Src, (shl 1, X)) / XOR(Src, (shl 1, X)) on the same source into a single flag-producing node. The sibling's bit-position operand may differ from BT's by a trunc/zext/anyext or by an `and x, C` whose constant C preserves the low log2(BW) bits (BT already masks the index to those bits implicitly), so the combine peeks through those wrappers when matching.

llvm/test/CodeGen/X86/bittest-big-integer.ll was regenerated and shrinks by 78 CHECK lines; dozens of btl instructions collapsed into the preceding bit-modify. All 5442 X86 CodeGen tests pass.

@llvmbot
Copy link
Copy Markdown
Member

llvmbot commented Apr 22, 2026

@llvm/pr-subscribers-backend-x86

Author: Paweł Bylica (chfast)

Changes

Fixes #165291.

BTR/BTS/BTC set CF from the pre-operation bit value, so a subsequent BT on the same source and bit index produces the same CF and is redundant. We were emitting both.

define i1 @btr_eq_i32(ptr %word, i32 %position) nounwind {
  %ofs = and i32 %position, 31
  %bit = shl nuw i32 1, %ofs
  %mask = xor i32 %bit, -1
  %ld = load i32, ptr %word
  %test = and i32 %ld, %bit
  %res = and i32 %ld, %mask
  %cmp = icmp eq i32 %test, 0
  store i32 %res, ptr %word
  ret i1 %cmp
}

Before:

movl (%rdi), %eax
movl %eax, %ecx
btrl %esi, %ecx
btl  %esi, %eax
setae %al
movl %ecx, (%rdi)
retq

After:

movl (%rdi), %ecx
btrl %esi, %ecx
setae %al
movl %ecx, (%rdi)
retq

Approach

  • Add three flag-producing DAG nodes X86ISD::{BTR,BTS,BTC}_FLAG that model the register-register BTR/BTS/BTC as (res, EFLAGS) = op src, bitno.
  • Pattern-match them to the existing BTR/BTS/BTC *rr encodings.
  • Add a DAG combine in combineBT that fuses an X86ISD::BT with a sibling AND(Src, (rotl -2, X)) / OR(Src, (shl 1, X)) / XOR(Src, (shl 1, X)) on the same source into a single flag-producing node. The sibling's bit-position operand may differ from BT's by a trunc/zext/anyext or by an `and x, C` whose constant C preserves the low log2(BW) bits (BT already masks the index to those bits implicitly), so the combine peeks through those wrappers when matching.

llvm/test/CodeGen/X86/bittest-big-integer.ll was regenerated and shrinks by 78 CHECK lines; dozens of btl instructions collapsed into the preceding bit-modify. All 5442 X86 CodeGen tests pass.


Patch is 29.35 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/193612.diff

4 Files Affected:

  • (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+111)
  • (modified) llvm/lib/Target/X86/X86InstrCompiler.td (+17)
  • (modified) llvm/lib/Target/X86/X86InstrFragments.td (+7)
  • (modified) llvm/test/CodeGen/X86/bittest-big-integer.ll (+114-192)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5ed2eed2a0e8d..c4d096df62fba 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -56672,6 +56672,114 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+// Strip TRUNCATE/ZERO_EXTEND/ANY_EXTEND wrappers, plus any `and x, C` where
+// C preserves the low log2(BW) bits (BT/BTR/BTS/BTC mask the bit index
+// implicitly to log2(BW) bits).
+static SDValue peekThroughBitPosExtTrunc(SDValue V, unsigned BW) {
+  APInt LowMask =
+      APInt::getLowBitsSet(V.getScalarValueSizeInBits(), Log2_32(BW));
+  while (true) {
+    unsigned Op = V.getOpcode();
+    if (Op == ISD::TRUNCATE || Op == ISD::ZERO_EXTEND ||
+        Op == ISD::ANY_EXTEND) {
+      V = V.getOperand(0);
+      LowMask = LowMask.zextOrTrunc(V.getScalarValueSizeInBits());
+      continue;
+    }
+    if (Op == ISD::AND) {
+      if (auto *C = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
+        if ((C->getAPIntValue() & LowMask) == LowMask) {
+          V = V.getOperand(0);
+          continue;
+        }
+      }
+    }
+    break;
+  }
+  return V;
+}
+
+// Try to merge a (X86ISD::BT Src, BitNo) with a sibling bit-modifying op on
+// Src — AND(Src, rotl -2, X), OR(Src, shl 1, X), XOR(Src, shl 1, X) — into a
+// single flag-producing X86ISD::{BTR,BTS,BTC}_FLAG node. Both BT and
+// BTR/BTS/BTC set CF from the pre-op bit value, so one instruction subsumes
+// the other. Fixes llvm#165291.
+static SDValue combineBTToBitOpFlag(SDNode *N, SelectionDAG &DAG) {
+  SDValue Src = N->getOperand(0);
+  SDValue BitNo = N->getOperand(1);
+  EVT VT = Src.getValueType();
+  SDLoc DL(N);
+
+  // BT is only emitted for legal integer widths (16/32/64); match those.
+  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  unsigned BW = VT.getScalarSizeInBits();
+  SDValue PeeledBitNo = peekThroughBitPosExtTrunc(BitNo, BW);
+
+  for (SDNode *User : Src->users()) {
+    if (User == N)
+      continue;
+    unsigned UOpc = User->getOpcode();
+    if (UOpc != ISD::AND && UOpc != ISD::OR && UOpc != ISD::XOR)
+      continue;
+    if (User->getValueType(0) != VT)
+      continue;
+
+    // Identify which operand of User is Src; the other is the mask.
+    SDValue UOp0 = User->getOperand(0);
+    SDValue UOp1 = User->getOperand(1);
+    SDValue Mask;
+    if (UOp0 == SDValue(Src.getNode(), Src.getResNo()))
+      Mask = UOp1;
+    else if (UOp1 == SDValue(Src.getNode(), Src.getResNo()))
+      Mask = UOp0;
+    else
+      continue;
+    // We will replace the mask's consumer (User); require the mask to have no
+    // other live uses so we can drop it.
+    if (!Mask.hasOneUse())
+      continue;
+
+    unsigned FlagOp = 0;
+    SDValue ShAmt;
+    if (UOpc == ISD::AND && Mask.getOpcode() == ISD::ROTL) {
+      // (and Src, (rotl -2, X)) — clears bit X.
+      if (auto *C = dyn_cast<ConstantSDNode>(Mask.getOperand(0)))
+        if (C->getAPIntValue() == APInt::getAllOnes(BW) - 1) {
+          FlagOp = X86ISD::BTR_FLAG;
+          ShAmt = Mask.getOperand(1);
+        }
+    } else if ((UOpc == ISD::OR || UOpc == ISD::XOR) &&
+               Mask.getOpcode() == ISD::SHL) {
+      // (or/xor Src, (shl 1, X)) — sets / flips bit X.
+      if (auto *C = dyn_cast<ConstantSDNode>(Mask.getOperand(0)))
+        if (C->getAPIntValue() == 1) {
+          FlagOp = UOpc == ISD::OR ? X86ISD::BTS_FLAG : X86ISD::BTC_FLAG;
+          ShAmt = Mask.getOperand(1);
+        }
+    }
+    if (!FlagOp)
+      continue;
+
+    // The BT and the bit-op must address the same bit. They can differ only
+    // by truncation/extension or an AND that preserves the low log2(BW) bits.
+    if (peekThroughBitPosExtTrunc(ShAmt, BW) != PeeledBitNo)
+      continue;
+
+    // BTR/BTS/BTC *rr take the bit index in a register of the same width as
+    // the source. Extend or truncate to VT to match the instruction signature.
+    SDValue BN = DAG.getZExtOrTrunc(BitNo, DL, VT);
+    SDValue New =
+        DAG.getNode(FlagOp, DL, DAG.getVTList(VT, MVT::i32), Src, BN);
+    // Reroute the value output through User's consumers.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(User, 0), New.getValue(0));
+    // Return the flags output so combineBT installs it as N's replacement.
+    return New.getValue(1);
+  }
+  return SDValue();
+}
+
 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI) {
   SDValue N1 = N->getOperand(1);
@@ -56685,6 +56793,9 @@ static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
     return SDValue(N, 0);
   }
 
+  if (SDValue V = combineBTToBitOpFlag(N, DAG))
+    return V;
+
   return SDValue();
 }
 
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index ebbfa48d2660c..735bc57e8eaba 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -2072,6 +2072,23 @@ defm : OneBitPats<GR16, i16, BTR16rr, BTS16rr, BTC16rr, shiftMask16>;
 defm : OneBitPats<GR32, i32, BTR32rr, BTS32rr, BTC32rr, shiftMask32>;
 defm : OneBitPats<GR64, i64, BTR64rr, BTS64rr, BTC64rr, shiftMask64>;
 
+// Flag-producing variants: reuse the BTR/BTS/BTC encodings so CF can replace
+// a separate X86ISD::BT. Emitted by a DAG combine in combineBT.
+multiclass OneBitFlagPats<RegisterClass rc, Instruction btr, Instruction bts,
+                          Instruction btc, SDNode btr_flag, SDNode bts_flag,
+                          SDNode btc_flag> {
+  def : Pat<(btr_flag rc:$src1, rc:$src2), (btr rc:$src1, rc:$src2)>;
+  def : Pat<(bts_flag rc:$src1, rc:$src2), (bts rc:$src1, rc:$src2)>;
+  def : Pat<(btc_flag rc:$src1, rc:$src2), (btc rc:$src1, rc:$src2)>;
+}
+
+defm : OneBitFlagPats<GR16, BTR16rr, BTS16rr, BTC16rr,
+                      X86btr_flag, X86bts_flag, X86btc_flag>;
+defm : OneBitFlagPats<GR32, BTR32rr, BTS32rr, BTC32rr,
+                      X86btr_flag, X86bts_flag, X86btc_flag>;
+defm : OneBitFlagPats<GR64, BTR64rr, BTS64rr, BTC64rr,
+                      X86btr_flag, X86bts_flag, X86btc_flag>;
+
 //===----------------------------------------------------------------------===//
 // EFLAGS-defining Patterns
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td
index bae1211e51330..9c7832d5ba11a 100644
--- a/llvm/lib/Target/X86/X86InstrFragments.td
+++ b/llvm/lib/Target/X86/X86InstrFragments.td
@@ -165,6 +165,13 @@ let IsStrictFP = true in {
 // X86 bit-test instructions.
 def X86bt      : SDNode<"X86ISD::BT",       SDTX86CmpTest>;
 
+// X86 bit-test-and-modify instructions: res, EFLAGS = op src, bitno.
+// CF is set from the pre-operation bit value; BT on the same operands is
+// therefore redundant after one of these nodes.
+def X86btr_flag : SDNode<"X86ISD::BTR_FLAG", SDTBinaryArithWithFlags>;
+def X86bts_flag : SDNode<"X86ISD::BTS_FLAG", SDTBinaryArithWithFlags>;
+def X86btc_flag : SDNode<"X86ISD::BTC_FLAG", SDTBinaryArithWithFlags>;
+
 // Conditional compare instructions
 def X86ccmp    : SDNode<"X86ISD::CCMP",     SDTX86Ccmp>;
 def X86ctest   : SDNode<"X86ISD::CTEST",    SDTX86Ccmp>;
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index 96ccc7b0f7527..6767cc45a2c5e 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -38,24 +38,18 @@ define i1 @test_eq_i32(ptr %word, i32 %position) nounwind {
 define i1 @complement_ne_i32(ptr %word, i32 %position) nounwind {
 ; X86-LABEL: complement_ne_i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    btcl %eax, %esi
-; X86-NEXT:    btl %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    btcl %eax, %edx
 ; X86-NEXT:    setb %al
-; X86-NEXT:    movl %esi, (%ecx)
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl %edx, (%ecx)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: complement_ne_i32:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    movl (%rdi), %ecx
 ; X64-NEXT:    btcl %esi, %ecx
-; X64-NEXT:    btl %esi, %eax
 ; X64-NEXT:    setb %al
 ; X64-NEXT:    movl %ecx, (%rdi)
 ; X64-NEXT:    retq
@@ -72,24 +66,18 @@ define i1 @complement_ne_i32(ptr %word, i32 %position) nounwind {
 define i1 @reset_eq_i32(ptr %word, i32 %position) nounwind {
 ; X86-LABEL: reset_eq_i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    btrl %eax, %esi
-; X86-NEXT:    btl %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    btrl %eax, %edx
 ; X86-NEXT:    setae %al
-; X86-NEXT:    movl %esi, (%ecx)
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl %edx, (%ecx)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: reset_eq_i32:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    movl (%rdi), %ecx
 ; X64-NEXT:    btrl %esi, %ecx
-; X64-NEXT:    btl %esi, %eax
 ; X64-NEXT:    setae %al
 ; X64-NEXT:    movl %ecx, (%rdi)
 ; X64-NEXT:    retq
@@ -107,24 +95,18 @@ define i1 @reset_eq_i32(ptr %word, i32 %position) nounwind {
 define i1 @set_ne_i32(ptr %word, i32 %position) nounwind {
 ; X86-LABEL: set_ne_i32:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl (%ecx), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    btsl %eax, %esi
-; X86-NEXT:    btl %eax, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    btsl %eax, %edx
 ; X86-NEXT:    setb %al
-; X86-NEXT:    movl %esi, (%ecx)
-; X86-NEXT:    popl %esi
+; X86-NEXT:    movl %edx, (%ecx)
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: set_ne_i32:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl (%rdi), %eax
-; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    movl (%rdi), %ecx
 ; X64-NEXT:    btsl %esi, %ecx
-; X64-NEXT:    btl %esi, %eax
 ; X64-NEXT:    setb %al
 ; X64-NEXT:    movl %ecx, (%rdi)
 ; X64-NEXT:    retq
@@ -145,14 +127,12 @@ define i1 @init_eq_i32(ptr %word, i32 %position, i1 zeroext %value) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    shll %cl, %eax
-; X86-NEXT:    movl (%edx), %esi
-; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    movl (%edx), %edi
 ; X86-NEXT:    btrl %ecx, %edi
-; X86-NEXT:    orl %eax, %edi
-; X86-NEXT:    btl %ecx, %esi
 ; X86-NEXT:    setae %al
+; X86-NEXT:    orl %esi, %edi
 ; X86-NEXT:    movl %edi, (%edx)
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi
@@ -162,24 +142,20 @@ define i1 @init_eq_i32(ptr %word, i32 %position, i1 zeroext %value) nounwind {
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movl %esi, %ecx
 ; SSE-NEXT:    shll %cl, %edx
-; SSE-NEXT:    movl (%rdi), %eax
-; SSE-NEXT:    movl %eax, %esi
+; SSE-NEXT:    movl (%rdi), %esi
 ; SSE-NEXT:    btrl %ecx, %esi
-; SSE-NEXT:    orl %edx, %esi
-; SSE-NEXT:    btl %ecx, %eax
 ; SSE-NEXT:    setae %al
+; SSE-NEXT:    orl %edx, %esi
 ; SSE-NEXT:    movl %esi, (%rdi)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: init_eq_i32:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    shlxl %esi, %edx, %eax
-; AVX-NEXT:    movl (%rdi), %ecx
-; AVX-NEXT:    movl %ecx, %edx
+; AVX-NEXT:    shlxl %esi, %edx, %ecx
+; AVX-NEXT:    movl (%rdi), %edx
 ; AVX-NEXT:    btrl %esi, %edx
-; AVX-NEXT:    orl %eax, %edx
-; AVX-NEXT:    btl %esi, %ecx
 ; AVX-NEXT:    setae %al
+; AVX-NEXT:    orl %ecx, %edx
 ; AVX-NEXT:    movl %edx, (%rdi)
 ; AVX-NEXT:    retq
   %ofs = and i32 %position, 31
@@ -232,29 +208,24 @@ define i1 @test_ne_i64(ptr %word, i32 %position) nounwind {
 define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind {
 ; X86-LABEL: complement_ne_i64:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    andl $32, %esi
-; X86-NEXT:    shrl $3, %esi
-; X86-NEXT:    movl (%ecx,%esi), %edi
-; X86-NEXT:    btl %edx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $32, %edx
+; X86-NEXT:    shrl $3, %edx
+; X86-NEXT:    movl (%ecx,%edx), %esi
+; X86-NEXT:    btcl %eax, %esi
 ; X86-NEXT:    setb %al
-; X86-NEXT:    btcl %edx, %edi
-; X86-NEXT:    movl %edi, (%ecx,%esi)
+; X86-NEXT:    movl %esi, (%ecx,%edx)
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: complement_ne_i64:
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    movq (%rdi), %rcx
 ; X64-NEXT:    btcq %rsi, %rcx
-; X64-NEXT:    btq %rsi, %rax
 ; X64-NEXT:    setb %al
 ; X64-NEXT:    movq %rcx, (%rdi)
 ; X64-NEXT:    retq
@@ -272,29 +243,24 @@ define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind {
 define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind {
 ; X86-LABEL: reset_eq_i64:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    andl $32, %esi
-; X86-NEXT:    shrl $3, %esi
-; X86-NEXT:    movl (%ecx,%esi), %edi
-; X86-NEXT:    btl %edx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $32, %edx
+; X86-NEXT:    shrl $3, %edx
+; X86-NEXT:    movl (%ecx,%edx), %esi
+; X86-NEXT:    btrl %eax, %esi
 ; X86-NEXT:    setae %al
-; X86-NEXT:    btrl %edx, %edi
-; X86-NEXT:    movl %edi, (%ecx,%esi)
+; X86-NEXT:    movl %esi, (%ecx,%edx)
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: reset_eq_i64:
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    movq (%rdi), %rcx
 ; X64-NEXT:    btrq %rsi, %rcx
-; X64-NEXT:    btq %rsi, %rax
 ; X64-NEXT:    setae %al
 ; X64-NEXT:    movq %rcx, (%rdi)
 ; X64-NEXT:    retq
@@ -313,29 +279,24 @@ define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind {
 define i1 @set_ne_i64(ptr %word, i32 %position) nounwind {
 ; X86-LABEL: set_ne_i64:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    andl $32, %esi
-; X86-NEXT:    shrl $3, %esi
-; X86-NEXT:    movl (%ecx,%esi), %edi
-; X86-NEXT:    btl %edx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $32, %edx
+; X86-NEXT:    shrl $3, %edx
+; X86-NEXT:    movl (%ecx,%edx), %esi
+; X86-NEXT:    btsl %eax, %esi
 ; X86-NEXT:    setb %al
-; X86-NEXT:    btsl %edx, %edi
-; X86-NEXT:    movl %edi, (%ecx,%esi)
+; X86-NEXT:    movl %esi, (%ecx,%edx)
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: set_ne_i64:
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $esi killed $esi def $rsi
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    movq %rax, %rcx
+; X64-NEXT:    movq (%rdi), %rcx
 ; X64-NEXT:    btsq %rsi, %rcx
-; X64-NEXT:    btq %rsi, %rax
 ; X64-NEXT:    setb %al
 ; X64-NEXT:    movq %rcx, (%rdi)
 ; X64-NEXT:    retq
@@ -362,9 +323,8 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {
 ; X86-NEXT:    andl $32, %esi
 ; X86-NEXT:    shrl $3, %esi
 ; X86-NEXT:    movl (%edx,%esi), %edi
-; X86-NEXT:    btl %ecx, %edi
-; X86-NEXT:    setae %al
 ; X86-NEXT:    btrl %ecx, %edi
+; X86-NEXT:    setae %al
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shll %cl, %ebx
@@ -378,14 +338,12 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {
 ; SSE-LABEL: init_eq_i64:
 ; SSE:       # %bb.0:
 ; SSE-NEXT:    movl %esi, %ecx
-; SSE-NEXT:    movl %edx, %eax
-; SSE-NEXT:    shlq %cl, %rax
-; SSE-NEXT:    movq (%rdi), %rdx
-; SSE-NEXT:    movq %rdx, %rsi
+; SSE-NEXT:    movl %edx, %edx
+; SSE-NEXT:    shlq %cl, %rdx
+; SSE-NEXT:    movq (%rdi), %rsi
 ; SSE-NEXT:    btrq %rcx, %rsi
-; SSE-NEXT:    orq %rax, %rsi
-; SSE-NEXT:    btq %rcx, %rdx
 ; SSE-NEXT:    setae %al
+; SSE-NEXT:    orq %rdx, %rsi
 ; SSE-NEXT:    movq %rsi, (%rdi)
 ; SSE-NEXT:    retq
 ;
@@ -393,13 +351,11 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    # kill: def $esi killed $esi def $rsi
 ; AVX-NEXT:    movl %edx, %eax
-; AVX-NEXT:    shlxq %rsi, %rax, %rax
-; AVX-NEXT:    movq (%rdi), %rcx
-; AVX-NEXT:    movq %rcx, %rdx
+; AVX-NEXT:    shlxq %rsi, %rax, %rcx
+; AVX-NEXT:    movq (%rdi), %rdx
 ; AVX-NEXT:    btrq %rsi, %rdx
-; AVX-NEXT:    orq %rax, %rdx
-; AVX-NEXT:    btq %rsi, %rcx
 ; AVX-NEXT:    setae %al
+; AVX-NEXT:    orq %rcx, %rdx
 ; AVX-NEXT:    movq %rdx, (%rdi)
 ; AVX-NEXT:    retq
   %rem = and i32 %position, 63
@@ -455,20 +411,17 @@ define i1 @test_ne_i128(ptr %word, i32 %position) nounwind {
 define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind {
 ; X86-LABEL: complement_ne_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    andl $96, %esi
-; X86-NEXT:    shrl $3, %esi
-; X86-NEXT:    movl (%ecx,%esi), %edi
-; X86-NEXT:    btl %edx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $96, %edx
+; X86-NEXT:    shrl $3, %edx
+; X86-NEXT:    movl (%ecx,%edx), %esi
+; X86-NEXT:    btcl %eax, %esi
 ; X86-NEXT:    setb %al
-; X86-NEXT:    btcl %edx, %edi
-; X86-NEXT:    movl %edi, (%ecx,%esi)
+; X86-NEXT:    movl %esi, (%ecx,%edx)
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: complement_ne_i128:
@@ -477,9 +430,8 @@ define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind {
 ; X64-NEXT:    andl $96, %ecx
 ; X64-NEXT:    shrl $3, %ecx
 ; X64-NEXT:    movl (%rdi,%rcx), %edx
-; X64-NEXT:    btl %esi, %edx
-; X64-NEXT:    setb %al
 ; X64-NEXT:    btcl %esi, %edx
+; X64-NEXT:    setb %al
 ; X64-NEXT:    movl %edx, (%rdi,%rcx)
 ; X64-NEXT:    retq
   %rem = and i32 %position, 127
@@ -496,20 +448,17 @@ define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind {
 define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind {
 ; X86-LABEL: reset_eq_i128:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %edi
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    movl %edx, %esi
-; X86-NEXT:    andl $96, %esi
-; X86-NEXT:    shrl $3, %esi
-; X86-NEXT:    movl (%ecx,%esi), %edi
-; X86-NEXT:    btl %edx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $96, %edx
+; X86-NEXT:    shrl $3, %edx
+; X86-NEXT:    movl (%ecx,%edx), %esi
+; X86-NEXT:    btrl %eax, %esi
 ; X86-NEXT:    setae %al
-; X86-NEXT:    btrl %edx, %edi
-; X86-NEXT:    movl %edi, (%ecx,%esi)
+; X86-NEXT:    movl %esi, (%ecx,%edx)
 ; X86-NEXT:    popl %esi
-; X86-NEXT:    popl %edi
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: reset_eq_i128:
@@ -518,9 +467,8 @@ define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind {
 ; X64-NEXT:    andl $96, %ecx
 ; X64-NEXT:    shrl $3, %ecx
 ; X64-NEXT:    movl (%rdi,%rcx), %edx
-; X64-NEXT:    btl %esi, %edx
-; X64-NEXT:    setae %al
 ; X64-NEXT:    btrl %esi, %edx
+; X64-NEXT:    setae %al
 ; X64-NEXT:    movl %edx, (%rdi,%rcx)
 ; X64-NEXT:    retq
   %rem = and i32 %position, 127
@@ -538,20 +486,17 @@ define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind {
 define i1 @set_ne_i128(ptr %word, i32 %position) nounwind {
 ; X86-LABEL: set_ne_i128:
...
[truncated]

@github-actions
Copy link
Copy Markdown

github-actions Bot commented Apr 22, 2026

⚠️ C/C++ code formatter, clang-format found issues in your code. ⚠️

You can test this locally with the following command:
git-clang-format --diff origin/main HEAD --extensions cpp -- llvm/lib/Target/X86/X86ISelLowering.cpp --diff_from_common_commit

⚠️
The reproduction instructions above might return results for more than one PR
in a stack if you are using a stacked PR workflow. You can limit the results by
changing origin/main to the base branch/commit you want to compare against.
⚠️

View the diff from clang-format here.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7f885ec49..2a0d570ae 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -56774,8 +56774,7 @@ static SDValue combineBTToBitOpFlag(SDNode *N, SelectionDAG &DAG) {
     // BTR/BTS/BTC *rr take the bit index in a register of the same width as
     // the source. Extend or truncate to VT to match the instruction signature.
     SDValue BN = DAG.getZExtOrTrunc(BitNo, DL, VT);
-    SDValue New =
-        DAG.getNode(FlagOp, DL, DAG.getVTList(VT, MVT::i32), Src, BN);
+    SDValue New = DAG.getNode(FlagOp, DL, DAG.getVTList(VT, MVT::i32), Src, BN);
     // Reroute the value output through User's consumers.
     DAG.ReplaceAllUsesOfValueWith(SDValue(User, 0), New.getValue(0));
     // Return the flags output so combineBT installs it as N's replacement.

@chfast chfast marked this pull request as draft April 23, 2026 06:10
BTR/BTS/BTC set CF from the pre-operation bit value, so a subsequent BT
on the same source and bit index produces redundant EFLAGS. We were
emitting both, e.g. when storing `and ld, ~bit` while also testing `(and ld, bit) == 0` we got:

    btrl %esi, %ecx
    btl  %esi, %eax
    setae %al

Introduce three flag-producing DAG nodes X86ISD::BTR/BTS/BTC that model
the register-register BTR/BTS/BTC as `(res, EFLAGS) = op src, bitno`
(the atomic locked variants live under X86ISD::LBTR/LBTS/LBTC already),
pattern-match them to the existing BTR/BTS/BTC encodings, and add a DAG
combine in combineBT that fuses an X86ISD::BT with a sibling
AND(Src, rotl -2, X) / OR(Src, shl 1, X) / XOR(Src, shl 1, X) on the
same source into a single flag-producing node. The bit-position operand
can differ from BT's by trunc/zext/and-with-mask that preserves the low
log2(BW) bits (BT already masks those implicitly), so peek through
those wrappers when matching.

After the fix the example lowers to:

    btrl %esi, %ecx
    setae %al

Fixes llvm#165291.
@chfast chfast force-pushed the x86-merge-bt-with-btr-bts-btc branch from 184a3a3 to e9f8d75 Compare April 23, 2026 06:14
@RKSimon RKSimon self-requested a review April 23, 2026 06:41
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

[X86] Failure to merge BT with matching BTC/BTR/BTS

2 participants