[X86] Merge BT with a matching BTR/BTS/BTC #193612
Conversation
|
@llvm/pr-subscribers-backend-x86

Author: Paweł Bylica (chfast)

Changes

Fixes #165291. BTR/BTS/BTC set CF from the pre-operation bit value, so a subsequent BT on the same source and bit index produces the same CF and is redundant. We were emitting both.

define i1 @btr_eq_i32(ptr %word, i32 %position) nounwind {
%ofs = and i32 %position, 31
%bit = shl nuw i32 1, %ofs
%mask = xor i32 %bit, -1
%ld = load i32, ptr %word
%test = and i32 %ld, %bit
%res = and i32 %ld, %mask
%cmp = icmp eq i32 %test, 0
store i32 %res, ptr %word
ret i1 %cmp
}

Before:

movl (%rdi), %eax
movl %eax, %ecx
btrl %esi, %ecx
btl %esi, %eax
setae %al
movl %ecx, (%rdi)
retq

After:

movl (%rdi), %ecx
btrl %esi, %ecx
setae %al
movl %ecx, (%rdi)
retq

Approach
Patch is 29.35 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/193612.diff 4 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5ed2eed2a0e8d..c4d096df62fba 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -56672,6 +56672,114 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Strip TRUNCATE/ZERO_EXTEND/ANY_EXTEND wrappers, plus any `and x, C` where
+// C preserves the low log2(BW) bits (BT/BTR/BTS/BTC mask the bit index
+// implicitly to log2(BW) bits).
+static SDValue peekThroughBitPosExtTrunc(SDValue V, unsigned BW) {
+ APInt LowMask =
+ APInt::getLowBitsSet(V.getScalarValueSizeInBits(), Log2_32(BW));
+ while (true) {
+ unsigned Op = V.getOpcode();
+ if (Op == ISD::TRUNCATE || Op == ISD::ZERO_EXTEND ||
+ Op == ISD::ANY_EXTEND) {
+ V = V.getOperand(0);
+ LowMask = LowMask.zextOrTrunc(V.getScalarValueSizeInBits());
+ continue;
+ }
+ if (Op == ISD::AND) {
+ if (auto *C = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
+ if ((C->getAPIntValue() & LowMask) == LowMask) {
+ V = V.getOperand(0);
+ continue;
+ }
+ }
+ }
+ break;
+ }
+ return V;
+}
+
+// Try to merge a (X86ISD::BT Src, BitNo) with a sibling bit-modifying op on
+// Src — AND(Src, rotl -2, X), OR(Src, shl 1, X), XOR(Src, shl 1, X) — into a
+// single flag-producing X86ISD::{BTR,BTS,BTC}_FLAG node. Both BT and
+// BTR/BTS/BTC set CF from the pre-op bit value, so one instruction subsumes
+// the other. Fixes llvm#165291.
+static SDValue combineBTToBitOpFlag(SDNode *N, SelectionDAG &DAG) {
+ SDValue Src = N->getOperand(0);
+ SDValue BitNo = N->getOperand(1);
+ EVT VT = Src.getValueType();
+ SDLoc DL(N);
+
+ // BT is only emitted for legal integer widths (16/32/64); match those.
+ if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
+ return SDValue();
+
+ unsigned BW = VT.getScalarSizeInBits();
+ SDValue PeeledBitNo = peekThroughBitPosExtTrunc(BitNo, BW);
+
+ for (SDNode *User : Src->users()) {
+ if (User == N)
+ continue;
+ unsigned UOpc = User->getOpcode();
+ if (UOpc != ISD::AND && UOpc != ISD::OR && UOpc != ISD::XOR)
+ continue;
+ if (User->getValueType(0) != VT)
+ continue;
+
+ // Identify which operand of User is Src; the other is the mask.
+ SDValue UOp0 = User->getOperand(0);
+ SDValue UOp1 = User->getOperand(1);
+ SDValue Mask;
+ if (UOp0 == SDValue(Src.getNode(), Src.getResNo()))
+ Mask = UOp1;
+ else if (UOp1 == SDValue(Src.getNode(), Src.getResNo()))
+ Mask = UOp0;
+ else
+ continue;
+ // We will replace the mask's consumer (User); require the mask to have no
+ // other live uses so we can drop it.
+ if (!Mask.hasOneUse())
+ continue;
+
+ unsigned FlagOp = 0;
+ SDValue ShAmt;
+ if (UOpc == ISD::AND && Mask.getOpcode() == ISD::ROTL) {
+ // (and Src, (rotl -2, X)) — clears bit X.
+ if (auto *C = dyn_cast<ConstantSDNode>(Mask.getOperand(0)))
+ if (C->getAPIntValue() == APInt::getAllOnes(BW) - 1) {
+ FlagOp = X86ISD::BTR_FLAG;
+ ShAmt = Mask.getOperand(1);
+ }
+ } else if ((UOpc == ISD::OR || UOpc == ISD::XOR) &&
+ Mask.getOpcode() == ISD::SHL) {
+ // (or/xor Src, (shl 1, X)) — sets / flips bit X.
+ if (auto *C = dyn_cast<ConstantSDNode>(Mask.getOperand(0)))
+ if (C->getAPIntValue() == 1) {
+ FlagOp = UOpc == ISD::OR ? X86ISD::BTS_FLAG : X86ISD::BTC_FLAG;
+ ShAmt = Mask.getOperand(1);
+ }
+ }
+ if (!FlagOp)
+ continue;
+
+ // The BT and the bit-op must address the same bit. They can differ only
+ // by truncation/extension or an AND that preserves the low log2(BW) bits.
+ if (peekThroughBitPosExtTrunc(ShAmt, BW) != PeeledBitNo)
+ continue;
+
+ // BTR/BTS/BTC *rr take the bit index in a register of the same width as
+ // the source. Extend or truncate to VT to match the instruction signature.
+ SDValue BN = DAG.getZExtOrTrunc(BitNo, DL, VT);
+ SDValue New =
+ DAG.getNode(FlagOp, DL, DAG.getVTList(VT, MVT::i32), Src, BN);
+ // Reroute the value output through User's consumers.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(User, 0), New.getValue(0));
+ // Return the flags output so combineBT installs it as N's replacement.
+ return New.getValue(1);
+ }
+ return SDValue();
+}
+
static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
SDValue N1 = N->getOperand(1);
@@ -56685,6 +56793,9 @@ static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
return SDValue(N, 0);
}
+ if (SDValue V = combineBTToBitOpFlag(N, DAG))
+ return V;
+
return SDValue();
}
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index ebbfa48d2660c..735bc57e8eaba 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -2072,6 +2072,23 @@ defm : OneBitPats<GR16, i16, BTR16rr, BTS16rr, BTC16rr, shiftMask16>;
defm : OneBitPats<GR32, i32, BTR32rr, BTS32rr, BTC32rr, shiftMask32>;
defm : OneBitPats<GR64, i64, BTR64rr, BTS64rr, BTC64rr, shiftMask64>;
+// Flag-producing variants: reuse the BTR/BTS/BTC encodings so CF can replace
+// a separate X86ISD::BT. Emitted by a DAG combine in combineBT.
+multiclass OneBitFlagPats<RegisterClass rc, Instruction btr, Instruction bts,
+ Instruction btc, SDNode btr_flag, SDNode bts_flag,
+ SDNode btc_flag> {
+ def : Pat<(btr_flag rc:$src1, rc:$src2), (btr rc:$src1, rc:$src2)>;
+ def : Pat<(bts_flag rc:$src1, rc:$src2), (bts rc:$src1, rc:$src2)>;
+ def : Pat<(btc_flag rc:$src1, rc:$src2), (btc rc:$src1, rc:$src2)>;
+}
+
+defm : OneBitFlagPats<GR16, BTR16rr, BTS16rr, BTC16rr,
+ X86btr_flag, X86bts_flag, X86btc_flag>;
+defm : OneBitFlagPats<GR32, BTR32rr, BTS32rr, BTC32rr,
+ X86btr_flag, X86bts_flag, X86btc_flag>;
+defm : OneBitFlagPats<GR64, BTR64rr, BTS64rr, BTC64rr,
+ X86btr_flag, X86bts_flag, X86btc_flag>;
+
//===----------------------------------------------------------------------===//
// EFLAGS-defining Patterns
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td
index bae1211e51330..9c7832d5ba11a 100644
--- a/llvm/lib/Target/X86/X86InstrFragments.td
+++ b/llvm/lib/Target/X86/X86InstrFragments.td
@@ -165,6 +165,13 @@ let IsStrictFP = true in {
// X86 bit-test instructions.
def X86bt : SDNode<"X86ISD::BT", SDTX86CmpTest>;
+// X86 bit-test-and-modify instructions: res, EFLAGS = op src, bitno.
+// CF is set from the pre-operation bit value; BT on the same operands is
+// therefore redundant after one of these nodes.
+def X86btr_flag : SDNode<"X86ISD::BTR_FLAG", SDTBinaryArithWithFlags>;
+def X86bts_flag : SDNode<"X86ISD::BTS_FLAG", SDTBinaryArithWithFlags>;
+def X86btc_flag : SDNode<"X86ISD::BTC_FLAG", SDTBinaryArithWithFlags>;
+
// Conditional compare instructions
def X86ccmp : SDNode<"X86ISD::CCMP", SDTX86Ccmp>;
def X86ctest : SDNode<"X86ISD::CTEST", SDTX86Ccmp>;
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index 96ccc7b0f7527..6767cc45a2c5e 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -38,24 +38,18 @@ define i1 @test_eq_i32(ptr %word, i32 %position) nounwind {
define i1 @complement_ne_i32(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_ne_i32:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: btcl %eax, %esi
-; X86-NEXT: btl %eax, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: btcl %eax, %edx
; X86-NEXT: setb %al
-; X86-NEXT: movl %esi, (%ecx)
-; X86-NEXT: popl %esi
+; X86-NEXT: movl %edx, (%ecx)
; X86-NEXT: retl
;
; X64-LABEL: complement_ne_i32:
; X64: # %bb.0:
-; X64-NEXT: movl (%rdi), %eax
-; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: movl (%rdi), %ecx
; X64-NEXT: btcl %esi, %ecx
-; X64-NEXT: btl %esi, %eax
; X64-NEXT: setb %al
; X64-NEXT: movl %ecx, (%rdi)
; X64-NEXT: retq
@@ -72,24 +66,18 @@ define i1 @complement_ne_i32(ptr %word, i32 %position) nounwind {
define i1 @reset_eq_i32(ptr %word, i32 %position) nounwind {
; X86-LABEL: reset_eq_i32:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: btrl %eax, %esi
-; X86-NEXT: btl %eax, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: btrl %eax, %edx
; X86-NEXT: setae %al
-; X86-NEXT: movl %esi, (%ecx)
-; X86-NEXT: popl %esi
+; X86-NEXT: movl %edx, (%ecx)
; X86-NEXT: retl
;
; X64-LABEL: reset_eq_i32:
; X64: # %bb.0:
-; X64-NEXT: movl (%rdi), %eax
-; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: movl (%rdi), %ecx
; X64-NEXT: btrl %esi, %ecx
-; X64-NEXT: btl %esi, %eax
; X64-NEXT: setae %al
; X64-NEXT: movl %ecx, (%rdi)
; X64-NEXT: retq
@@ -107,24 +95,18 @@ define i1 @reset_eq_i32(ptr %word, i32 %position) nounwind {
define i1 @set_ne_i32(ptr %word, i32 %position) nounwind {
; X86-LABEL: set_ne_i32:
; X86: # %bb.0:
-; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl (%ecx), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: btsl %eax, %esi
-; X86-NEXT: btl %eax, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: btsl %eax, %edx
; X86-NEXT: setb %al
-; X86-NEXT: movl %esi, (%ecx)
-; X86-NEXT: popl %esi
+; X86-NEXT: movl %edx, (%ecx)
; X86-NEXT: retl
;
; X64-LABEL: set_ne_i32:
; X64: # %bb.0:
-; X64-NEXT: movl (%rdi), %eax
-; X64-NEXT: movl %eax, %ecx
+; X64-NEXT: movl (%rdi), %ecx
; X64-NEXT: btsl %esi, %ecx
-; X64-NEXT: btl %esi, %eax
; X64-NEXT: setb %al
; X64-NEXT: movl %ecx, (%rdi)
; X64-NEXT: retq
@@ -145,14 +127,12 @@ define i1 @init_eq_i32(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: shll %cl, %eax
-; X86-NEXT: movl (%edx), %esi
-; X86-NEXT: movl %esi, %edi
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: shll %cl, %esi
+; X86-NEXT: movl (%edx), %edi
; X86-NEXT: btrl %ecx, %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: btl %ecx, %esi
; X86-NEXT: setae %al
+; X86-NEXT: orl %esi, %edi
; X86-NEXT: movl %edi, (%edx)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -162,24 +142,20 @@ define i1 @init_eq_i32(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; SSE: # %bb.0:
; SSE-NEXT: movl %esi, %ecx
; SSE-NEXT: shll %cl, %edx
-; SSE-NEXT: movl (%rdi), %eax
-; SSE-NEXT: movl %eax, %esi
+; SSE-NEXT: movl (%rdi), %esi
; SSE-NEXT: btrl %ecx, %esi
-; SSE-NEXT: orl %edx, %esi
-; SSE-NEXT: btl %ecx, %eax
; SSE-NEXT: setae %al
+; SSE-NEXT: orl %edx, %esi
; SSE-NEXT: movl %esi, (%rdi)
; SSE-NEXT: retq
;
; AVX-LABEL: init_eq_i32:
; AVX: # %bb.0:
-; AVX-NEXT: shlxl %esi, %edx, %eax
-; AVX-NEXT: movl (%rdi), %ecx
-; AVX-NEXT: movl %ecx, %edx
+; AVX-NEXT: shlxl %esi, %edx, %ecx
+; AVX-NEXT: movl (%rdi), %edx
; AVX-NEXT: btrl %esi, %edx
-; AVX-NEXT: orl %eax, %edx
-; AVX-NEXT: btl %esi, %ecx
; AVX-NEXT: setae %al
+; AVX-NEXT: orl %ecx, %edx
; AVX-NEXT: movl %edx, (%rdi)
; AVX-NEXT: retq
%ofs = and i32 %position, 31
@@ -232,29 +208,24 @@ define i1 @test_ne_i64(ptr %word, i32 %position) nounwind {
define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_ne_i64:
; X86: # %bb.0:
-; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: andl $32, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: andl $32, %edx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: movl (%ecx,%edx), %esi
+; X86-NEXT: btcl %eax, %esi
; X86-NEXT: setb %al
-; X86-NEXT: btcl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: movl %esi, (%ecx,%edx)
; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
; X86-NEXT: retl
;
; X64-LABEL: complement_ne_i64:
; X64: # %bb.0:
; X64-NEXT: # kill: def $esi killed $esi def $rsi
-; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: movq (%rdi), %rcx
; X64-NEXT: btcq %rsi, %rcx
-; X64-NEXT: btq %rsi, %rax
; X64-NEXT: setb %al
; X64-NEXT: movq %rcx, (%rdi)
; X64-NEXT: retq
@@ -272,29 +243,24 @@ define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind {
define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: reset_eq_i64:
; X86: # %bb.0:
-; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: andl $32, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: andl $32, %edx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: movl (%ecx,%edx), %esi
+; X86-NEXT: btrl %eax, %esi
; X86-NEXT: setae %al
-; X86-NEXT: btrl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: movl %esi, (%ecx,%edx)
; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
; X86-NEXT: retl
;
; X64-LABEL: reset_eq_i64:
; X64: # %bb.0:
; X64-NEXT: # kill: def $esi killed $esi def $rsi
-; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: movq (%rdi), %rcx
; X64-NEXT: btrq %rsi, %rcx
-; X64-NEXT: btq %rsi, %rax
; X64-NEXT: setae %al
; X64-NEXT: movq %rcx, (%rdi)
; X64-NEXT: retq
@@ -313,29 +279,24 @@ define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind {
define i1 @set_ne_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: set_ne_i64:
; X86: # %bb.0:
-; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: andl $32, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: andl $32, %edx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: movl (%ecx,%edx), %esi
+; X86-NEXT: btsl %eax, %esi
; X86-NEXT: setb %al
-; X86-NEXT: btsl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: movl %esi, (%ecx,%edx)
; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
; X86-NEXT: retl
;
; X64-LABEL: set_ne_i64:
; X64: # %bb.0:
; X64-NEXT: # kill: def $esi killed $esi def $rsi
-; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: movq (%rdi), %rcx
; X64-NEXT: btsq %rsi, %rcx
-; X64-NEXT: btq %rsi, %rax
; X64-NEXT: setb %al
; X64-NEXT: movq %rcx, (%rdi)
; X64-NEXT: retq
@@ -362,9 +323,8 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-NEXT: andl $32, %esi
; X86-NEXT: shrl $3, %esi
; X86-NEXT: movl (%edx,%esi), %edi
-; X86-NEXT: btl %ecx, %edi
-; X86-NEXT: setae %al
; X86-NEXT: btrl %ecx, %edi
+; X86-NEXT: setae %al
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-NEXT: shll %cl, %ebx
@@ -378,14 +338,12 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; SSE-LABEL: init_eq_i64:
; SSE: # %bb.0:
; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: movl %edx, %eax
-; SSE-NEXT: shlq %cl, %rax
-; SSE-NEXT: movq (%rdi), %rdx
-; SSE-NEXT: movq %rdx, %rsi
+; SSE-NEXT: movl %edx, %edx
+; SSE-NEXT: shlq %cl, %rdx
+; SSE-NEXT: movq (%rdi), %rsi
; SSE-NEXT: btrq %rcx, %rsi
-; SSE-NEXT: orq %rax, %rsi
-; SSE-NEXT: btq %rcx, %rdx
; SSE-NEXT: setae %al
+; SSE-NEXT: orq %rdx, %rsi
; SSE-NEXT: movq %rsi, (%rdi)
; SSE-NEXT: retq
;
@@ -393,13 +351,11 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; AVX: # %bb.0:
; AVX-NEXT: # kill: def $esi killed $esi def $rsi
; AVX-NEXT: movl %edx, %eax
-; AVX-NEXT: shlxq %rsi, %rax, %rax
-; AVX-NEXT: movq (%rdi), %rcx
-; AVX-NEXT: movq %rcx, %rdx
+; AVX-NEXT: shlxq %rsi, %rax, %rcx
+; AVX-NEXT: movq (%rdi), %rdx
; AVX-NEXT: btrq %rsi, %rdx
-; AVX-NEXT: orq %rax, %rdx
-; AVX-NEXT: btq %rsi, %rcx
; AVX-NEXT: setae %al
+; AVX-NEXT: orq %rcx, %rdx
; AVX-NEXT: movq %rdx, (%rdi)
; AVX-NEXT: retq
%rem = and i32 %position, 63
@@ -455,20 +411,17 @@ define i1 @test_ne_i128(ptr %word, i32 %position) nounwind {
define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_ne_i128:
; X86: # %bb.0:
-; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: andl $96, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: andl $96, %edx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: movl (%ecx,%edx), %esi
+; X86-NEXT: btcl %eax, %esi
; X86-NEXT: setb %al
-; X86-NEXT: btcl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: movl %esi, (%ecx,%edx)
; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
; X86-NEXT: retl
;
; X64-LABEL: complement_ne_i128:
@@ -477,9 +430,8 @@ define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind {
; X64-NEXT: andl $96, %ecx
; X64-NEXT: shrl $3, %ecx
; X64-NEXT: movl (%rdi,%rcx), %edx
-; X64-NEXT: btl %esi, %edx
-; X64-NEXT: setb %al
; X64-NEXT: btcl %esi, %edx
+; X64-NEXT: setb %al
; X64-NEXT: movl %edx, (%rdi,%rcx)
; X64-NEXT: retq
%rem = and i32 %position, 127
@@ -496,20 +448,17 @@ define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind {
define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: reset_eq_i128:
; X86: # %bb.0:
-; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: andl $96, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: andl $96, %edx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: movl (%ecx,%edx), %esi
+; X86-NEXT: btrl %eax, %esi
; X86-NEXT: setae %al
-; X86-NEXT: btrl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: movl %esi, (%ecx,%edx)
; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
; X86-NEXT: retl
;
; X64-LABEL: reset_eq_i128:
@@ -518,9 +467,8 @@ define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind {
; X64-NEXT: andl $96, %ecx
; X64-NEXT: shrl $3, %ecx
; X64-NEXT: movl (%rdi,%rcx), %edx
-; X64-NEXT: btl %esi, %edx
-; X64-NEXT: setae %al
; X64-NEXT: btrl %esi, %edx
+; X64-NEXT: setae %al
; X64-NEXT: movl %edx, (%rdi,%rcx)
; X64-NEXT: retq
%rem = and i32 %position, 127
@@ -538,20 +486,17 @@ define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind {
define i1 @set_ne_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: set_ne_i128:
...
[truncated]
|
You can test this locally with the following command:

git-clang-format --diff origin/main HEAD --extensions cpp -- llvm/lib/Target/X86/X86ISelLowering.cpp --diff_from_common_commit

View the diff from clang-format here.

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7f885ec49..2a0d570ae 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -56774,8 +56774,7 @@ static SDValue combineBTToBitOpFlag(SDNode *N, SelectionDAG &DAG) {
// BTR/BTS/BTC *rr take the bit index in a register of the same width as
// the source. Extend or truncate to VT to match the instruction signature.
SDValue BN = DAG.getZExtOrTrunc(BitNo, DL, VT);
- SDValue New =
- DAG.getNode(FlagOp, DL, DAG.getVTList(VT, MVT::i32), Src, BN);
+ SDValue New = DAG.getNode(FlagOp, DL, DAG.getVTList(VT, MVT::i32), Src, BN);
// Reroute the value output through User's consumers.
DAG.ReplaceAllUsesOfValueWith(SDValue(User, 0), New.getValue(0));
// Return the flags output so combineBT installs it as N's replacement.
|
BTR/BTS/BTC set CF from the pre-operation bit value, so a subsequent BT
on the same source and bit index produces redundant EFLAGS. We were
emitting both, e.g. for `(and ld, ~bit) | (and ld, bit) != 0` we got:
btrl %esi, %ecx
btl %esi, %eax
setae %al
Introduce three flag-producing DAG nodes X86ISD::BTR/BTS/BTC that model
the register-register BTR/BTS/BTC as `(res, EFLAGS) = op src, bitno`
(the atomic locked variants live under X86ISD::LBTR/LBTS/LBTC already),
pattern-match them to the existing BTR/BTS/BTC encodings, and add a DAG
combine in combineBT that fuses an X86ISD::BT with a sibling
AND(Src, rotl -2, X) / OR(Src, shl 1, X) / XOR(Src, shl 1, X) on the
same source into a single flag-producing node. The bit-position operand
can differ from BT's by trunc/zext/and-with-mask that preserves the low
log2(BW) bits (BT already masks those implicitly), so peek through
those wrappers when matching.
After the fix the example lowers to:
btrl %esi, %ecx
setae %al
Fixes llvm#165291.
184a3a3 to
e9f8d75
Compare
Fixes #165291.
BTR/BTS/BTC set CF from the pre-operation bit value, so a subsequent BT on the same source and bit index produces the same CF and is redundant. We were emitting both.
Before:
After:
Approach
- Introduce three flag-producing DAG nodes `X86ISD::{BTR,BTS,BTC}_FLAG` that model the register-register BTR/BTS/BTC as `(res, EFLAGS) = op src, bitno`.
- Add a DAG combine in `combineBT` that fuses an `X86ISD::BT` with a sibling `AND(Src, rotl -2, X)` / `OR(Src, shl 1, X)` / `XOR(Src, shl 1, X)` on the same source into a single flag-producing node. The bit-position operand can differ from BT's by trunc/zext or by an `and x, C` that preserves the low `log2(BW)` bits (BT already masks those implicitly), so peek through those wrappers when matching.
- `llvm/test/CodeGen/X86/bittest-big-integer.ll` was regenerated and shrinks by 78 CHECK lines; dozens of `btl` instructions collapsed into the preceding bit-modify. All 5442 X86 CodeGen tests pass.