diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5ed2eed2a0e8d..8b54fbdad0e03 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -56672,6 +56672,127 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+// Strip TRUNCATE/ZERO_EXTEND/ANY_EXTEND wrappers and `and x, C` where C
+// preserves the low log2(BW) bits — these are transparent to BT/BTR/BTS/BTC,
+// which implicitly mask the bit index to log2(BW) bits.
+//
+// \p V is the bit-index value to peel and \p BW is the bit width of the value
+// being tested. Returns the innermost value reached; two indices that peel to
+// the same value agree in their low log2(BW) bits.
+static SDValue peekThroughBitPosExtTrunc(SDValue V, unsigned BW) {
+  const unsigned IdxBits = Log2_32(BW);
+  for (;;) {
+    unsigned Op = V.getOpcode();
+    if (Op == ISD::TRUNCATE || Op == ISD::ZERO_EXTEND ||
+        Op == ISD::ANY_EXTEND) {
+      SDValue Inner = V.getOperand(0);
+      // A wrapper whose narrow side has fewer than log2(BW) bits cannot carry
+      // the whole bit index unchanged (the bits above it are zero or
+      // undefined after an extend), so it is not transparent; stop here.
+      if (V.getScalarValueSizeInBits() < IdxBits ||
+          Inner.getScalarValueSizeInBits() < IdxBits)
+        return V;
+      V = Inner;
+      continue;
+    }
+    if (Op == ISD::AND) {
+      // Only a constant mask that keeps every index bit is a no-op for BT*.
+      auto *C = dyn_cast<ConstantSDNode>(V.getOperand(1));
+      if (C && C->getAPIntValue().getBitWidth() >= IdxBits &&
+          APInt::getLowBitsSet(C->getAPIntValue().getBitWidth(), IdxBits)
+              .isSubsetOf(C->getAPIntValue())) {
+        V = V.getOperand(0);
+        continue;
+      }
+    }
+    return V;
+  }
+}
+
+// Try to merge a (X86ISD::BT Src, BitNo) with a sibling bit-modifying op on
+// Src — AND(Src, rotl -2, X), OR(Src, shl 1, X), XOR(Src, shl 1, X) — into a
+// single flag-producing X86ISD::{BTR,BTS,BTC} node. Both BT and BTR/BTS/BTC
+// set CF from the pre-op bit value, so one instruction subsumes the other.
+// Fixes llvm#165291.
+//
+// \p N is the X86ISD::BT node. On success the sibling op's value users are
+// rewired to result 0 of the new node and its flags result (result 1) is
+// returned as the replacement for \p N; otherwise an empty SDValue is returned.
+static SDValue combineBTToBitOpFlag(SDNode *N, SelectionDAG &DAG) {
+  SDValue Src = N->getOperand(0);
+  SDValue BitNo = N->getOperand(1);
+  EVT VT = Src.getValueType();
+  SDLoc DL(N);
+
+  // BT is only emitted for legal integer widths (16/32/64); match those.
+  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  unsigned BW = VT.getScalarSizeInBits();
+  SDValue PeeledBitNo = peekThroughBitPosExtTrunc(BitNo, BW);
+
+  for (SDNode *User : Src->users()) {
+    if (User == N)
+      continue;
+    unsigned UOpc = User->getOpcode();
+    if (UOpc != ISD::AND && UOpc != ISD::OR && UOpc != ISD::XOR)
+      continue;
+    if (User->getValueType(0) != VT)
+      continue;
+
+    // Identify which operand of User is Src; the other is the mask.
+    SDValue UOp0 = User->getOperand(0);
+    SDValue UOp1 = User->getOperand(1);
+    SDValue Mask;
+    if (UOp0 == Src)
+      Mask = UOp1;
+    else if (UOp1 == Src)
+      Mask = UOp0;
+    else
+      continue;
+    // We will replace the mask's consumer (User); require the mask to have no
+    // other live uses so we can drop it.
+    if (!Mask.hasOneUse())
+      continue;
+
+    unsigned FlagOp = 0;
+    SDValue ShAmt;
+    if (UOpc == ISD::AND && Mask.getOpcode() == ISD::ROTL) {
+      // (and Src, (rotl -2, X)) — clears bit X.
+      if (auto *C = dyn_cast<ConstantSDNode>(Mask.getOperand(0)))
+        if (C->getAPIntValue() == APInt::getAllOnes(BW) - 1) {
+          FlagOp = X86ISD::BTR;
+          ShAmt = Mask.getOperand(1);
+        }
+    } else if ((UOpc == ISD::OR || UOpc == ISD::XOR) &&
+               Mask.getOpcode() == ISD::SHL) {
+      // (or/xor Src, (shl 1, X)) — sets / flips bit X.
+      if (auto *C = dyn_cast<ConstantSDNode>(Mask.getOperand(0)))
+        if (C->getAPIntValue() == 1) {
+          FlagOp = UOpc == ISD::OR ? X86ISD::BTS : X86ISD::BTC;
+          ShAmt = Mask.getOperand(1);
+        }
+    }
+    if (!FlagOp)
+      continue;
+
+    // The BT and the bit-op must address the same bit. They can differ only
+    // by truncation/extension or an AND that preserves the low log2(BW) bits.
+    if (peekThroughBitPosExtTrunc(ShAmt, BW) != PeeledBitNo)
+      continue;
+
+    // BTR/BTS/BTC *rr take the bit index in a register of the same width as
+    // the source. Extend or truncate to VT to match the instruction signature.
+    SDValue BN = DAG.getZExtOrTrunc(BitNo, DL, VT);
+    SDValue New = DAG.getNode(FlagOp, DL, DAG.getVTList(VT, MVT::i32), Src, BN);
+    // Reroute the value output through User's consumers.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(User, 0), New.getValue(0));
+    // Return the flags output so combineBT installs it as N's replacement.
+    return New.getValue(1);
+  }
+  return SDValue();
+}
+
 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI) {
   SDValue N1 = N->getOperand(1);
@@ -56685,6 +56806,9 @@ static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
     return SDValue(N, 0);
   }
 
+  if (SDValue V = combineBTToBitOpFlag(N, DAG))
+    return V;
+
   return SDValue();
 }
 
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index ebbfa48d2660c..735bc57e8eaba 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -2072,6 +2072,25 @@ defm : OneBitPats<GR16, i16, BTR16rr, BTS16rr, BTC16rr, shiftMask16>;
 defm : OneBitPats<GR32, i32, BTR32rr, BTS32rr, BTC32rr, shiftMask32>;
 defm : OneBitPats<GR64, i64, BTR64rr, BTS64rr, BTC64rr, shiftMask64>;
 
+// Flag-producing variants: reuse the BTR/BTS/BTC encodings so CF can replace
+// a separate X86ISD::BT. Emitted by a DAG combine in combineBT.
+// NOTE(review): the template parameter/argument lists below were reconstructed
+// from the pattern bodies — confirm against the upstream patch.
+multiclass OneBitFlagPats<RegisterClass rc, SDNode btr_flag, SDNode bts_flag,
+                          SDNode btc_flag, Instruction btr, Instruction bts,
+                          Instruction btc> {
+  def : Pat<(btr_flag rc:$src1, rc:$src2), (btr rc:$src1, rc:$src2)>;
+  def : Pat<(bts_flag rc:$src1, rc:$src2), (bts rc:$src1, rc:$src2)>;
+  def : Pat<(btc_flag rc:$src1, rc:$src2), (btc rc:$src1, rc:$src2)>;
+}
+
+defm : OneBitFlagPats<GR16, X86btr_flag, X86bts_flag, X86btc_flag, BTR16rr,
+                      BTS16rr, BTC16rr>;
+defm : OneBitFlagPats<GR32, X86btr_flag, X86bts_flag, X86btc_flag, BTR32rr,
+                      BTS32rr, BTC32rr>;
+defm : OneBitFlagPats<GR64, X86btr_flag, X86bts_flag, X86btc_flag, BTR64rr,
+                      BTS64rr, BTC64rr>;
+
 //===----------------------------------------------------------------------===//
 // EFLAGS-defining Patterns
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td
index bae1211e51330..e096aa1d074fa 100644
--- a/llvm/lib/Target/X86/X86InstrFragments.td
+++ b/llvm/lib/Target/X86/X86InstrFragments.td
@@ -165,6 +165,14 @@ let IsStrictFP = true in {
 
 // X86 bit-test instructions.
def X86bt : SDNode<"X86ISD::BT", SDTX86CmpTest>; +// X86 bit-test-and-modify instructions: res, EFLAGS = op src, bitno. +// CF is set from the pre-operation bit value; BT on the same operands is +// therefore redundant after one of these nodes. The atomic (locked) variants +// live under X86ISD::LBTR/LBTS/LBTC. +def X86btr_flag : SDNode<"X86ISD::BTR", SDTBinaryArithWithFlags>; +def X86bts_flag : SDNode<"X86ISD::BTS", SDTBinaryArithWithFlags>; +def X86btc_flag : SDNode<"X86ISD::BTC", SDTBinaryArithWithFlags>; + // Conditional compare instructions def X86ccmp : SDNode<"X86ISD::CCMP", SDTX86Ccmp>; def X86ctest : SDNode<"X86ISD::CTEST", SDTX86Ccmp>; diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index 96ccc7b0f7527..6767cc45a2c5e 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -38,24 +38,18 @@ define i1 @test_eq_i32(ptr %word, i32 %position) nounwind { define i1 @complement_ne_i32(ptr %word, i32 %position) nounwind { ; X86-LABEL: complement_ne_i32: ; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: btcl %eax, %esi -; X86-NEXT: btl %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: btcl %eax, %edx ; X86-NEXT: setb %al -; X86-NEXT: movl %esi, (%ecx) -; X86-NEXT: popl %esi +; X86-NEXT: movl %edx, (%ecx) ; X86-NEXT: retl ; ; X64-LABEL: complement_ne_i32: ; X64: # %bb.0: -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: movl %eax, %ecx +; X64-NEXT: movl (%rdi), %ecx ; X64-NEXT: btcl %esi, %ecx -; X64-NEXT: btl %esi, %eax ; X64-NEXT: setb %al ; X64-NEXT: movl %ecx, (%rdi) ; X64-NEXT: retq @@ -72,24 +66,18 @@ define i1 @complement_ne_i32(ptr %word, i32 %position) nounwind { define i1 @reset_eq_i32(ptr %word, i32 %position) nounwind { ; X86-LABEL: reset_eq_i32: ; X86: # %bb.0: -; X86-NEXT: pushl %esi -; 
X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: btrl %eax, %esi -; X86-NEXT: btl %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: btrl %eax, %edx ; X86-NEXT: setae %al -; X86-NEXT: movl %esi, (%ecx) -; X86-NEXT: popl %esi +; X86-NEXT: movl %edx, (%ecx) ; X86-NEXT: retl ; ; X64-LABEL: reset_eq_i32: ; X64: # %bb.0: -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: movl %eax, %ecx +; X64-NEXT: movl (%rdi), %ecx ; X64-NEXT: btrl %esi, %ecx -; X64-NEXT: btl %esi, %eax ; X64-NEXT: setae %al ; X64-NEXT: movl %ecx, (%rdi) ; X64-NEXT: retq @@ -107,24 +95,18 @@ define i1 @reset_eq_i32(ptr %word, i32 %position) nounwind { define i1 @set_ne_i32(ptr %word, i32 %position) nounwind { ; X86-LABEL: set_ne_i32: ; X86: # %bb.0: -; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl (%ecx), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: btsl %eax, %esi -; X86-NEXT: btl %eax, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: btsl %eax, %edx ; X86-NEXT: setb %al -; X86-NEXT: movl %esi, (%ecx) -; X86-NEXT: popl %esi +; X86-NEXT: movl %edx, (%ecx) ; X86-NEXT: retl ; ; X64-LABEL: set_ne_i32: ; X64: # %bb.0: -; X64-NEXT: movl (%rdi), %eax -; X64-NEXT: movl %eax, %ecx +; X64-NEXT: movl (%rdi), %ecx ; X64-NEXT: btsl %esi, %ecx -; X64-NEXT: btl %esi, %eax ; X64-NEXT: setb %al ; X64-NEXT: movl %ecx, (%rdi) ; X64-NEXT: retq @@ -145,14 +127,12 @@ define i1 @init_eq_i32(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: shll %cl, %eax -; X86-NEXT: movl (%edx), %esi -; X86-NEXT: movl %esi, %edi +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %esi +; X86-NEXT: shll %cl, %esi +; X86-NEXT: movl (%edx), %edi ; X86-NEXT: btrl %ecx, %edi -; X86-NEXT: orl %eax, %edi -; 
X86-NEXT: btl %ecx, %esi ; X86-NEXT: setae %al +; X86-NEXT: orl %esi, %edi ; X86-NEXT: movl %edi, (%edx) ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi @@ -162,24 +142,20 @@ define i1 @init_eq_i32(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; SSE: # %bb.0: ; SSE-NEXT: movl %esi, %ecx ; SSE-NEXT: shll %cl, %edx -; SSE-NEXT: movl (%rdi), %eax -; SSE-NEXT: movl %eax, %esi +; SSE-NEXT: movl (%rdi), %esi ; SSE-NEXT: btrl %ecx, %esi -; SSE-NEXT: orl %edx, %esi -; SSE-NEXT: btl %ecx, %eax ; SSE-NEXT: setae %al +; SSE-NEXT: orl %edx, %esi ; SSE-NEXT: movl %esi, (%rdi) ; SSE-NEXT: retq ; ; AVX-LABEL: init_eq_i32: ; AVX: # %bb.0: -; AVX-NEXT: shlxl %esi, %edx, %eax -; AVX-NEXT: movl (%rdi), %ecx -; AVX-NEXT: movl %ecx, %edx +; AVX-NEXT: shlxl %esi, %edx, %ecx +; AVX-NEXT: movl (%rdi), %edx ; AVX-NEXT: btrl %esi, %edx -; AVX-NEXT: orl %eax, %edx -; AVX-NEXT: btl %esi, %ecx ; AVX-NEXT: setae %al +; AVX-NEXT: orl %ecx, %edx ; AVX-NEXT: movl %edx, (%rdi) ; AVX-NEXT: retq %ofs = and i32 %position, 31 @@ -232,29 +208,24 @@ define i1 @test_ne_i64(ptr %word, i32 %position) nounwind { define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: complement_ne_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: andl $32, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: btl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $32, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: movl (%ecx,%edx), %esi +; X86-NEXT: btcl %eax, %esi ; X86-NEXT: setb %al -; X86-NEXT: btcl %edx, %edi -; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: movl %esi, (%ecx,%edx) ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: complement_ne_i64: ; X64: # %bb.0: ; X64-NEXT: # kill: def $esi killed $esi def $rsi -; X64-NEXT: movq (%rdi), %rax -; X64-NEXT: 
movq %rax, %rcx +; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: btcq %rsi, %rcx -; X64-NEXT: btq %rsi, %rax ; X64-NEXT: setb %al ; X64-NEXT: movq %rcx, (%rdi) ; X64-NEXT: retq @@ -272,29 +243,24 @@ define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind { define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: reset_eq_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: andl $32, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: btl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $32, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: movl (%ecx,%edx), %esi +; X86-NEXT: btrl %eax, %esi ; X86-NEXT: setae %al -; X86-NEXT: btrl %edx, %edi -; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: movl %esi, (%ecx,%edx) ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: reset_eq_i64: ; X64: # %bb.0: ; X64-NEXT: # kill: def $esi killed $esi def $rsi -; X64-NEXT: movq (%rdi), %rax -; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: btrq %rsi, %rcx -; X64-NEXT: btq %rsi, %rax ; X64-NEXT: setae %al ; X64-NEXT: movq %rcx, (%rdi) ; X64-NEXT: retq @@ -313,29 +279,24 @@ define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind { define i1 @set_ne_i64(ptr %word, i32 %position) nounwind { ; X86-LABEL: set_ne_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: andl $32, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: btl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $32, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: movl (%ecx,%edx), %esi +; X86-NEXT: btsl %eax, %esi ; X86-NEXT: setb %al -; 
X86-NEXT: btsl %edx, %edi -; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: movl %esi, (%ecx,%edx) ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: set_ne_i64: ; X64: # %bb.0: ; X64-NEXT: # kill: def $esi killed $esi def $rsi -; X64-NEXT: movq (%rdi), %rax -; X64-NEXT: movq %rax, %rcx +; X64-NEXT: movq (%rdi), %rcx ; X64-NEXT: btsq %rsi, %rcx -; X64-NEXT: btq %rsi, %rax ; X64-NEXT: setb %al ; X64-NEXT: movq %rcx, (%rdi) ; X64-NEXT: retq @@ -362,9 +323,8 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: andl $32, %esi ; X86-NEXT: shrl $3, %esi ; X86-NEXT: movl (%edx,%esi), %edi -; X86-NEXT: btl %ecx, %edi -; X86-NEXT: setae %al ; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: setae %al ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NEXT: shll %cl, %ebx @@ -378,14 +338,12 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; SSE-LABEL: init_eq_i64: ; SSE: # %bb.0: ; SSE-NEXT: movl %esi, %ecx -; SSE-NEXT: movl %edx, %eax -; SSE-NEXT: shlq %cl, %rax -; SSE-NEXT: movq (%rdi), %rdx -; SSE-NEXT: movq %rdx, %rsi +; SSE-NEXT: movl %edx, %edx +; SSE-NEXT: shlq %cl, %rdx +; SSE-NEXT: movq (%rdi), %rsi ; SSE-NEXT: btrq %rcx, %rsi -; SSE-NEXT: orq %rax, %rsi -; SSE-NEXT: btq %rcx, %rdx ; SSE-NEXT: setae %al +; SSE-NEXT: orq %rdx, %rsi ; SSE-NEXT: movq %rsi, (%rdi) ; SSE-NEXT: retq ; @@ -393,13 +351,11 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; AVX: # %bb.0: ; AVX-NEXT: # kill: def $esi killed $esi def $rsi ; AVX-NEXT: movl %edx, %eax -; AVX-NEXT: shlxq %rsi, %rax, %rax -; AVX-NEXT: movq (%rdi), %rcx -; AVX-NEXT: movq %rcx, %rdx +; AVX-NEXT: shlxq %rsi, %rax, %rcx +; AVX-NEXT: movq (%rdi), %rdx ; AVX-NEXT: btrq %rsi, %rdx -; AVX-NEXT: orq %rax, %rdx -; AVX-NEXT: btq %rsi, %rcx ; AVX-NEXT: setae %al +; AVX-NEXT: orq %rcx, %rdx ; AVX-NEXT: movq %rdx, (%rdi) ; AVX-NEXT: retq %rem = and i32 
%position, 63 @@ -455,20 +411,17 @@ define i1 @test_ne_i128(ptr %word, i32 %position) nounwind { define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: complement_ne_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: andl $96, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: btl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $96, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: movl (%ecx,%edx), %esi +; X86-NEXT: btcl %eax, %esi ; X86-NEXT: setb %al -; X86-NEXT: btcl %edx, %edi -; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: movl %esi, (%ecx,%edx) ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: complement_ne_i128: @@ -477,9 +430,8 @@ define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind { ; X64-NEXT: andl $96, %ecx ; X64-NEXT: shrl $3, %ecx ; X64-NEXT: movl (%rdi,%rcx), %edx -; X64-NEXT: btl %esi, %edx -; X64-NEXT: setb %al ; X64-NEXT: btcl %esi, %edx +; X64-NEXT: setb %al ; X64-NEXT: movl %edx, (%rdi,%rcx) ; X64-NEXT: retq %rem = and i32 %position, 127 @@ -496,20 +448,17 @@ define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind { define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: reset_eq_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: andl $96, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: btl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $96, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: movl (%ecx,%edx), %esi +; X86-NEXT: btrl %eax, %esi ; X86-NEXT: setae %al -; X86-NEXT: btrl %edx, %edi -; X86-NEXT: movl %edi, 
(%ecx,%esi) +; X86-NEXT: movl %esi, (%ecx,%edx) ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: reset_eq_i128: @@ -518,9 +467,8 @@ define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind { ; X64-NEXT: andl $96, %ecx ; X64-NEXT: shrl $3, %ecx ; X64-NEXT: movl (%rdi,%rcx), %edx -; X64-NEXT: btl %esi, %edx -; X64-NEXT: setae %al ; X64-NEXT: btrl %esi, %edx +; X64-NEXT: setae %al ; X64-NEXT: movl %edx, (%rdi,%rcx) ; X64-NEXT: retq %rem = and i32 %position, 127 @@ -538,20 +486,17 @@ define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind { define i1 @set_ne_i128(ptr %word, i32 %position) nounwind { ; X86-LABEL: set_ne_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: andl $96, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: btl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $96, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: movl (%ecx,%edx), %esi +; X86-NEXT: btsl %eax, %esi ; X86-NEXT: setb %al -; X86-NEXT: btsl %edx, %edi -; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: movl %esi, (%ecx,%edx) ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: set_ne_i128: @@ -560,9 +505,8 @@ define i1 @set_ne_i128(ptr %word, i32 %position) nounwind { ; X64-NEXT: andl $96, %ecx ; X64-NEXT: shrl $3, %ecx ; X64-NEXT: movl (%rdi,%rcx), %edx -; X64-NEXT: btl %esi, %edx -; X64-NEXT: setb %al ; X64-NEXT: btsl %esi, %edx +; X64-NEXT: setb %al ; X64-NEXT: movl %edx, (%rdi,%rcx) ; X64-NEXT: retq %rem = and i32 %position, 127 @@ -588,9 +532,8 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: andl $96, %esi ; X86-NEXT: shrl $3, %esi ; X86-NEXT: movl (%edx,%esi), %edi -; X86-NEXT: btl %ecx, %edi -; X86-NEXT: setae %al ; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: setae %al ; 
X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NEXT: shll %cl, %ebx @@ -607,10 +550,10 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; SSE-NEXT: andl $96, %esi ; SSE-NEXT: shrl $3, %esi ; SSE-NEXT: movl (%rdi,%rsi), %r8d -; SSE-NEXT: btl %ecx, %r8d +; SSE-NEXT: btrl %ecx, %r8d ; SSE-NEXT: setae %al +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx ; SSE-NEXT: shll %cl, %edx -; SSE-NEXT: btrl %ecx, %r8d ; SSE-NEXT: orl %r8d, %edx ; SSE-NEXT: movl %edx, (%rdi,%rsi) ; SSE-NEXT: retq @@ -621,9 +564,8 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; AVX-NEXT: andl $96, %ecx ; AVX-NEXT: shrl $3, %ecx ; AVX-NEXT: movl (%rdi,%rcx), %r8d -; AVX-NEXT: btl %esi, %r8d -; AVX-NEXT: setae %al ; AVX-NEXT: btrl %esi, %r8d +; AVX-NEXT: setae %al ; AVX-NEXT: shlxl %esi, %edx, %edx ; AVX-NEXT: orl %r8d, %edx ; AVX-NEXT: movl %edx, (%rdi,%rcx) @@ -679,20 +621,17 @@ define i1 @test_ne_i512(ptr %word, i32 %position) nounwind { define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { ; X86-LABEL: complement_ne_i512: ; X86: # %bb.0: -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: andl $60, %esi -; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: btl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: andl $60, %edx +; X86-NEXT: movl (%ecx,%edx), %esi +; X86-NEXT: btcl %eax, %esi ; X86-NEXT: setb %al -; X86-NEXT: btcl %edx, %edi -; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: movl %esi, (%ecx,%edx) ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: complement_ne_i512: @@ -701,9 +640,8 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { ; X64-NEXT: shrl $3, %ecx ; X64-NEXT: andl $60, %ecx ; X64-NEXT: 
movl (%rdi,%rcx), %edx -; X64-NEXT: btl %esi, %edx -; X64-NEXT: setb %al ; X64-NEXT: btcl %esi, %edx +; X64-NEXT: setb %al ; X64-NEXT: movl %edx, (%rdi,%rcx) ; X64-NEXT: retq %rem = and i32 %position, 511 @@ -720,20 +658,17 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind { define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind { ; X86-LABEL: reset_eq_i512: ; X86: # %bb.0: -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: andl $60, %esi -; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: btl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: shrl $3, %edx +; X86-NEXT: andl $60, %edx +; X86-NEXT: movl (%ecx,%edx), %esi +; X86-NEXT: btrl %eax, %esi ; X86-NEXT: setae %al -; X86-NEXT: btrl %edx, %edi -; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: movl %esi, (%ecx,%edx) ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: reset_eq_i512: @@ -742,9 +677,8 @@ define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind { ; X64-NEXT: shrl $3, %ecx ; X64-NEXT: andl $60, %ecx ; X64-NEXT: movl (%rdi,%rcx), %edx -; X64-NEXT: btl %esi, %edx -; X64-NEXT: setae %al ; X64-NEXT: btrl %esi, %edx +; X64-NEXT: setae %al ; X64-NEXT: movl %edx, (%rdi,%rcx) ; X64-NEXT: retq %rem = and i32 %position, 511 @@ -762,20 +696,17 @@ define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind { define i1 @set_ne_i512(ptr %word, i32 %position) nounwind { ; X86-LABEL: set_ne_i512: ; X86: # %bb.0: -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shrl $3, %esi -; X86-NEXT: andl $60, %esi -; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: btl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: shrl $3, %edx +; 
X86-NEXT: andl $60, %edx +; X86-NEXT: movl (%ecx,%edx), %esi +; X86-NEXT: btsl %eax, %esi ; X86-NEXT: setb %al -; X86-NEXT: btsl %edx, %edi -; X86-NEXT: movl %edi, (%ecx,%esi) +; X86-NEXT: movl %esi, (%ecx,%edx) ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi ; X86-NEXT: retl ; ; X64-LABEL: set_ne_i512: @@ -784,9 +715,8 @@ define i1 @set_ne_i512(ptr %word, i32 %position) nounwind { ; X64-NEXT: shrl $3, %ecx ; X64-NEXT: andl $60, %ecx ; X64-NEXT: movl (%rdi,%rcx), %edx -; X64-NEXT: btl %esi, %edx -; X64-NEXT: setb %al ; X64-NEXT: btsl %esi, %edx +; X64-NEXT: setb %al ; X64-NEXT: movl %edx, (%rdi,%rcx) ; X64-NEXT: retq %rem = and i32 %position, 511 @@ -812,9 +742,8 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; X86-NEXT: shrl $3, %esi ; X86-NEXT: andl $60, %esi ; X86-NEXT: movl (%edx,%esi), %edi -; X86-NEXT: btl %ecx, %edi -; X86-NEXT: setae %al ; X86-NEXT: btrl %ecx, %edi +; X86-NEXT: setae %al ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NEXT: shll %cl, %ebx @@ -831,10 +760,10 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; SSE-NEXT: shrl $3, %esi ; SSE-NEXT: andl $60, %esi ; SSE-NEXT: movl (%rdi,%rsi), %r8d -; SSE-NEXT: btl %ecx, %r8d +; SSE-NEXT: btrl %ecx, %r8d ; SSE-NEXT: setae %al +; SSE-NEXT: # kill: def $cl killed $cl killed $ecx ; SSE-NEXT: shll %cl, %edx -; SSE-NEXT: btrl %ecx, %r8d ; SSE-NEXT: orl %r8d, %edx ; SSE-NEXT: movl %edx, (%rdi,%rsi) ; SSE-NEXT: retq @@ -845,9 +774,8 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind { ; AVX-NEXT: shrl $3, %ecx ; AVX-NEXT: andl $60, %ecx ; AVX-NEXT: movl (%rdi,%rcx), %r8d -; AVX-NEXT: btl %esi, %r8d -; AVX-NEXT: setae %al ; AVX-NEXT: btrl %esi, %r8d +; AVX-NEXT: setae %al ; AVX-NEXT: shlxl %esi, %edx, %edx ; AVX-NEXT: orl %r8d, %edx ; AVX-NEXT: movl %edx, (%rdi,%rcx) @@ -1103,28 +1031,24 @@ define <8 x i16> @complement_ne_i128_bitcast(ptr %word, i32 %position) 
nounwind define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ; X86-LABEL: reset_multiload_i128: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl (%eax), %eax ; X86-NEXT: movl %edx, %esi ; X86-NEXT: andl $96, %esi ; X86-NEXT: shrl $3, %esi ; X86-NEXT: movl (%ecx,%esi), %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: btrl %edx, %ebx -; X86-NEXT: btl %edx, %edi -; X86-NEXT: movl %ebx, (%ecx,%esi) +; X86-NEXT: btrl %edx, %edi +; X86-NEXT: movl %edi, (%ecx,%esi) ; X86-NEXT: jae .LBB23_2 ; X86-NEXT: # %bb.1: ; X86-NEXT: xorl %eax, %eax ; X86-NEXT: .LBB23_2: ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx ; X86-NEXT: retl ; ; X64-LABEL: reset_multiload_i128: @@ -1132,11 +1056,9 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind { ; X64-NEXT: movl %esi, %ecx ; X64-NEXT: andl $96, %ecx ; X64-NEXT: shrl $3, %ecx -; X64-NEXT: movl (%rdi,%rcx), %r9d -; X64-NEXT: movl %r9d, %r8d -; X64-NEXT: btrl %esi, %r8d +; X64-NEXT: movl (%rdi,%rcx), %r8d ; X64-NEXT: xorl %eax, %eax -; X64-NEXT: btl %esi, %r9d +; X64-NEXT: btrl %esi, %r8d ; X64-NEXT: jb .LBB23_2 ; X64-NEXT: # %bb.1: ; X64-NEXT: movl (%rdx), %eax