[gfx1250][SDAG] Lower unsafe bf16 divisions#149628
Merged
Conversation
Contributor
Author
This stack of pull requests is managed by Graphite. Learn more about stacking. |
Member
|
@llvm/pr-subscribers-backend-amdgpu Author: Shilei Tian (shiltian). Changes: Co-authored-by: Kosarev, Ivan <Ivan.Kosarev@amd.com>. Full diff: https://github.com/llvm/llvm-project/pull/149628.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0c76ff2ec5ea7..79487dcec3525 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -618,6 +618,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::FSIN, ISD::FROUND},
MVT::f16, Custom);
+ // BF16 - VOP1 Actions.
+ if (Subtarget->hasBF16TransInsts())
+ setOperationAction(ISD::FDIV, MVT::bf16, Custom);
+
setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote);
setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::bf16, Promote);
@@ -11172,7 +11176,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
// Without !fpmath accuracy information, we can't do more because we don't
// know exactly whether rcp is accurate enough to meet !fpmath requirement.
// f16 is always accurate enough
- if (!AllowInaccurateRcp && VT != MVT::f16)
+ if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
return SDValue();
if (CLHS->isExactlyValue(1.0)) {
@@ -11199,9 +11203,10 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
}
}
- // For f16 require afn or arcp.
+ // For f16 and bf16 require afn or arcp.
// For f32 require afn.
- if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
+ if (!AllowInaccurateRcp &&
+ ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
return SDValue();
// Turn into multiply by the reciprocal.
@@ -11592,7 +11597,7 @@ SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
if (VT == MVT::f64)
return LowerFDIV64(Op, DAG);
- if (VT == MVT::f16)
+ if (VT == MVT::f16 || VT == MVT::bf16)
return LowerFDIV16(Op, DAG);
llvm_unreachable("Unexpected type for fdiv");
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.bf16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.bf16.ll
new file mode 100644
index 0000000000000..b564417192b08
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.bf16.ll
@@ -0,0 +1,298 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1250-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1250-FAKE16 %s
+
+/* TODO: Support safe bf16 fdiv lowering.
+define bfloat @v_fdiv_bf16(bfloat %x, bfloat %y) {
+ %fdiv = fdiv bfloat %x, %y
+ ret bfloat %fdiv
+}
+*/
+
+define bfloat @v_rcp_bf16(bfloat %x) {
+; GFX1250-TRUE16-LABEL: v_rcp_bf16:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: v_rcp_bf16:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %fdiv = fdiv bfloat 1.0, %x
+ ret bfloat %fdiv
+}
+
+define bfloat @v_rcp_bf16_abs(bfloat %x) {
+; GFX1250-TRUE16-LABEL: v_rcp_bf16_abs:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, |v0.l|
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: v_rcp_bf16_abs:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, |v0|
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %fabs = call bfloat @llvm.fabs.bf16(bfloat %x)
+ %fdiv = fdiv bfloat 1.0, %fabs
+ ret bfloat %fdiv
+}
+
+define bfloat @v_rcp_bf16_afn(bfloat %x) {
+; GFX1250-TRUE16-LABEL: v_rcp_bf16_afn:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: v_rcp_bf16_afn:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %fdiv = fdiv afn bfloat 1.0, %x
+ ret bfloat %fdiv
+}
+
+define bfloat @v_rcp_bf16_neg(bfloat %x) {
+; GFX1250-TRUE16-LABEL: v_rcp_bf16_neg:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: v_rcp_bf16_neg:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %fdiv = fdiv bfloat -1.0, %x
+ ret bfloat %fdiv
+}
+
+; TODO: Support lowering to v_rsq_bf16.
+define bfloat @v_rsq_bf16(bfloat %x) {
+; GFX1250-TRUE16-LABEL: v_rsq_bf16:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: v_rsq_bf16:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x)
+ %fdiv = fdiv contract bfloat 1.0, %sqrt
+ ret bfloat %fdiv
+}
+
+; TODO: Support lowering to v_rsq_bf16.
+define bfloat @v_rsq_bf16_neg(bfloat %x) {
+; GFX1250-TRUE16-LABEL: v_rsq_bf16_neg:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: v_rsq_bf16_neg:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x)
+ %fdiv = fdiv contract bfloat -1.0, %sqrt
+ ret bfloat %fdiv
+}
+
+; TODO: Support lowering to v_rsq_bf16.
+define <2 x bfloat> @v_rsq_bf16_multi_use(bfloat %x) {
+; GFX1250-TRUE16-LABEL: v_rsq_bf16_multi_use:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v1.l, v1.l
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v1.h, v1.l
+; GFX1250-TRUE16-NEXT: v_nop
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v0, v1
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: v_rsq_bf16_multi_use:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v1, v1
+; GFX1250-FAKE16-NEXT: v_nop
+; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x)
+ %fdiv = fdiv contract bfloat 1.0, %sqrt
+ %r = insertelement <2 x bfloat> zeroinitializer, bfloat %x, i32 0
+ %r2 = insertelement <2 x bfloat> %r, bfloat %fdiv, i32 1
+ ret <2 x bfloat> %r2
+}
+
+; TODO: Support lowering to v_rsq_bf16.
+define bfloat @v_rsq_bf16_missing_contract0(bfloat %x) {
+; GFX1250-TRUE16-LABEL: v_rsq_bf16_missing_contract0:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: v_rsq_bf16_missing_contract0:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %sqrt = call bfloat @llvm.sqrt.bf16(bfloat %x)
+ %fdiv = fdiv contract bfloat 1.0, %sqrt
+ ret bfloat %fdiv
+}
+
+; TODO: Support lowering to v_rsq_bf16.
+define bfloat @v_rsq_bf16_missing_contract1(bfloat %x) {
+; GFX1250-TRUE16-LABEL: v_rsq_bf16_missing_contract1:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: v_rsq_bf16_missing_contract1:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x)
+ %fdiv = fdiv bfloat 1.0, %sqrt
+ ret bfloat %fdiv
+}
+
+; TODO: Support lowering to v_rsq_bf16.
+define bfloat @v_neg_rsq_bf16_missing_contract1(bfloat %x) {
+; GFX1250-TRUE16-LABEL: v_neg_rsq_bf16_missing_contract1:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: v_neg_rsq_bf16_missing_contract1:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %sqrt = call contract bfloat @llvm.sqrt.bf16(bfloat %x)
+ %fdiv = fdiv bfloat -1.0, %sqrt
+ ret bfloat %fdiv
+}
+
+define <2 x bfloat> @v_rsq_v2bf16(<2 x bfloat> %a) {
+; GFX1250-TRUE16-LABEL: v_rsq_v2bf16:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.h, v0.h
+; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_2)
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.h, v0.h
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: v_rsq_v2bf16:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2)
+; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v1
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e32 v1, v1
+; GFX1250-FAKE16-NEXT: v_nop
+; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %sqrt = call contract <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> %a)
+ %fdiv = fdiv contract <2 x bfloat> <bfloat 1.0, bfloat 1.0>, %sqrt
+ ret <2 x bfloat> %fdiv
+}
+
+define <2 x bfloat> @v_neg_rsq_v2bf16(<2 x bfloat> %a) {
+; GFX1250-TRUE16-LABEL: v_neg_rsq_v2bf16:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.h, v0.h
+; GFX1250-TRUE16-NEXT: v_sqrt_bf16_e32 v0.l, v0.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_2)
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.h, -v0.h
+; GFX1250-TRUE16-NEXT: v_rcp_bf16_e64 v0.l, -v0.l
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: v_neg_rsq_v2bf16:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v0, v0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_2)
+; GFX1250-FAKE16-NEXT: v_sqrt_bf16_e32 v1, v1
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v0, -v0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1250-FAKE16-NEXT: v_rcp_bf16_e64 v1, -v1
+; GFX1250-FAKE16-NEXT: v_nop
+; GFX1250-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %sqrt = call contract <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> %a)
+ %fdiv = fdiv contract <2 x bfloat> <bfloat -1.0, bfloat -1.0>, %sqrt
+ ret <2 x bfloat> %fdiv
+}
|
shiltian
commented
Jul 19, 2025
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1250-TRUE16 %s | ||
| ; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1250-FAKE16 %s | ||
|
|
||
| /* TODO: Support safe bf16 fdiv lowering. |
Contributor
Author
There was a problem hiding this comment.
This file has fewer tests than the downstream version because the omitted cases are not supported yet.
rampitec
approved these changes
Jul 21, 2025
arsenm
reviewed
Jul 21, 2025
4adb8ff to
46a6dd7
Compare
251b027 to
622cf01
Compare
Contributor
Author
46a6dd7 to
c0d9363
Compare
Base automatically changed from
users/shiltian/add-missing-test-case-v_cvt_f16_bf8
to
main
July 21, 2025 14:54
Co-authored-by: Kosarev, Ivan <Ivan.Kosarev@amd.com>
622cf01 to
fa52b7e
Compare
This was referenced Jul 23, 2025
mahesh-attarde
pushed a commit
to mahesh-attarde/llvm-project
that referenced
this pull request
Jul 28, 2025
Co-authored-by: Kosarev, Ivan <Ivan.Kosarev@amd.com>
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit. This suggestion is invalid because no changes were made to the code. Suggestions cannot be applied while the pull request is closed. Suggestions cannot be applied while viewing a subset of changes. Only one suggestion per line can be applied in a batch. Add this suggestion to a batch that can be applied as a single commit. Applying suggestions on deleted lines is not supported. You must change the existing code in this line in order to create a valid suggestion. Outdated suggestions cannot be applied. This suggestion has been applied or marked resolved. Suggestions cannot be applied from pending reviews. Suggestions cannot be applied on multi-line comments. Suggestions cannot be applied while the pull request is queued to merge. Suggestion cannot be applied right now. Please check back later.

Co-authored-by: Kosarev, Ivan <Ivan.Kosarev@amd.com>