[AMDGPU] Add support for v_tanh_f16 on gfx1250 #149439
Conversation
Co-authored-by: Mekhanoshin, Stanislav <Stanislav.Mekhanoshin@amd.com>
|
@llvm/pr-subscribers-mc @llvm/pr-subscribers-clang Author: Shilei Tian (shiltian) Changes: Co-authored-by: Mekhanoshin, Stanislav <Stanislav.Mekhanoshin@amd.com> Patch is 70.80 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/149439.diff 23 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 4111837d962b5..ed51f1d5de447 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -670,6 +670,7 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wait_asynccnt, "vIUs", "n", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_s_wait_tensorcnt, "vIUs", "n", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_tanhf, "ff", "nc", "tanh-insts")
+TARGET_BUILTIN(__builtin_amdgcn_tanhh, "hh", "nc", "tanh-insts")
TARGET_BUILTIN(__builtin_amdgcn_tanh_bf16, "yy", "nc", "bf16-trans-insts")
TARGET_BUILTIN(__builtin_amdgcn_rcp_bf16, "yy", "nc", "bf16-trans-insts")
TARGET_BUILTIN(__builtin_amdgcn_rsq_bf16, "yy", "nc", "bf16-trans-insts")
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index bcdb488f11639..a7d796ecccc61 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -504,6 +504,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
return Builder.CreateCall(F, { Src });
}
case AMDGPU::BI__builtin_amdgcn_tanhf:
+ case AMDGPU::BI__builtin_amdgcn_tanhh:
case AMDGPU::BI__builtin_amdgcn_tanh_bf16:
return emitBuiltinWithOneOverloadedType<1>(*this, E,
Intrinsic::amdgcn_tanh);
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index e120a46c6327b..738b7ab7f2b75 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -61,6 +61,26 @@ void test_tanh_f32(global float* out, float a)
*out = __builtin_amdgcn_tanhf(a);
}
+// CHECK-LABEL: @test_tanh_f16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store ptr addrspace(1) [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load half, ptr addrspace(1) [[TMP0]], align 2
+// CHECK-NEXT: [[TMP2:%.*]] = call half @llvm.amdgcn.tanh.f16(half [[TMP1]])
+// CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store half [[TMP2]], ptr addrspace(1) [[TMP3]], align 2
+// CHECK-NEXT: ret void
+//
+void test_tanh_f16(global half* out, global half* a)
+{
+ *out = __builtin_amdgcn_tanhh(*a);
+}
+
// CHECK-LABEL: @test_tanh_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 8c35fea8259f4..1bbbb610305e9 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -530,6 +530,10 @@ defm V_EXP_F16 : VOP1Inst_t16 <"v_exp_f16", VOP_F16_F16, AMDGPUexpf16>;
defm V_SIN_F16 : VOP1Inst_t16 <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
defm V_COS_F16 : VOP1Inst_t16 <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
+let SubtargetPredicate = HasTanhInsts in {
+defm V_TANH_F16 : VOP1Inst_t16 <"v_tanh_f16", VOP_F16_F16, int_amdgcn_tanh>;
+}
+
let SubtargetPredicate = HasBF16TransInsts in {
defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>;
defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>;
@@ -1142,6 +1146,7 @@ defm V_CVT_F32_F16 : VOP1_Real_FULL_t16_and_fake16_gfx11_gfx12<0x00b>;
defm V_MOV_B64 : VOP1_Real_FULL <GFX1250Gen, 0x1d>;
defm V_TANH_F32 : VOP1_Real_FULL<GFX1250Gen, 0x01e>;
+defm V_TANH_F16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x01f>;
defm V_TANH_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x04a>;
defm V_CVT_F32_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x072, "v_cvt_f32_bf16", "V_CVT_F32_BF16_gfx1250">;
defm V_CVT_PK_F16_FP8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x075>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll
index 81db7354757d9..dd89f80a54949 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll
@@ -7,6 +7,7 @@
; FIXME: GlobalISel does not work with bf16
declare float @llvm.amdgcn.tanh.f32(float) #0
+declare half @llvm.amdgcn.tanh.f16(half) #0
declare bfloat @llvm.amdgcn.tanh.bf16(bfloat) #0
define amdgpu_kernel void @tanh_f32(ptr addrspace(1) %out, float %src) #1 {
@@ -92,6 +93,88 @@ define amdgpu_kernel void @tanh_undef_f32(ptr addrspace(1) %out) #1 {
ret void
}
+define amdgpu_kernel void @tanh_f16(ptr addrspace(1) %out, half %src) #1 {
+; SDAG-REAL16-LABEL: tanh_f16:
+; SDAG-REAL16: ; %bb.0:
+; SDAG-REAL16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
+; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
+; SDAG-REAL16-NEXT: v_tanh_f16_e32 v0.l, s2
+; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT: s_endpgm
+;
+; SDAG-FAKE16-LABEL: tanh_f16:
+; SDAG-FAKE16: ; %bb.0:
+; SDAG-FAKE16-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
+; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-FAKE16-NEXT: v_tanh_f16_e32 v0, s2
+; SDAG-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; SDAG-FAKE16-NEXT: s_endpgm
+ %tanh = call half @llvm.amdgcn.tanh.f16(half %src) #0
+ store half %tanh, ptr addrspace(1) %out, align 2
+ ret void
+}
+
+define amdgpu_kernel void @tanh_f16_constant_4.0(ptr addrspace(1) %out) #1 {
+; SDAG-REAL16-LABEL: tanh_f16_constant_4.0:
+; SDAG-REAL16: ; %bb.0:
+; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; SDAG-REAL16-NEXT: v_tanh_f16_e32 v0.l, 4.0
+; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
+; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT: s_endpgm
+;
+; SDAG-FAKE16-LABEL: tanh_f16_constant_4.0:
+; SDAG-FAKE16: ; %bb.0:
+; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; SDAG-FAKE16-NEXT: v_tanh_f16_e32 v0, 4.0
+; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; SDAG-FAKE16-NEXT: s_endpgm
+ %tanh = call half @llvm.amdgcn.tanh.f16(half 4.0) #0
+ store half %tanh, ptr addrspace(1) %out, align 2
+ ret void
+}
+
+define amdgpu_kernel void @tanh_f16_constant_100.0(ptr addrspace(1) %out) #1 {
+; SDAG-REAL16-LABEL: tanh_f16_constant_100.0:
+; SDAG-REAL16: ; %bb.0:
+; SDAG-REAL16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; SDAG-REAL16-NEXT: v_tanh_f16_e32 v0.l, 0x5640
+; SDAG-REAL16-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-REAL16-NEXT: s_wait_kmcnt 0x0
+; SDAG-REAL16-NEXT: flat_store_b16 v1, v0, s[0:1]
+; SDAG-REAL16-NEXT: s_endpgm
+;
+; SDAG-FAKE16-LABEL: tanh_f16_constant_100.0:
+; SDAG-FAKE16: ; %bb.0:
+; SDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; SDAG-FAKE16-NEXT: v_tanh_f16_e32 v0, 0x5640
+; SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; SDAG-FAKE16-NEXT: global_store_b16 v1, v0, s[0:1]
+; SDAG-FAKE16-NEXT: s_endpgm
+ %tanh = call half @llvm.amdgcn.tanh.f16(half 100.0) #0
+ store half %tanh, ptr addrspace(1) %out, align 2
+ ret void
+}
+
+define amdgpu_kernel void @tanh_undef_f16(ptr addrspace(1) %out) #1 {
+; SDAG-REAL16-LABEL: tanh_undef_f16:
+; SDAG-REAL16: ; %bb.0:
+; SDAG-REAL16-NEXT: s_endpgm
+;
+; SDAG-FAKE16-LABEL: tanh_undef_f16:
+; SDAG-FAKE16: ; %bb.0:
+; SDAG-FAKE16-NEXT: s_endpgm
+ %tanh = call half @llvm.amdgcn.tanh.f16(half undef)
+ store half %tanh, ptr addrspace(1) %out, align 2
+ ret void
+}
+
define amdgpu_kernel void @tanh_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
; SDAG-REAL16-LABEL: tanh_bf16:
; SDAG-REAL16: ; %bb.0:
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s
index f9e217d1f0361..279bb262bff04 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s
@@ -73,6 +73,51 @@ v_tanh_f32 v5, src_scc
v_tanh_f32 v255, 0xaf123456
// GFX1250: v_tanh_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x3c,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+v_tanh_f16 v5, v1
+// GFX1250: v_tanh_f16_e32 v5, v1 ; encoding: [0x01,0x3f,0x0a,0x7e]
+
+v_tanh_f16 v5, v127
+// GFX1250: v_tanh_f16_e32 v5, v127 ; encoding: [0x7f,0x3f,0x0a,0x7e]
+
+v_tanh_f16 v5, s1
+// GFX1250: v_tanh_f16_e32 v5, s1 ; encoding: [0x01,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, s105
+// GFX1250: v_tanh_f16_e32 v5, s105 ; encoding: [0x69,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, vcc_lo
+// GFX1250: v_tanh_f16_e32 v5, vcc_lo ; encoding: [0x6a,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, vcc_hi
+// GFX1250: v_tanh_f16_e32 v5, vcc_hi ; encoding: [0x6b,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, ttmp15
+// GFX1250: v_tanh_f16_e32 v5, ttmp15 ; encoding: [0x7b,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, m0
+// GFX1250: v_tanh_f16_e32 v5, m0 ; encoding: [0x7d,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, exec_lo
+// GFX1250: v_tanh_f16_e32 v5, exec_lo ; encoding: [0x7e,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, exec_hi
+// GFX1250: v_tanh_f16_e32 v5, exec_hi ; encoding: [0x7f,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, null
+// GFX1250: v_tanh_f16_e32 v5, null ; encoding: [0x7c,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, -1
+// GFX1250: v_tanh_f16_e32 v5, -1 ; encoding: [0xc1,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, 0.5
+// GFX1250: v_tanh_f16_e32 v5, 0.5 ; encoding: [0xf0,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, src_scc
+// GFX1250: v_tanh_f16_e32 v5, src_scc ; encoding: [0xfd,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v127, 0x8000
+// GFX1250: v_tanh_f16_e32 v127, 0x8000 ; encoding: [0xff,0x3e,0xfe,0x7e,0x00,0x80,0x00,0x00]
+
v_tanh_bf16 v5, v1
// GFX1250: v_tanh_bf16_e32 v5, v1 ; encoding: [0x01,0x95,0x0a,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s
index d51ef68bf1e19..76272d25d92d4 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1.s
@@ -73,6 +73,54 @@ v_tanh_f32 v5, src_scc
v_tanh_f32 v255, 0xaf123456
// GFX1250: v_tanh_f32_e32 v255, 0xaf123456 ; encoding: [0xff,0x3c,0xfe,0x7f,0x56,0x34,0x12,0xaf]
+v_tanh_f16 v5, v1
+// GFX1250: v_tanh_f16_e32 v5, v1 ; encoding: [0x01,0x3f,0x0a,0x7e]
+
+v_tanh_f16 v5, v127
+// GFX1250: v_tanh_f16_e32 v5, v127 ; encoding: [0x7f,0x3f,0x0a,0x7e]
+
+v_tanh_f16 v5, s1
+// GFX1250: v_tanh_f16_e32 v5, s1 ; encoding: [0x01,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, s105
+// GFX1250: v_tanh_f16_e32 v5, s105 ; encoding: [0x69,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, vcc_lo
+// GFX1250: v_tanh_f16_e32 v5, vcc_lo ; encoding: [0x6a,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, vcc_hi
+// GFX1250: v_tanh_f16_e32 v5, vcc_hi ; encoding: [0x6b,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, ttmp15
+// GFX1250: v_tanh_f16_e32 v5, ttmp15 ; encoding: [0x7b,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, m0
+// GFX1250: v_tanh_f16_e32 v5, m0 ; encoding: [0x7d,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, exec_lo
+// GFX1250: v_tanh_f16_e32 v5, exec_lo ; encoding: [0x7e,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, exec_hi
+// GFX1250: v_tanh_f16_e32 v5, exec_hi ; encoding: [0x7f,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, null
+// GFX1250: v_tanh_f16_e32 v5, null ; encoding: [0x7c,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, -1
+// GFX1250: v_tanh_f16_e32 v5, -1 ; encoding: [0xc1,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, 0.5
+// GFX1250: v_tanh_f16_e32 v5, 0.5 ; encoding: [0xf0,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v5, src_scc
+// GFX1250: v_tanh_f16_e32 v5, src_scc ; encoding: [0xfd,0x3e,0x0a,0x7e]
+
+v_tanh_f16 v127, 0x8000
+// GFX1250: v_tanh_f16_e32 v127, 0x8000 ; encoding: [0xff,0x3e,0xfe,0x7e,0x00,0x80,0x00,0x00]
+
+v_tanh_f16 v5.h, v1.h
+// GFX1250: v_tanh_f16_e32 v5.h, v1.h ; encoding: [0x81,0x3f,0x0a,0x7f]
+
v_tanh_bf16 v5, v1
// GFX1250: v_tanh_bf16_e32 v5, v1 ; encoding: [0x01,0x95,0x0a,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s
index ae22f68e54835..0a8ee84561d33 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16-fake16.s
@@ -58,6 +58,62 @@ v_tanh_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi
// GFX1250: v_tanh_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x3c,0xfe,0x7f,0xff,0x6f,0x35,0x30]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+v_tanh_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_tanh_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_tanh_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_mirror
+// GFX1250: v_tanh_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_half_mirror
+// GFX1250: v_tanh_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_shl:1
+// GFX1250: v_tanh_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_shl:15
+// GFX1250: v_tanh_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_shr:1
+// GFX1250: v_tanh_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_shr:15
+// GFX1250: v_tanh_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_ror:1
+// GFX1250: v_tanh_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_ror:15
+// GFX1250: v_tanh_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_tanh_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX1250: v_tanh_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x5f,0x01,0x01]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX1250: v_tanh_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x60,0x09,0x13]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX1250: v_tanh_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x3e,0xfe,0x7e,0x7f,0x6f,0x35,0x30]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
v_tanh_bf16 v5, v1 quad_perm:[3,2,1,0]
// GFX1250: v_tanh_bf16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x94,0x0a,0x7e,0x01,0x1b,0x00,0xff]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s
index 37ecb66bfe809..d4afb9d9b2d9a 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1_dpp16.s
@@ -58,6 +58,66 @@ v_tanh_f32 v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi
// GFX1250: v_tanh_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 fi:1 ; encoding: [0xfa,0x3c,0xfe,0x7f,0xff,0x6f,0x35,0x30]
// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+v_tanh_f16 v5, v1 quad_perm:[3,2,1,0]
+// GFX1250: v_tanh_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x1b,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 quad_perm:[0,1,2,3]
+// GFX1250: v_tanh_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0xe4,0x00,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_mirror
+// GFX1250: v_tanh_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x40,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_half_mirror
+// GFX1250: v_tanh_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x41,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_shl:1
+// GFX1250: v_tanh_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x01,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_shl:15
+// GFX1250: v_tanh_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x0f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_shr:1
+// GFX1250: v_tanh_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x11,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_shr:15
+// GFX1250: v_tanh_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x1f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_ror:1
+// GFX1250: v_tanh_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x21,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_ror:15
+// GFX1250: v_tanh_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x2f,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not supported on this GPU
+
+v_tanh_f16 v5, v1 row_share:0 row_mask:0xf bank_mask:0xf
+// GFX1250: v_tanh_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x3e,0x0a,0x7e,0x01,0x50,0x01,0xff]
+// GFX12-ERR: :[[@LINE-2]]:1: error: instruction not suppor...
[truncated]
|
You can test this locally with the following command:

git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' 'HEAD~1' HEAD clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll

The following files introduce new uses of undef:
- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll

Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields `undef`. You should use `poison` values for placeholders instead.

In tests, avoid using `undef` and having tests that trigger undefined behavior. If you need an operand with some unimportant value, you can add a new argument to the function and use that instead.

For example, this is considered a bad practice:

define void @fn() {
  ...
  br i1 undef, ...
}

Please use the following instead:

define void @fn(i1 %cond) {
  ...
  br i1 %cond, ...
}

Please refer to the Undefined Behavior Manual for more information. |
|
Is the "Undef" in the test to be taken care of? |
I ignored all of them for now. I plan to address them once all the VOP1 instructions have been upstreamed, to avoid creating unnecessary conflicts during the downstream merge. |
| return Builder.CreateCall(F, { Src }); | ||
| } | ||
| case AMDGPU::BI__builtin_amdgcn_tanhf: | ||
| case AMDGPU::BI__builtin_amdgcn_tanhh: |
There was a problem hiding this comment.
Can we just use llvm.tanh since it exists now?
There was a problem hiding this comment.
I have a TODO item to go over all the newly added VOP1 instructions again and add the missing tests and support accordingly.

Co-authored-by: Mekhanoshin, Stanislav <Stanislav.Mekhanoshin@amd.com>