Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
209 changes: 201 additions & 8 deletions compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,27 @@
return linalg::inferContractionDims(maps);
}

static int getBlockSize(MMAIntrinsic /*intrinsic*/) {
// Not supporting any block size other than 1 at the moment.
return 1;
// Returns the number of independent blocks computed by one MFMA instruction.
// Block ("...xNB") intrinsics perform several small GEMMs per instruction;
// every other intrinsic computes exactly one block.
static int getBlockSize(MMAIntrinsic intrinsic) {
  switch (intrinsic) {
  // 32x32 block variants: 2 blocks per instruction.
  case MMAIntrinsic::MFMA_F32_32x32x4x2B_F16:
  case MMAIntrinsic::MFMA_F32_32x32x4x2B_BF16:
  case MMAIntrinsic::MFMA_I32_32x32x4x2B_I8:
  case MMAIntrinsic::MFMA_F32_32x32x1x2B_F32:
    return 2;
  // 16x16 block variants (and the f64 4x4 variant): 4 blocks per instruction.
  case MMAIntrinsic::MFMA_F32_16x16x4x4B_F16:
  case MMAIntrinsic::MFMA_F32_16x16x4x4B_BF16:
  case MMAIntrinsic::MFMA_I32_16x16x4x4B_I8:
  case MMAIntrinsic::MFMA_F32_16x16x1x4B_F32:
  case MMAIntrinsic::MFMA_F64_4x4x4x4B_F64:
    return 4;
  // 4x4 block variants: 16 blocks per instruction.
  case MMAIntrinsic::MFMA_F32_4x4x4x16B_F16:
  case MMAIntrinsic::MFMA_F32_4x4x4x16B_BF16:
  case MMAIntrinsic::MFMA_I32_4x4x4x16B_I8:
  case MMAIntrinsic::MFMA_F32_4x4x1x16B_F32:
    return 16;
  default:
    // Non-block intrinsics compute a single block.
    return 1;
  }
}

static uint32_t getArchID(MMAIntrinsic intrinsic) {
Expand Down Expand Up @@ -118,10 +136,17 @@
Type i32 = IntegerType::get(context, 32);
switch (intrinsic) {
case MMAIntrinsic::MFMA_F64_16x16x4_F64:
case MMAIntrinsic::MFMA_F64_4x4x4x4B_F64:
return {f64, f64, f64};
case MMAIntrinsic::MFMA_F32_16x16x4_F32:
case MMAIntrinsic::MFMA_F32_4x4x1x16B_F32:
case MMAIntrinsic::MFMA_F32_16x16x1x4B_F32:
case MMAIntrinsic::MFMA_F32_32x32x1x2B_F32:
case MMAIntrinsic::WMMA_F32_16x16x4_F32:
return {f32, f32, f32};
case MMAIntrinsic::MFMA_F32_4x4x4x16B_F16:
case MMAIntrinsic::MFMA_F32_16x16x4x4B_F16:
case MMAIntrinsic::MFMA_F32_32x32x4x2B_F16:
case MMAIntrinsic::MFMA_F32_16x16x16_F16:
case MMAIntrinsic::MFMA_F32_32x32x8_F16:
case MMAIntrinsic::MFMA_F32_16x16x32_F16:
Expand All @@ -138,6 +163,9 @@
case MMAIntrinsic::NV_MMA_SYNC_F16_16x8x16_F16:
case MMAIntrinsic::WMMA_F16_16x16x32_F16:
return {f16, f16, f16};
case MMAIntrinsic::MFMA_F32_4x4x4x16B_BF16:
case MMAIntrinsic::MFMA_F32_16x16x4x4B_BF16:
case MMAIntrinsic::MFMA_F32_32x32x4x2B_BF16:
case MMAIntrinsic::MFMA_F32_16x16x8_BF16:
case MMAIntrinsic::MFMA_F32_32x32x4_BF16:
case MMAIntrinsic::MFMA_F32_16x16x16_BF16:
Expand Down Expand Up @@ -208,6 +236,9 @@
case MMAIntrinsic::WMMA_F16_16x16x64_F8E4M3FN_F8E5M2:
case MMAIntrinsic::WMMA_F16_16x16x128_F8E4M3FN_F8E5M2:
return {f8E4M3FN, f8E5M2, f16};
case MMAIntrinsic::MFMA_I32_16x16x4x4B_I8:
case MMAIntrinsic::MFMA_I32_32x32x4x2B_I8:
case MMAIntrinsic::MFMA_I32_4x4x4x16B_I8:
case MMAIntrinsic::MFMA_I32_16x16x16_I8:
case MMAIntrinsic::MFMA_I32_32x32x8_I8:
case MMAIntrinsic::MFMA_I32_16x16x32_I8:
Expand Down Expand Up @@ -262,13 +293,73 @@
/*element=*/{k / 2, 1}};
};

// For 4x4 blocked MFMAs (16 blocks).
// Uses 3D layout with semantic dimensions [Block, M, K] for LHS.
auto mfmaLhs4xKx16B = [](int64_t k) -> MMASingleSubgroupLayout {
return {/*outer=*/{1, 1, 1}, /*thread=*/{16, 4, 1}, /*tstrides=*/{4, 1, 64},
/*element=*/{1, 1, k}};
};
// Semantic dimensions [Block, K, N] for RHS.
auto mfmaRhsKx4x16B = [](int64_t k) -> MMASingleSubgroupLayout {
return {/*outer=*/{1, 1, 1}, /*thread=*/{16, 1, 4}, /*tstrides=*/{4, 64, 1},
/*element=*/{1, k, 1}};
};

auto mfmaLhs16xKx4B = [](int64_t k) -> MMASingleSubgroupLayout {
return {/*outer=*/{1, 1, 1}, /*thread=*/{4, 16, 1},
/*tstrides=*/{16, 1, 64},
/*element=*/{1, 1, k}};
};

auto mfmaRhsKx16x4B = [](int64_t k) -> MMASingleSubgroupLayout {
return {/*outer=*/{1, 1, 1}, /*thread=*/{4, 1, 16},
/*tstrides=*/{16, 64, 1},
/*element=*/{1, k, 1}};
};

const MMASingleSubgroupLayout mfmaAcc16x16x4B = {
/*outer=*/{1, 1, 1}, /*thread=*/{1, 4, 16}, /*tstrides=*/{64, 16, 1},
/*element=*/{4, 4, 1}};

auto mfmaLhs32xKx2B = [](int64_t k) -> MMASingleSubgroupLayout {
return {/*outer=*/{1, 1, 1}, /*thread=*/{2, 32, 1},
/*tstrides=*/{32, 1, 64},
/*element=*/{1, 1, k}};
};

auto mfmaRhsKx32x2B = [](int64_t k) -> MMASingleSubgroupLayout {
return {/*outer=*/{1, 1, 1}, /*thread=*/{2, 1, 32},
/*tstrides=*/{32, 64, 1},
/*element=*/{1, k, 1}};
};

const MMASingleSubgroupLayout mfmaAcc32x32x2B = {
/*outer=*/{1, 4, 1}, /*thread=*/{1, 2, 32}, /*tstrides=*/{64, 32, 1},
/*element=*/{2, 4, 1}};

const MMASingleSubgroupLayout mfmaAcc16x16 = {
/*outer=*/{1, 1}, /*thread=*/{4, 16}, /*tstrides=*/{16, 1},
/*element=*/{4, 1}};
const MMASingleSubgroupLayout mfmaAcc32x32 = {
/*outer=*/{4, 1}, /*thread=*/{2, 32}, /*tstrides=*/{32, 1},
/*element=*/{4, 1}};

const MMASingleSubgroupLayout mfmaAcc4x4x16B = {
/*outer=*/{1, 1, 1}, /*thread=*/{16, 1, 4}, /*tstrides=*/{4, 64, 1},
/*element=*/{1, 4, 1}};

const MMASingleSubgroupLayout mfmaF64Lhs4x4B = {
/*outer=*/{1, 1, 1}, /*thread=*/{4, 4, 4}, /*tstrides=*/{4, 1, 16},
/*element=*/{1, 1, 1}};

const MMASingleSubgroupLayout mfmaF64Rhs4x4B = {
/*outer=*/{1, 1, 1}, /*thread=*/{4, 4, 4}, /*tstrides=*/{4, 16, 1},
/*element=*/{1, 1, 1}};

const MMASingleSubgroupLayout mfmaF64Acc4x4B = {
/*outer=*/{1, 1, 1}, /*thread=*/{4, 4, 4}, /*tstrides=*/{4, 16, 1},
/*element=*/{1, 1, 1}};

// Note: For gfx12, we specify here that, for example with K=16, lane 0 takes
// A[0, 0..7] and that lane 16 takes A[0, 8..15]. The hardware will internally
// bounce between the low halves and high halves of lanes every two registers
Expand Down Expand Up @@ -296,9 +387,18 @@
case kMMAOperandAcc:
return mfmaAcc16x16;
}
case MMAIntrinsic::MFMA_F64_4x4x4x4B_F64:
switch (operandIndex) {

Check warning on line 391 in compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp

View workflow job for this annotation

GitHub Actions / clang-tidy

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp:391:5 [bugprone-switch-missing-default-case]

switching on non-enum value without default case may not cover all cases
case kMMAOperandLhs:
return mfmaF64Lhs4x4B;
case kMMAOperandRhs:
return mfmaF64Rhs4x4B;
case kMMAOperandAcc:
return mfmaF64Acc4x4B;
}
// Note: the returned layout for f64 differs from that of other MFMAs.
case MMAIntrinsic::MFMA_F64_16x16x4_F64:
switch (operandIndex) {

Check warning on line 401 in compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp

View workflow job for this annotation

GitHub Actions / clang-tidy

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp:401:5 [bugprone-switch-missing-default-case]

switching on non-enum value without default case may not cover all cases
case kMMAOperandLhs:
return mfmaLhs16xK(4);
case kMMAOperandRhs:
Expand Down Expand Up @@ -326,10 +426,70 @@
case kMMAOperandAcc:
return mfmaAcc32x32;
}
case MMAIntrinsic::MFMA_F32_4x4x4x16B_F16:
case MMAIntrinsic::MFMA_F32_4x4x4x16B_BF16:
case MMAIntrinsic::MFMA_I32_4x4x4x16B_I8:
switch (operandIndex) {

Check warning on line 432 in compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp

View workflow job for this annotation

GitHub Actions / clang-tidy

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp:432:5 [bugprone-switch-missing-default-case]

switching on non-enum value without default case may not cover all cases
case kMMAOperandLhs:
return mfmaLhs4xKx16B(4);
case kMMAOperandRhs:
return mfmaRhsKx4x16B(4);
case kMMAOperandAcc:
return mfmaAcc4x4x16B;
}
case MMAIntrinsic::MFMA_F32_4x4x1x16B_F32:
switch (operandIndex) {

Check warning on line 441 in compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp

View workflow job for this annotation

GitHub Actions / clang-tidy

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp:441:5 [bugprone-switch-missing-default-case]

switching on non-enum value without default case may not cover all cases
case kMMAOperandLhs:
return mfmaLhs4xKx16B(1);
case kMMAOperandRhs:
return mfmaRhsKx4x16B(1);
case kMMAOperandAcc:
return mfmaAcc4x4x16B;
}
case MMAIntrinsic::MFMA_F32_16x16x4x4B_F16:
case MMAIntrinsic::MFMA_F32_16x16x4x4B_BF16:
case MMAIntrinsic::MFMA_I32_16x16x4x4B_I8:
switch (operandIndex) {

Check warning on line 452 in compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp

View workflow job for this annotation

GitHub Actions / clang-tidy

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp:452:5 [bugprone-switch-missing-default-case]

switching on non-enum value without default case may not cover all cases
case kMMAOperandLhs:
return mfmaLhs16xKx4B(4);
case kMMAOperandRhs:
return mfmaRhsKx16x4B(4);
case kMMAOperandAcc:
return mfmaAcc16x16x4B;
}
case MMAIntrinsic::MFMA_F32_16x16x1x4B_F32:
switch (operandIndex) {

Check warning on line 461 in compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp

View workflow job for this annotation

GitHub Actions / clang-tidy

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp:461:5 [bugprone-switch-missing-default-case]

switching on non-enum value without default case may not cover all cases
case kMMAOperandLhs:
return mfmaLhs16xKx4B(1);
case kMMAOperandRhs:
return mfmaRhsKx16x4B(1);
case kMMAOperandAcc:
return mfmaAcc16x16x4B;
}
case MMAIntrinsic::MFMA_F32_32x32x4x2B_F16:
case MMAIntrinsic::MFMA_F32_32x32x4x2B_BF16:
case MMAIntrinsic::MFMA_I32_32x32x4x2B_I8:
switch (operandIndex) {

Check warning on line 472 in compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp

View workflow job for this annotation

GitHub Actions / clang-tidy

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp:472:5 [bugprone-switch-missing-default-case]

switching on non-enum value without default case may not cover all cases
case kMMAOperandLhs:
return mfmaLhs32xKx2B(4);
case kMMAOperandRhs:
return mfmaRhsKx32x2B(4);
case kMMAOperandAcc:
return mfmaAcc32x32x2B;
}
case MMAIntrinsic::MFMA_F32_32x32x1x2B_F32:
switch (operandIndex) {

Check warning on line 481 in compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp

View workflow job for this annotation

GitHub Actions / clang-tidy

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp:481:5 [bugprone-switch-missing-default-case]

switching on non-enum value without default case may not cover all cases
case kMMAOperandLhs:
return mfmaLhs32xKx2B(1);
case kMMAOperandRhs:
return mfmaRhsKx32x2B(1);
case kMMAOperandAcc:
return mfmaAcc32x32x2B;
}
case MMAIntrinsic::MFMA_I32_16x16x16_I8:
case MMAIntrinsic::MFMA_F32_16x16x16_F16:
case MMAIntrinsic::MFMA_F32_16x16x16_BF16:
switch (operandIndex) {

Check warning on line 492 in compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp

View workflow job for this annotation

GitHub Actions / clang-tidy

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp:492:5 [bugprone-switch-missing-default-case]

switching on non-enum value without default case may not cover all cases
case kMMAOperandLhs:
return mfmaLhs16xK(16);
case kMMAOperandRhs:
Expand Down Expand Up @@ -558,7 +718,8 @@
bool colMajor) {
MMASingleSubgroupLayout baseLayout =
getSingleSubgroupLayout(intrinsic, operandIndex);
assert(baseLayout.element.size() == 2 && "expected 2d layout");
assert((baseLayout.element.size() == 2 || baseLayout.element.size() == 3) &&
"expected 2d or 3d layout");
if (colMajor) {
std::swap(baseLayout.element[0], baseLayout.element[1]);
std::swap(baseLayout.thread[0], baseLayout.thread[1]);
Expand All @@ -585,6 +746,7 @@

// Struct describing the shape of a MMA operation, but not the detailed layout.
struct OpaqueMmaLayout {
int64_t bSize = 0; // 0 for non-block intrinsics
int64_t mSize = 0;
int64_t nSize = 0;
int64_t kSize = 0;
Expand All @@ -601,6 +763,16 @@
}
auto lhs = getSingleSubgroupLayout(intrinsic, kMMAOperandLhs);
auto rhs = getSingleSubgroupLayout(intrinsic, kMMAOperandRhs);

// For 3D layouts with batch/block dimension, skip index 0 (batch)
// and use indices 1,2 for M/N/K computation
bool has3DLayout = lhs.outer.size() == 3;
if (has3DLayout) {
return {lhs.outer[1] * lhs.thread[1] * lhs.element[1],
rhs.outer[2] * rhs.thread[2] * rhs.element[2],
lhs.outer[2] * lhs.thread[2] * lhs.element[2]};
}

return {lhs.outer[0] * lhs.thread[0] * lhs.element[0],
rhs.outer[1] * rhs.thread[1] * rhs.element[1],
lhs.outer[1] * lhs.thread[1] * lhs.element[1]};
Expand All @@ -621,6 +793,10 @@
OpaqueMmaLayout o;
std::tie(o.aType, o.bType, o.cType) = getABCElementTypes(context, intrinsic);
std::tie(o.mSize, o.nSize, o.kSize) = getMNKShapeFromIntrinsic(intrinsic);
auto lhs = getSingleSubgroupLayout(intrinsic, kMMAOperandLhs);
if (lhs.outer.size() == 3) {
o.bSize = lhs.outer[0] * lhs.thread[0] * lhs.element[0];
}
return o;
}

Expand Down Expand Up @@ -678,9 +854,15 @@
SmallVectorImpl<VectorType> &result) const {
MLIRContext *ctx = getContext();
OpaqueMmaLayout o = getOpaqueMMALayout(ctx, getIntrinsic());
result.assign({VectorType::get({o.mSize, o.kSize}, o.aType),
VectorType::get({o.kSize, o.nSize}, o.bType),
VectorType::get({o.mSize, o.nSize}, o.cType)});
if (o.bSize) {
result.assign({VectorType::get({o.bSize, o.mSize, o.kSize}, o.aType),
VectorType::get({o.bSize, o.kSize, o.nSize}, o.bType),
VectorType::get({o.bSize, o.mSize, o.nSize}, o.cType)});
} else {
result.assign({VectorType::get({o.mSize, o.kSize}, o.aType),
VectorType::get({o.kSize, o.nSize}, o.bType),
VectorType::get({o.mSize, o.nSize}, o.cType)});
}
}

template <typename MMAIntrinsicType>
Expand All @@ -698,6 +880,11 @@
{s.outer[0] * s.outer[1], s.element[0] * s.element[1]}, elemType);
}
}
if (s.outer.size() == 3) {
return VectorType::get({s.outer[0] * s.element[0] * s.outer[1] *
s.element[1] * s.outer[2] * s.element[2]},
elemType);
}
return VectorType::get(
{s.element[0] * s.element[1] * s.outer[0] * s.outer[1]}, elemType);
}
Expand All @@ -715,7 +902,7 @@
MMAAttr::getUndistributedTileDimExpansion(int64_t operandIndex,
int64_t dim) const {
assert(operandIndex <= 2 && "invalid operand index");
assert(dim < 2 && "pre-expansion inner tiles all have two elements");
assert(dim < 3 && "invalid inner tile dim");
MMASingleSubgroupLayout layout =
getSingleSubgroupLayout(*this, static_cast<int>(operandIndex));
if (layout.outer[dim] > 1) {
Expand All @@ -729,6 +916,12 @@
return IREE::GPU::getBlockSize(getIntrinsic());
}

bool MMAAttr::isBlockIntrinsic() const {
MMASingleSubgroupLayout lhs =
getSingleSubgroupLayout(getIntrinsic(), kMMAOperandLhs);
return lhs.outer.size() == 3;
}

// Returns the subgroup (wavefront) size associated with this MMA intrinsic,
// delegating to the free function keyed on the intrinsic enum.
int64_t MMAAttr::getSubgroupSize() const {
  return getIntrinsicSubgroupSize(getIntrinsic());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,8 @@ def IREEGPU_MMAAttr : AttrDef<IREEGPU_Dialect, "MMA", [
int64_t getBlockSize() const;

SmallVector<VirtualMMAIntrinsic> getVirtualIntrinsics() const;

bool isBlockIntrinsic() const;
}];
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,20 @@ def MFMA_F32_32x32x16_F8E4M3FNUZ : I32EnumAttrCase<"MFMA_F32_32x32x16_F8E4M3FNUZ
def MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ : I32EnumAttrCase<"MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ", 0x1237>;
def MFMA_I32_16x16x32_I8 : I32EnumAttrCase<"MFMA_I32_16x16x32_I8", 0x12C0>;
def MFMA_I32_32x32x16_I8 : I32EnumAttrCase<"MFMA_I32_32x32x16_I8", 0x12C1>;
// Block intrinsics
def MFMA_F32_4x4x4x16B_F16 : I32EnumAttrCase<"MFMA_F32_4x4x4x16B_F16", 0x1222>;
def MFMA_F32_16x16x4x4B_F16 : I32EnumAttrCase<"MFMA_F32_16x16x4x4B_F16", 0x1223>;
def MFMA_F32_32x32x4x2B_F16 : I32EnumAttrCase<"MFMA_F32_32x32x4x2B_F16", 0x1224>;
def MFMA_F32_4x4x4x16B_BF16 : I32EnumAttrCase<"MFMA_F32_4x4x4x16B_BF16", 0x1225>;
def MFMA_F32_16x16x4x4B_BF16 : I32EnumAttrCase<"MFMA_F32_16x16x4x4B_BF16", 0x1226>;
def MFMA_F32_32x32x4x2B_BF16 : I32EnumAttrCase<"MFMA_F32_32x32x4x2B_BF16", 0x1227>;
def MFMA_F32_4x4x1x16B_F32 : I32EnumAttrCase<"MFMA_F32_4x4x1x16B_F32", 0x1210>;
def MFMA_F32_16x16x1x4B_F32 : I32EnumAttrCase<"MFMA_F32_16x16x1x4B_F32", 0x1211>;
def MFMA_F32_32x32x1x2B_F32 : I32EnumAttrCase<"MFMA_F32_32x32x1x2B_F32", 0x1212>;
def MFMA_F64_4x4x4x4B_F64 : I32EnumAttrCase<"MFMA_F64_4x4x4x4B_F64", 0x1200>;
def MFMA_I32_4x4x4x16B_I8 : I32EnumAttrCase<"MFMA_I32_4x4x4x16B_I8", 0x12C2>;
def MFMA_I32_16x16x4x4B_I8 : I32EnumAttrCase<"MFMA_I32_16x16x4x4B_I8", 0x12C3>;
def MFMA_I32_32x32x4x2B_I8 : I32EnumAttrCase<"MFMA_I32_32x32x4x2B_I8", 0x12C4>;

// Introduced in CDNA4.
def MFMA_F32_16x16x32_F16 : I32EnumAttrCase<"MFMA_F32_16x16x32_F16", 0x1320>;
Expand Down Expand Up @@ -290,6 +304,20 @@ def IREEGPU_MMAIntrinsic : IREEGPU_I32EnumAttr<"MMAIntrinsic",
MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ,
MFMA_I32_16x16x32_I8,
MFMA_I32_32x32x16_I8,
// Block intrinsics.
MFMA_F32_4x4x4x16B_F16,
MFMA_F32_16x16x4x4B_F16,
MFMA_F32_32x32x4x2B_F16,
MFMA_F32_4x4x4x16B_BF16,
MFMA_F32_16x16x4x4B_BF16,
MFMA_F32_32x32x4x2B_BF16,
MFMA_F32_4x4x1x16B_F32,
MFMA_F32_16x16x1x4B_F32,
MFMA_F32_32x32x1x2B_F32,
MFMA_F64_4x4x4x4B_F64,
MFMA_I32_4x4x4x16B_I8,
MFMA_I32_16x16x4x4B_I8,
MFMA_I32_32x32x4x2B_I8,

// Introduced in CDNA4.
MFMA_F32_16x16x32_F16,
Expand Down
Loading
Loading