From 2bb75f5eef1f66bb359e494cd3d672b82d189b9f Mon Sep 17 00:00:00 2001 From: SingleAccretion Date: Tue, 3 Mar 2026 00:03:55 +0300 Subject: [PATCH 1/2] [RyuJit/WASM] Insert temporaries when needed in the stackifier To do this we need to maintain some more state, so convert the stackifier into a full walk over all blocks so that we can encapsulate this state better. --- src/coreclr/jit/lower.cpp | 7 +- src/coreclr/jit/lower.h | 15 +- src/coreclr/jit/lowerwasm.cpp | 238 +++++++++++++++++++++------- src/coreclr/jit/regallocwasm.cpp | 11 +- src/coreclr/jit/registeropswasm.cpp | 40 +++++ src/coreclr/jit/registeropswasm.h | 1 + 6 files changed, 234 insertions(+), 78 deletions(-) diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp index 95be5933bcdfed..d443484e6cdd09 100644 --- a/src/coreclr/jit/lower.cpp +++ b/src/coreclr/jit/lower.cpp @@ -8911,6 +8911,8 @@ PhaseStatus Lowering::DoPhase() LowerBlock(block); } + AfterLowerBlocks(); + #ifdef DEBUG JITDUMP("Lower has completed modifying nodes.\n"); if (VERBOSE) @@ -9545,16 +9547,15 @@ void Lowering::LowerBlock(BasicBlock* block) { node = LowerNode(node); } - AfterLowerBlock(); assert(CheckBlock(m_compiler, block)); } #ifndef TARGET_WASM //------------------------------------------------------------------------ -// AfterLowerBlock: target-specific post-processing of the lowered block. +// AfterLowerBlocks: target-specific post-processing of the lowered blocks. 
// -void Lowering::AfterLowerBlock() +void Lowering::AfterLowerBlocks() { } #endif // !TARGET_WASM diff --git a/src/coreclr/jit/lower.h b/src/coreclr/jit/lower.h index dbf84a85be10ed..b135a9a3109e1c 100644 --- a/src/coreclr/jit/lower.h +++ b/src/coreclr/jit/lower.h @@ -27,9 +27,6 @@ class Lowering final : public Phase , vtableCallTemp(BAD_VAR_NUM) #ifdef TARGET_ARM64 , m_blockIndirs(compiler->getAllocator(CMK_Lower)) -#endif -#ifdef TARGET_WASM - , m_stackificationStack(compiler->getAllocator(CMK_Lower)) #endif { m_regAlloc = static_cast(regAlloc); @@ -142,7 +139,7 @@ class Lowering final : public Phase unsigned TryReuseLocalForParameterAccess(const LIR::Use& use, const LocalSet& storedToLocals); void LowerBlock(BasicBlock* block); - void AfterLowerBlock(); + void AfterLowerBlocks(); GenTree* LowerNode(GenTree* node); bool IsCFGCallArgInvariantInRange(GenTree* node, GenTree* endExclusive); @@ -634,16 +631,6 @@ class Lowering final : public Phase ArrayStack m_blockIndirs; bool m_ffrTrashed; #endif - -#ifdef TARGET_WASM - ArrayStack m_stackificationStack; - - static void SetMultiplyUsed(GenTree* node) - { - assert(node->gtType != TYP_STRUCT); - node->gtLIRFlags |= LIR::Flags::MultiplyUsed; - } -#endif }; #endif // _LOWER_H_ diff --git a/src/coreclr/jit/lowerwasm.cpp b/src/coreclr/jit/lowerwasm.cpp index 752a7c52c590a3..c2b5946ee152f5 100644 --- a/src/coreclr/jit/lowerwasm.cpp +++ b/src/coreclr/jit/lowerwasm.cpp @@ -22,6 +22,13 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX #include "lower.h" +static void SetMultiplyUsed(GenTree* node) +{ + assert(varTypeIsEnregisterable(node)); + assert(!node->isContained()); + node->gtLIRFlags |= LIR::Flags::MultiplyUsed; +} + //------------------------------------------------------------------------ // IsCallTargetInRange: Can a call target address be encoded in-place? 
// @@ -453,7 +460,7 @@ void Lowering::ContainCheckSelect(GenTreeOp* node) } //------------------------------------------------------------------------ -// AfterLowerBlock: stackify the nodes in this block. +// AfterLowerBlocks: stackify the nodes in all blocks. // // Stackification involves moving nodes around and inserting temporaries // as necessary. We expect the vast majority of IR to already be in correct @@ -463,61 +470,69 @@ void Lowering::ContainCheckSelect(GenTreeOp* node) // the introduced temporaries can get enregistered and the last-use info // on LCL_VAR nodes in RA is readily correct. // -void Lowering::AfterLowerBlock() +void Lowering::AfterLowerBlocks() { + struct Temporary + { + unsigned LclNum; + Temporary* Prev = nullptr; + }; + class Stackifier { - Lowering* m_lower; - bool m_anyChanges = false; + Lowering* m_lower; + Compiler* m_compiler; + ArrayStack m_stack; + unsigned m_minimumTempLclNum; + Temporary* m_availableTemps[static_cast(WasmValueType::Count)] = {}; + Temporary* m_unusedTemps[static_cast(WasmValueType::Count)] = {}; + bool m_anyChanges = false; public: Stackifier(Lowering* lower) : m_lower(lower) + , m_compiler(lower->m_compiler) + , m_stack(m_compiler->getAllocator(CMK_Lower)) + , m_minimumTempLclNum(m_compiler->lvaCount) { } - void StackifyCurrentBlock() + void StackifyBlock(BasicBlock* block) { - GenTree* node = m_lower->BlockRange().LastNode(); + m_anyChanges = false; + m_lower->m_block = block; + GenTree* node = block->lastNode(); while (node != nullptr) { assert(IsDataFlowRoot(node)); node = StackifyTree(node); } + m_lower->m_block = nullptr; - if (!m_anyChanges) - { - JITDUMP(FMT_BB ": already in WASM value stack order\n", m_lower->m_block->bbNum); - } - } - - bool CanMoveNodePast(GenTree* node, GenTree* past) - { - bool result = node->IsInvariant() || node->isContained() || - (node->OperIs(GT_LCL_VAR) && - !m_lower->m_compiler->lvaGetDesc(node->AsLclVarCommon())->IsAddressExposed()); + JITDUMP(FMT_BB ": %s\n", block->bbNum, 
+ m_anyChanges ? "stackified with some changes" : "already in WASM value stack order"); - if (result) + for (WasmValueType type = WasmValueType::First; type < WasmValueType::Count; ++type) { - assert(m_lower->IsInvariantInRange(node, past)); + assert((m_unusedTemps[static_cast(type)] == nullptr) && "Some temporaries were not released"); } - - return result; } GenTree* StackifyTree(GenTree* root) { - ArrayStack* stack = &m_lower->m_stackificationStack; - int initialDepth = stack->Height(); + int initialDepth = m_stack.Height(); // Simple greedy algorithm working backwards. The invariant is that the stack top must be placed right next // to (in normal linear order - before) the node we last stackified. - stack->Push(root); - GenTree* current = root->gtNext; - while (stack->Height() != initialDepth) + m_stack.Push(&root); + ReleaseTemporariesDefinedBy(root); + + GenTree* lastStackified = root->gtNext; + while (m_stack.Height() != initialDepth) { - GenTree* node = stack->Pop(); - GenTree* prev = (current != nullptr) ? current->gtPrev : root; + GenTree** use = m_stack.Pop(); + GenTree* node = *use; + GenTree* prev = (lastStackified != nullptr) ? lastStackified->gtPrev : root; while (node != prev) { // Maybe this is an intervening void-equivalent node that we can also just stackify. @@ -530,51 +545,164 @@ void Lowering::AfterLowerBlock() // At this point, we'll have to modify the IR in some way. In general, these cases should be quite // rare, introduced in lowering only. All HIR-induced cases (such as from "gtSetEvalOrder") should // instead be ifdef-ed out for WASM. - m_anyChanges = true; - - // Invariant nodes can be safely moved by the stackifier with no side effects. - // For other nodes, the side effects would require us to turn them into a temporary local, but this - // is not possible for contained nodes like an IND inside a STORE_BLK. 
However, the few types of - // contained nodes we have in Wasm should be safe to move freely since the lack of 'dup' or - // persistent registers in Wasm means that the actual codegen will trigger the side effect(s) and - // store the result into a Wasm local for any later uses during the containing node's execution, - // i.e. cpobj where the src and dest get stashed at the start and then used as add operands - // repeatedly. - // Locals can also be safely moved as long as they aren't address-exposed due to local var nodes - // being implicitly pseudo-contained. - // TODO-WASM: Verify that it is actually safe to do this for all contained nodes. - if (CanMoveNodePast(node, prev->gtNext)) + INDEBUG(const char* reason); + if (CanMoveForward(node DEBUGARG(&reason))) { - JITDUMP("Stackifier moving node [%06u] after [%06u]\n", Compiler::dspTreeID(node), - Compiler::dspTreeID(prev)); - m_lower->BlockRange().Remove(node); - m_lower->BlockRange().InsertAfter(prev, node); - break; + MoveForward(node, prev DEBUGARG(reason)); } - - JITDUMP("node==[%06u] prev==[%06u]\n", Compiler::dspTreeID(node), Compiler::dspTreeID(prev)); - NYI_WASM("IR not in a stackified form"); + else + { + node = ReplaceWithTemporary(use, prev); + } + m_anyChanges = true; + break; } // In stack order, the last operand is closest to its parent, thus put on top here. - node->VisitOperands([stack](GenTree* operand) { - stack->Push(operand); + node->VisitOperandUses([this](GenTree** use) { + m_stack.Push(use); return GenTree::VisitResult::Continue; }); - current = node; + lastStackified = node; } - return current->gtPrev; + return lastStackified->gtPrev; } bool IsDataFlowRoot(GenTree* node) { return !node->IsValue() || node->IsUnusedValue(); } + + bool CanMoveForward(GenTree* node DEBUGARG(const char** pReason)) + { + if (node->IsInvariant()) + { + // Leaf node without control or dataflow dependencies. 
+ INDEBUG(*pReason = "invariant"); + return true; + } + + if (node->isContained()) + { + // Contained nodes are part of their parent so their position in the LIR stream is not significant. + // As a fiction that simplifies this algorithm, we move them to the place where they would be were + they not contained. + INDEBUG(*pReason = "contained"); + return true; + } + + if (node->OperIs(GT_LCL_VAR) && !m_compiler->lvaGetDesc(node->AsLclVarCommon())->IsAddressExposed()) + { + // By IR invariants, there can be no intervening stores between a local's position in the LIR stream + and its parent. So we can always move a local forward, closer to its parent. + INDEBUG(*pReason = "local"); + return true; + } + + // TODO-WASM: devise a less-than-quadratic (ideally linear) algorithm that would allow us to handle more + complex cases here. + return false; + } + + void MoveForward(GenTree* node, GenTree* prev DEBUGARG(const char* reason)) + { + JITDUMP("Stackifier moving [%06u] after [%06u]: %s\n", Compiler::dspTreeID(node), Compiler::dspTreeID(prev), + reason); + assert(m_lower->IsInvariantInRange(node, prev->gtNext)); + m_lower->BlockRange().Remove(node); + m_lower->BlockRange().InsertAfter(prev, node); + } + + GenTree* ReplaceWithTemporary(GenTree** use, GenTree* prev) + { + GenTree* node = *use; + unsigned lclNum = RequestTemporary(node->TypeGet()); + GenTree* lclStore = m_compiler->gtNewStoreLclVarNode(lclNum, node); + GenTree* lclNode = m_compiler->gtNewLclVarNode(lclNum); + + m_lower->BlockRange().InsertAfter(node, lclStore); + m_lower->BlockRange().InsertAfter(prev, lclNode); + *use = lclNode; + + JITDUMP("Replaced [%06u] with a temporary:\n", Compiler::dspTreeID(node)); + DISPNODE(node); + DISPNODE(lclNode); + return lclNode; + } + + unsigned RequestTemporary(var_types type) + { + assert(varTypeIsEnregisterable(type)); + + unsigned lclNum; + unsigned index = static_cast(ActualTypeToWasmValueType(type)); + Temporary* local = Remove(&m_availableTemps[index]); + if
(local != nullptr) + { + lclNum = local->LclNum; + Append(&m_unusedTemps[index], local); + assert(m_compiler->lvaGetDesc(lclNum)->TypeGet() == genActualType(type)); + } + else + { + lclNum = m_compiler->lvaGrabTemp(true DEBUGARG("Stackifier temporary")); + LclVarDsc* varDsc = m_compiler->lvaGetDesc(lclNum); + varDsc->lvType = genActualType(type); + assert(lclNum >= m_minimumTempLclNum); + } + JITDUMP("Temporary V%02u is now in use\n", lclNum); + return lclNum; + } + + void ReleaseTemporariesDefinedBy(GenTree* node) + { + assert(IsDataFlowRoot(node)); // We rely on the node not moving after this call. + if (!node->OperIs(GT_STORE_LCL_VAR)) + { + return; + } + + unsigned lclNum = node->AsLclVar()->GetLclNum(); + if (lclNum < m_minimumTempLclNum) + { + return; + } + + unsigned index = static_cast(ActualTypeToWasmValueType(node->TypeGet())); + Temporary* local = Remove(&m_unusedTemps[index]); + if (local == nullptr) + { + local = new (m_compiler, CMK_Lower) Temporary(); + } + local->LclNum = lclNum; + + JITDUMP("Temporary V%02u is now free and can be re-used\n", lclNum); + Append(&m_availableTemps[index], local); + } + + Temporary* Remove(Temporary** pTemps) + { + Temporary* local = *pTemps; + if (local != nullptr) + { + *pTemps = local->Prev; + } + return local; + } + + void Append(Temporary** pTemps, Temporary* local) + { + local->Prev = *pTemps; + *pTemps = local; + } }; Stackifier stackifier(this); - stackifier.StackifyCurrentBlock(); + for (BasicBlock* block : m_compiler->Blocks()) + { + stackifier.StackifyBlock(block); + } } //------------------------------------------------------------------------ diff --git a/src/coreclr/jit/regallocwasm.cpp b/src/coreclr/jit/regallocwasm.cpp index a69bf2917b861d..213dc25ac2dcc4 100644 --- a/src/coreclr/jit/regallocwasm.cpp +++ b/src/coreclr/jit/regallocwasm.cpp @@ -103,8 +103,7 @@ void WasmRegAlloc::IdentifyCandidates() // void WasmRegAlloc::InitializeCandidate(LclVarDsc* varDsc) { - var_types type = 
genActualType(varDsc->GetRegisterType()); - regNumber reg = AllocateVirtualRegister(type); + regNumber reg = AllocateVirtualRegister(varDsc->GetRegisterType()); varDsc->SetRegNum(reg); varDsc->lvLRACandidate = true; } @@ -173,7 +172,7 @@ void WasmRegAlloc::AllocateFramePointer() // regNumber WasmRegAlloc::AllocateVirtualRegister(var_types type) { - WasmValueType wasmType = TypeToWasmValueType(type); + WasmValueType wasmType = ActualTypeToWasmValueType(type); return AllocateVirtualRegister(wasmType); } @@ -213,7 +212,7 @@ regNumber WasmRegAlloc::AllocateVirtualRegister(WasmValueType type) // regNumber WasmRegAlloc::AllocateTemporaryRegister(var_types type) { - WasmValueType wasmType = TypeToWasmValueType(type); + WasmValueType wasmType = ActualTypeToWasmValueType(type); unsigned index = m_temporaryRegs[static_cast(wasmType)].Push(); return MakeWasmReg(index, wasmType); } @@ -592,7 +591,7 @@ void WasmRegAlloc::RequestTemporaryRegisterForMultiplyUsedNode(GenTree* node) // Note how due to the fact we're processing nodes in stack order, // we don't need to maintain free/busy sets, only a simple stack. 
- regNumber reg = AllocateTemporaryRegister(genActualType(node)); + regNumber reg = AllocateTemporaryRegister(node->TypeGet()); assert((node->GetRegNum() == REG_NA) && "Trying to double-assign a temporary register"); node->SetRegNum(reg); } @@ -637,7 +636,7 @@ void WasmRegAlloc::ConsumeTemporaryRegForOperand(GenTree* operand DEBUGARG(const // regNumber WasmRegAlloc::RequestInternalRegister(GenTree* node, var_types type) { - regNumber reg = AllocateTemporaryRegister(genActualType(type)); + regNumber reg = AllocateTemporaryRegister(type); m_codeGen->internalRegisters.Add(node, reg); return reg; } diff --git a/src/coreclr/jit/registeropswasm.cpp b/src/coreclr/jit/registeropswasm.cpp index 773fb2e9acec8c..6b8e5d1dbfe2be 100644 --- a/src/coreclr/jit/registeropswasm.cpp +++ b/src/coreclr/jit/registeropswasm.cpp @@ -92,6 +92,46 @@ WasmValueType TypeToWasmValueType(var_types type) return wasmType; } +//------------------------------------------------------------------------ +// ActualTypeToWasmValueType: Convert a 'var_types' value to a 'WasmValueType' value. +// +// A more direct way to do "TypeToWasmValueType(genActualType(type))". +// +// Arguments: +// type - The input type +// +// Return Value: +// The WASM type corresponding to 'type' (small types are mapped to I32). 
+// +WasmValueType ActualTypeToWasmValueType(var_types type) +{ + // clang-format off + static const WasmValueType s_mapping[] = { + WasmValueType::Invalid, // TYP_UNDEF, + WasmValueType::Invalid, // TYP_VOID, + WasmValueType::I32, // TYP_BYTE, + WasmValueType::I32, // TYP_UBYTE, + WasmValueType::I32, // TYP_SHORT, + WasmValueType::I32, // TYP_USHORT, + WasmValueType::I32, // TYP_INT, + WasmValueType::Invalid, // TYP_UINT, + WasmValueType::I64, // TYP_LONG, + WasmValueType::Invalid, // TYP_ULONG, + WasmValueType::F32, // TYP_FLOAT, + WasmValueType::F64, // TYP_DOUBLE, + WasmValueType::I, // TYP_REF, + WasmValueType::I, // TYP_BYREF, + WasmValueType::Invalid, // TYP_STRUCT + WasmValueType::Invalid, // TYP_UNKNOWN + }; + static_assert(ArrLen(s_mapping) == TYP_COUNT); + // clang-format on + + WasmValueType wasmType = s_mapping[type]; + assert(wasmType != WasmValueType::Invalid); + return wasmType; +} + const char* WasmValueTypeName(WasmValueType type) { // clang-format off diff --git a/src/coreclr/jit/registeropswasm.h b/src/coreclr/jit/registeropswasm.h index b485756bc4ad5b..e17846e7162154 100644 --- a/src/coreclr/jit/registeropswasm.h +++ b/src/coreclr/jit/registeropswasm.h @@ -26,6 +26,7 @@ inline WasmValueType& operator++(WasmValueType& type) regNumber MakeWasmReg(unsigned index, var_types type); regNumber MakeWasmReg(unsigned index, WasmValueType type); WasmValueType TypeToWasmValueType(var_types type); +WasmValueType ActualTypeToWasmValueType(var_types type); const char* WasmValueTypeName(WasmValueType type); unsigned UnpackWasmReg(regNumber reg, WasmValueType* pType = nullptr); unsigned WasmRegToIndex(regNumber reg); From 9c68fb86b286982e8abe5735d5c57bc6bb3acaeb Mon Sep 17 00:00:00 2001 From: SingleAccretion Date: Thu, 5 Mar 2026 14:13:53 +0300 Subject: [PATCH 2/2] Simplify the free list, add comments --- src/coreclr/jit/lowerwasm.cpp | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/src/coreclr/jit/lowerwasm.cpp 
b/src/coreclr/jit/lowerwasm.cpp index c2b5946ee152f5..fe7237eb098070 100644 --- a/src/coreclr/jit/lowerwasm.cpp +++ b/src/coreclr/jit/lowerwasm.cpp @@ -485,7 +485,7 @@ void Lowering::AfterLowerBlocks() ArrayStack m_stack; unsigned m_minimumTempLclNum; Temporary* m_availableTemps[static_cast(WasmValueType::Count)] = {}; - Temporary* m_unusedTemps[static_cast(WasmValueType::Count)] = {}; + Temporary* m_unusedTempNodes = nullptr; bool m_anyChanges = false; public: @@ -511,11 +511,7 @@ void Lowering::AfterLowerBlocks() JITDUMP(FMT_BB ": %s\n", block->bbNum, m_anyChanges ? "stackified with some changes" : "already in WASM value stack order"); - - for (WasmValueType type = WasmValueType::First; type < WasmValueType::Count; ++type) - { - assert((m_unusedTemps[static_cast(type)] == nullptr) && "Some temporaries were not released"); - } + assert((m_unusedTempNodes == nullptr) && "Some temporaries were not released"); } GenTree* StackifyTree(GenTree* root) @@ -636,12 +632,11 @@ void Lowering::AfterLowerBlocks() assert(varTypeIsEnregisterable(type)); unsigned lclNum; - unsigned index = static_cast(ActualTypeToWasmValueType(type)); - Temporary* local = Remove(&m_availableTemps[index]); + Temporary* local = Remove(&m_availableTemps[static_cast(ActualTypeToWasmValueType(type))]); if (local != nullptr) { lclNum = local->LclNum; - Append(&m_unusedTemps[index], local); + Append(&m_unusedTempNodes, local); // Free the node for later recycling. assert(m_compiler->lvaGetDesc(lclNum)->TypeGet() == genActualType(type)); } else @@ -657,7 +652,11 @@ void Lowering::AfterLowerBlocks() void ReleaseTemporariesDefinedBy(GenTree* node) { - assert(IsDataFlowRoot(node)); // We rely on the node not moving after this call. + // We rely in this function on the lifetime of temporaries beginning (recall this is backwards traversal) + // at exactly "node"'s position, and not shrinking or extending after this call. 
This is currently true + // because we never move dataflow roots, and we only begin processing them after all subsequent nodes + // have already been stackified and thus won't move either. + assert(IsDataFlowRoot(node)); if (!node->OperIs(GT_STORE_LCL_VAR)) { return; @@ -669,8 +668,7 @@ void Lowering::AfterLowerBlocks() return; } - unsigned index = static_cast(ActualTypeToWasmValueType(node->TypeGet())); - Temporary* local = Remove(&m_unusedTemps[index]); + Temporary* local = Remove(&m_unusedTempNodes); // See if we have any free nodes in the pool. if (local == nullptr) { local = new (m_compiler, CMK_Lower) Temporary(); @@ -678,7 +676,7 @@ void Lowering::AfterLowerBlocks() local->LclNum = lclNum; JITDUMP("Temporary V%02u is now free and can be re-used\n", lclNum); - Append(&m_availableTemps[index], local); + Append(&m_availableTemps[static_cast(ActualTypeToWasmValueType(node->TypeGet()))], local); } Temporary* Remove(Temporary** pTemps)