diff --git a/BepuPhysics/Trees/Tree_Add.cs b/BepuPhysics/Trees/Tree_Add.cs index e85bcb5e..6ea6fef7 100644 --- a/BepuPhysics/Trees/Tree_Add.cs +++ b/BepuPhysics/Trees/Tree_Add.cs @@ -8,7 +8,6 @@ namespace BepuPhysics.Trees; partial struct Tree { private struct InsertShouldNotRotate { } - private struct InsertShouldRotateTopDown { } private struct InsertShouldRotateBottomUp { } /// @@ -32,23 +31,10 @@ public int AddWithoutRefinement(BoundingBox bounds, BufferPool pool) /// Extents of the leaf bounds. /// Resource pool to use if resizing is required. /// Index of the leaf allocated in the tree's leaf array. - /// Performs incrementally refining tree rotations down along the insertion path, unlike . - /// For a given tree, this is slightly slower than and slightly faster than . - /// Trees built with repeated insertions of this kind tend to have decent quality, but slightly worse than . + /// Performs incrementally refining tree rotations when returning along the insertion path, unlike . + /// This is about twice the cost of (outside of pathological cases). + /// Trees built with repeated insertions of this kind tend to have better quality and fewer pathological cases compared to . public int Add(BoundingBox bounds, BufferPool pool) - { - return Add(bounds, pool); - } - - /// - /// Adds a leaf to the tree with the given bounding box and returns the index of the added leaf. - /// - /// Extents of the leaf bounds. - /// Resource pool to use if resizing is required. - /// Index of the leaf allocated in the tree's leaf array. - /// Performs incrementally refining tree rotations up along the insertion path, unlike . - /// Trees built with repeated insertions of this kind tend to have slightly better quality than , but it is also slightly more expensive. 
- public int AddWithBottomUpRefinement(BoundingBox bounds, BufferPool pool) { return Add(bounds, pool); } @@ -83,18 +69,13 @@ private int Add(BoundingBox bounds, BufferPool pool) where TShoul var newLeafIndex = AddLeaf(newNodeIndex, 0); while (true) { - //Note: rotating from the top down produces a tree that's lower quality that rotating from the bottom up. - //In context, that's fine; insertion just needs to produce a tree that isn't megatrash/stackoverflowy, and refinement will take care of the rest. - //The advantage is that top down is a little faster. - if (typeof(TShouldRotate) == typeof(InsertShouldRotateTopDown)) - TryRotateNode(nodeIndex); ref var node = ref Nodes[nodeIndex]; - //Choose whichever child requires less bounds expansion. If they're tied, choose the one with the least leaf count. + // Choose whichever child increases the local cost estimate less. If they're tied, choose the one with the least leaf count. BoundingBox.CreateMergedUnsafe(bounds4, node.A, out var mergedA); BoundingBox.CreateMergedUnsafe(bounds4, node.B, out var mergedB); - var boundsIncreaseA = ComputeBoundsMetric(mergedA) - ComputeBoundsMetric(Unsafe.As(ref node.A)); - var boundsIncreaseB = ComputeBoundsMetric(mergedB) - ComputeBoundsMetric(Unsafe.As(ref node.B)); - var useA = boundsIncreaseA == boundsIncreaseB ? node.A.LeafCount < node.B.LeafCount : boundsIncreaseA < boundsIncreaseB; + var costIncreaseA = ComputeBoundsMetric(mergedA) * (node.A.LeafCount + 1) - EstimateCost(node.A); + var costIncreaseB = ComputeBoundsMetric(mergedB) * (node.B.LeafCount + 1) - EstimateCost(node.B); + var useA = costIncreaseA == costIncreaseB ? node.A.LeafCount < node.B.LeafCount : costIncreaseA < costIncreaseB; ref var merged = ref Unsafe.As(ref useA ? ref mergedA : ref mergedB); ref var chosenChild = ref useA ?
ref node.A : ref node.B; if (chosenChild.LeafCount == 1) @@ -146,8 +127,9 @@ private int Add(BoundingBox bounds, BufferPool pool) where TShoul private void TryRotateNode(int rotationRootIndex) { ref var root = ref Nodes[rotationRootIndex]; - var costA = ComputeBoundsMetric(Unsafe.As(ref root.A)); - var costB = ComputeBoundsMetric(Unsafe.As(ref root.B)); + var costA = EstimateCost(root.A); + var costB = EstimateCost(root.B); + var originalCost = costA + costB; float leftRotationCostChange = 0; bool leftUsesA = false; float rightRotationCostChange = 0; @@ -158,21 +140,25 @@ private void TryRotateNode(int rotationRootIndex) ref var a = ref Nodes[root.A.Index]; BoundingBox.CreateMergedUnsafe(a.A, root.B, out var aaB); BoundingBox.CreateMergedUnsafe(a.B, root.B, out var abB); - var costAAB = ComputeBoundsMetric(Unsafe.As(ref aaB)); - var costABB = ComputeBoundsMetric(Unsafe.As(ref abB)); + var costAA = EstimateCost(a.A); + var costAB = EstimateCost(a.B); + var costAAB = ComputeBoundsMetric(Unsafe.As(ref aaB)) * (a.A.LeafCount + root.B.LeafCount) + costAB; + var costABB = ComputeBoundsMetric(Unsafe.As(ref abB)) * (a.B.LeafCount + root.B.LeafCount) + costAA; rightUsesA = costAAB < costABB; - rightRotationCostChange = float.Min(costAAB, costABB) - costA; + rightRotationCostChange = float.Min(costAAB, costABB) - originalCost; } if (root.B.Index >= 0) { //Try a left rotation. root.A will merge with the better of B's children, while the worse of B's children will take the place of root.B. 
ref var b = ref Nodes[root.B.Index]; - BoundingBox.CreateMergedUnsafe(root.A, b.A, out var baB); - BoundingBox.CreateMergedUnsafe(root.A, b.B, out var bbB); - var costBAB = ComputeBoundsMetric(Unsafe.As(ref baB)); - var costBBB = ComputeBoundsMetric(Unsafe.As(ref bbB)); - leftUsesA = costBAB < costBBB; - leftRotationCostChange = float.Min(costBAB, costBBB) - costB; + BoundingBox.CreateMergedUnsafe(b.A, root.A, out var baA); + BoundingBox.CreateMergedUnsafe(b.B, root.A, out var bbA); + var costBA = EstimateCost(b.A); + var costBB = EstimateCost(b.B); + var costBAA = ComputeBoundsMetric(Unsafe.As(ref baA)) * (b.A.LeafCount + root.A.LeafCount) + costBB; + var costBBA = ComputeBoundsMetric(Unsafe.As(ref bbA)) * (b.B.LeafCount + root.A.LeafCount) + costBA; + leftUsesA = costBAA < costBBA; + leftRotationCostChange = float.Min(costBAA, costBBA) - originalCost; } if (float.Min(leftRotationCostChange, rightRotationCostChange) < 0) { diff --git a/BepuPhysics/Trees/Tree_BinnedBuilder.cs b/BepuPhysics/Trees/Tree_BinnedBuilder.cs index 28636150..c72e514c 100644 --- a/BepuPhysics/Trees/Tree_BinnedBuilder.cs +++ b/BepuPhysics/Trees/Tree_BinnedBuilder.cs @@ -11,1586 +11,1590 @@ using System.Runtime.Intrinsics.X86; using System.Threading; -namespace BepuPhysics.Trees +namespace BepuPhysics.Trees; +partial struct Tree { - partial struct Tree + struct LeavesHandledInPostPass { } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static void BuildNode( + BoundingBox4 a, BoundingBox4 b, + int leafCountA, int leafCountB, + Buffer subtrees, Buffer nodes, Buffer metanodes, + int nodeIndex, int parentNodeIndex, int childIndexInParent, int subtreeCountA, int subtreeCountB, ref TLeaves leaves, out int aIndex, out int bIndex) + where TLeaves : unmanaged { - struct LeavesHandledInPostPass { } - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static void BuildNode( - BoundingBox4 a, BoundingBox4 b, - int leafCountA, int leafCountB, - Buffer subtrees, Buffer nodes, Buffer metanodes, 
- int nodeIndex, int parentNodeIndex, int childIndexInParent, int subtreeCountA, int subtreeCountB, ref TLeaves leaves, out int aIndex, out int bIndex) - where TLeaves : unmanaged + Debug.Assert(typeof(TLeaves) == typeof(LeavesHandledInPostPass) || typeof(TLeaves) == typeof(Buffer), "While we didn't bother with an interface here, we assume one of two types only."); + if (metanodes.Allocated) { - Debug.Assert(typeof(TLeaves) == typeof(LeavesHandledInPostPass) || typeof(TLeaves) == typeof(Buffer), "While we didn't bother with an interface here, we assume one of two types only."); - if (metanodes.Allocated) - { - //Note that we touching the metanodes buffer is *conditional*. There won't be any metanodes in the refinement use case, for example, because it has to be handled in a postpass. - ref var metanode = ref metanodes[0]; - metanode.Parent = parentNodeIndex; - metanode.IndexInParent = childIndexInParent; - metanode.RefineFlag = 0; - } - ref var node = ref nodes[0]; - node.A = Unsafe.As(ref a); - node.B = Unsafe.As(ref b); - node.A.LeafCount = leafCountA; - node.B.LeafCount = leafCountB; - if (subtreeCountA == 1) - { - aIndex = subtrees[0].Index; - if (typeof(TLeaves) == typeof(Buffer)) - { - Debug.Assert(leafCountA == 1); - Debug.Assert(aIndex < 0, "During building, any subtreeCount of 1 should imply a leaf."); - //This is a leaf node, and this is a direct builder execution, so write to the leaf data. - Unsafe.As>(ref leaves)[Encode(aIndex)] = new Leaf(nodeIndex, 0); - } - } - else - { - aIndex = nodeIndex + 1; - } - node.A.Index = aIndex; - if (subtreeCountB == 1) + //Note that we touching the metanodes buffer is *conditional*. There won't be any metanodes in the refinement use case, for example, because it has to be handled in a postpass. 
+ ref var metanode = ref metanodes[0]; + metanode.Parent = parentNodeIndex; + metanode.IndexInParent = childIndexInParent; + metanode.RefineFlag = 0; + } + ref var node = ref nodes[0]; + node.A = Unsafe.As(ref a); + node.B = Unsafe.As(ref b); + node.A.LeafCount = leafCountA; + node.B.LeafCount = leafCountB; + if (subtreeCountA == 1) + { + aIndex = subtrees[0].Index; + if (typeof(TLeaves) == typeof(Buffer)) { - bIndex = subtrees[^1].Index; - if (typeof(TLeaves) == typeof(Buffer)) - { - Debug.Assert(leafCountB == 1); - Debug.Assert(bIndex < 0, "During building, any subtreeCount of 1 should imply a leaf."); - //This is a leaf node, and this is a direct builder execution, so write to the leaf data. - Unsafe.As>(ref leaves)[Encode(bIndex)] = new Leaf(nodeIndex, 1); - } + Debug.Assert(leafCountA == 1); + Debug.Assert(aIndex < 0, "During building, any subtreeCount of 1 should imply a leaf."); + //This is a leaf node, and this is a direct builder execution, so write to the leaf data. + Unsafe.As>(ref leaves)[Encode(aIndex)] = new Leaf(nodeIndex, 0); } - else + } + else + { + aIndex = nodeIndex + 1; + } + node.A.Index = aIndex; + if (subtreeCountB == 1) + { + bIndex = subtrees[^1].Index; + if (typeof(TLeaves) == typeof(Buffer)) { - bIndex = nodeIndex + subtreeCountA; //parentNodeIndex + 1 + (subtreeCountA - 1) + Debug.Assert(leafCountB == 1); + Debug.Assert(bIndex < 0, "During building, any subtreeCount of 1 should imply a leaf."); + //This is a leaf node, and this is a direct builder execution, so write to the leaf data. + Unsafe.As>(ref leaves)[Encode(bIndex)] = new Leaf(nodeIndex, 1); } - node.B.Index = bIndex; } - - internal static float ComputeBoundsMetric(BoundingBox4 bounds) => ComputeBoundsMetric(bounds.Min, bounds.Max); - internal static float ComputeBoundsMetric(Vector4 min, Vector4 max) + else { - //Note that we just use the SAH. 
While we are primarily interested in volume queries for the purposes of collision detection, the topological difference - //between a volume heuristic and surface area heuristic isn't huge. There is, however, one big annoying issue that volume heuristics run into: - //all bounding boxes with one extent equal to zero have zero cost. Surface area approaches avoid this hole simply. - var offset = max - min; - //Note that this is merely proportional to surface area. Being scaled by a constant factor is irrelevant. - return offset.X * offset.Y + offset.Y * offset.Z + offset.Z * offset.X; - + bIndex = nodeIndex + subtreeCountA; //parentNodeIndex + 1 + (subtreeCountA - 1) } + node.B.Index = bIndex; + } + + /// + /// Computes a local cost estimate for a node child using its bounds and leaf count. + /// Handy for + /// + /// Child to estimate the cost of. + /// Estimated cost of the child. + internal static float EstimateCost(NodeChild child) => ComputeBoundsMetric(Unsafe.As(ref child)) * child.LeafCount; + internal static float ComputeBoundsMetric(BoundingBox4 bounds) => ComputeBoundsMetric(bounds.Min, bounds.Max); + internal static float ComputeBoundsMetric(Vector4 min, Vector4 max) + { + //Note that we just use the SAH. While we are primarily interested in volume queries for the purposes of collision detection, the topological difference + //between a volume heuristic and surface area heuristic isn't huge. There is, however, one big annoying issue that volume heuristics run into: + //all bounding boxes with one extent equal to zero have zero cost. Surface area approaches avoid this hole simply. + var offset = max - min; + //Note that this is merely proportional to surface area. Being scaled by a constant factor is irrelevant. 
+ return offset.X * offset.Y + offset.Y * offset.Z + offset.Z * offset.X; + } - interface IBinnedBuilderThreading - { - void GetBins(int workerIndex, - out Buffer binBoundingBoxes, out Buffer binCentroidBoundingBoxes, - out Buffer binBoundingBoxesScan, out Buffer binCentroidBoundingBoxesScan, out Buffer binLeafCounts); - } + interface IBinnedBuilderThreading + { + void GetBins(int workerIndex, + out Buffer binBoundingBoxes, out Buffer binCentroidBoundingBoxes, + out Buffer binBoundingBoxesScan, out Buffer binCentroidBoundingBoxesScan, out Buffer binLeafCounts); + } - struct Context - where TLeaves : unmanaged - where TThreading : unmanaged, IBinnedBuilderThreading - { - public int MinimumBinCount; - public int MaximumBinCount; - public float LeafToBinMultiplier; - public int MicrosweepThreshold; + struct Context + where TLeaves : unmanaged + where TThreading : unmanaged, IBinnedBuilderThreading + { + public int MinimumBinCount; + public int MaximumBinCount; + public float LeafToBinMultiplier; + public int MicrosweepThreshold; - public bool Deterministic; + public bool Deterministic; - public TLeaves Leaves; - public Buffer SubtreesPing; - public Buffer SubtreesPong; - public Buffer Nodes; - public Buffer Metanodes; + public TLeaves Leaves; + public Buffer SubtreesPing; + public Buffer SubtreesPong; + public Buffer Nodes; + public Buffer Metanodes; - public Buffer BinIndices; + public Buffer BinIndices; - public TThreading Threading; + public TThreading Threading; - public Context(int minimumBinCount, int maximumBinCount, float leafToBinMultiplier, int microsweepThreshold, bool deterministic, - Buffer subtreesPing, Buffer subtreesPong, TLeaves leaves, Buffer nodes, Buffer metanodes, Buffer binIndices, TThreading threading) - { - MinimumBinCount = minimumBinCount; - MaximumBinCount = maximumBinCount; - LeafToBinMultiplier = leafToBinMultiplier; - MicrosweepThreshold = microsweepThreshold; - Deterministic = deterministic; - SubtreesPing = subtreesPing; - SubtreesPong 
= subtreesPong; - BinIndices = binIndices; - Leaves = leaves; - Nodes = nodes; - Metanodes = metanodes; - Threading = threading; - } + public Context(int minimumBinCount, int maximumBinCount, float leafToBinMultiplier, int microsweepThreshold, bool deterministic, + Buffer subtreesPing, Buffer subtreesPong, TLeaves leaves, Buffer nodes, Buffer metanodes, Buffer binIndices, TThreading threading) + { + MinimumBinCount = minimumBinCount; + MaximumBinCount = maximumBinCount; + LeafToBinMultiplier = leafToBinMultiplier; + MicrosweepThreshold = microsweepThreshold; + Deterministic = deterministic; + SubtreesPing = subtreesPing; + SubtreesPong = subtreesPong; + BinIndices = binIndices; + Leaves = leaves; + Nodes = nodes; + Metanodes = metanodes; + Threading = threading; } + } - struct BoundsComparerX : IComparerRef { public int Compare(ref NodeChild a, ref NodeChild b) => (a.Min.X + a.Max.X) > (b.Min.X + b.Max.X) ? -1 : 1; } - struct BoundsComparerY : IComparerRef { public int Compare(ref NodeChild a, ref NodeChild b) => (a.Min.Y + a.Max.Y) > (b.Min.Y + b.Max.Y) ? -1 : 1; } - struct BoundsComparerZ : IComparerRef { public int Compare(ref NodeChild a, ref NodeChild b) => (a.Min.Z + a.Max.Z) > (b.Min.Z + b.Max.Z) ? -1 : 1; } + struct BoundsComparerX : IComparerRef { public int Compare(ref NodeChild a, ref NodeChild b) => (a.Min.X + a.Max.X) > (b.Min.X + b.Max.X) ? -1 : 1; } + struct BoundsComparerY : IComparerRef { public int Compare(ref NodeChild a, ref NodeChild b) => (a.Min.Y + a.Max.Y) > (b.Min.Y + b.Max.Y) ? -1 : 1; } + struct BoundsComparerZ : IComparerRef { public int Compare(ref NodeChild a, ref NodeChild b) => (a.Min.Z + a.Max.Z) > (b.Min.Z + b.Max.Z) ? 
-1 : 1; } - public struct NodeTimes - { - public double Total; - public double CentroidPrepass; - public double Binning; - public double Partition; - public bool MTPrepass; - public bool MTBinning; - public bool MTPartition; - public int TargetTaskCount; - public int SubtreeCount; - } + public struct NodeTimes + { + public double Total; + public double CentroidPrepass; + public double Binning; + public double Partition; + public bool MTPrepass; + public bool MTBinning; + public bool MTPartition; + public int TargetTaskCount; + public int SubtreeCount; + } - public static NodeTimes[] Times; + public static NodeTimes[] Times; - static unsafe void MicroSweepForBinnedBuilder( - Vector4 centroidMin, Vector4 centroidMax, ref TLeaves leaves, - Buffer subtrees, Buffer nodes, Buffer metanodes, int nodeIndex, int parentNodeIndex, int childIndexInParent, Context* context, int workerIndex) - where TLeaves : unmanaged where TThreading : unmanaged, IBinnedBuilderThreading + static unsafe void MicroSweepForBinnedBuilder( + Vector4 centroidMin, Vector4 centroidMax, ref TLeaves leaves, + Buffer subtrees, Buffer nodes, Buffer metanodes, int nodeIndex, int parentNodeIndex, int childIndexInParent, Context* context, int workerIndex) + where TLeaves : unmanaged where TThreading : unmanaged, IBinnedBuilderThreading + { + //This is a very small scale sweep build. + var subtreeCount = subtrees.Length; + if (subtreeCount == 2) { - //This is a very small scale sweep build. 
- var subtreeCount = subtrees.Length; - if (subtreeCount == 2) - { - ref var subtreeA = ref subtrees[0]; - ref var subtreeB = ref subtrees[1]; - Debug.Assert(parentNodeIndex < 0 || Unsafe.Add(ref context->Nodes[parentNodeIndex].A, childIndexInParent).LeafCount == subtreeA.LeafCount + subtreeB.LeafCount); - BuildNode(Unsafe.As(ref subtreeA), Unsafe.As(ref subtreeB), subtreeA.LeafCount, subtreeB.LeafCount, subtrees, - nodes, metanodes, nodeIndex, parentNodeIndex, childIndexInParent, 1, 1, ref leaves, out _, out _); - return; - } - var centroidSpan = centroidMax - centroidMin; - var axisIsDegenerate = Vector128.LessThanOrEqual(centroidSpan.AsVector128(), Vector128.Create(1e-12f)); - if ((Vector128.ExtractMostSignificantBits(axisIsDegenerate) & 0b111) == 0b111) - { - //Looks like all the centroids are in the same spot; there's no meaningful way to split this. - HandleMicrosweepDegeneracy(ref leaves, subtrees, nodes, metanodes, nodeIndex, parentNodeIndex, childIndexInParent, centroidMin, centroidMax, context, workerIndex); - return; - } + ref var subtreeA = ref subtrees[0]; + ref var subtreeB = ref subtrees[1]; + Debug.Assert(parentNodeIndex < 0 || Unsafe.Add(ref context->Nodes[parentNodeIndex].A, childIndexInParent).LeafCount == subtreeA.LeafCount + subtreeB.LeafCount); + BuildNode(Unsafe.As(ref subtreeA), Unsafe.As(ref subtreeB), subtreeA.LeafCount, subtreeB.LeafCount, subtrees, + nodes, metanodes, nodeIndex, parentNodeIndex, childIndexInParent, 1, 1, ref leaves, out _, out _); + return; + } + var centroidSpan = centroidMax - centroidMin; + var axisIsDegenerate = Vector128.LessThanOrEqual(centroidSpan.AsVector128(), Vector128.Create(1e-12f)); + if ((Vector128.ExtractMostSignificantBits(axisIsDegenerate) & 0b111) == 0b111) + { + //Looks like all the centroids are in the same spot; there's no meaningful way to split this. 
+ HandleMicrosweepDegeneracy(ref leaves, subtrees, nodes, metanodes, nodeIndex, parentNodeIndex, childIndexInParent, centroidMin, centroidMax, context, workerIndex); + return; + } - context->Threading.GetBins(workerIndex, out var binBoundingBoxes, out var binCentroidBoundingBoxes, out var binBoundingBoxesScan, out var binCentroidBoundingBoxesScan, out var binLeafCounts); + context->Threading.GetBins(workerIndex, out var binBoundingBoxes, out var binCentroidBoundingBoxes, out var binBoundingBoxesScan, out var binCentroidBoundingBoxesScan, out var binLeafCounts); - if (Vector256.IsHardwareAccelerated || Vector128.IsHardwareAccelerated) - { - //Repurpose the bins memory so we don't need to allocate any extra. The bins aren't in use right now anyway. - int paddedKeyCount = Vector256.IsHardwareAccelerated ? ((subtreeCount + 7) / 8) * 8 : ((subtreeCount + 3) / 4) * 4; + if (Vector256.IsHardwareAccelerated || Vector128.IsHardwareAccelerated) + { + //Repurpose the bins memory so we don't need to allocate any extra. The bins aren't in use right now anyway. + int paddedKeyCount = Vector256.IsHardwareAccelerated ? ((subtreeCount + 7) / 8) * 8 : ((subtreeCount + 3) / 4) * 4; - Debug.Assert(Unsafe.SizeOf() * binBoundingBoxes.Length >= (paddedKeyCount * 2 + subtreeCount) * Unsafe.SizeOf(), - "The bins should preallocate enough space to handle the needs of microsweeps. They reuse the same allocations."); - var keys = new Buffer(binBoundingBoxes.Memory, paddedKeyCount); - var targetIndices = new Buffer(keys.Memory + paddedKeyCount, paddedKeyCount); + Debug.Assert(Unsafe.SizeOf() * binBoundingBoxes.Length >= (paddedKeyCount * 2 + subtreeCount) * Unsafe.SizeOf(), + "The bins should preallocate enough space to handle the needs of microsweeps. 
They reuse the same allocations."); + var keys = new Buffer(binBoundingBoxes.Memory, paddedKeyCount); + var targetIndices = new Buffer(keys.Memory + paddedKeyCount, paddedKeyCount); - //Compute the axis centroids up front to avoid having to recompute them during a sort. - if (centroidSpan.X > centroidSpan.Y && centroidSpan.X > centroidSpan.Z) - { - for (int i = 0; i < subtreeCount; ++i) - { - ref var bounds = ref subtrees[i]; - keys[i] = bounds.Min.X + bounds.Max.X; - } - } - else if (centroidSpan.Y > centroidSpan.Z) - { - for (int i = 0; i < subtreeCount; ++i) - { - ref var bounds = ref subtrees[i]; - keys[i] = bounds.Min.Y + bounds.Max.Y; - } - } - else - { - for (int i = 0; i < subtreeCount; ++i) - { - ref var bounds = ref subtrees[i]; - keys[i] = bounds.Min.Z + bounds.Max.Z; - } - } - for (int i = subtreeCount; i < paddedKeyCount; ++i) + //Compute the axis centroids up front to avoid having to recompute them during a sort. + if (centroidSpan.X > centroidSpan.Y && centroidSpan.X > centroidSpan.Z) + { + for (int i = 0; i < subtreeCount; ++i) { - keys[i] = float.MaxValue; + ref var bounds = ref subtrees[i]; + keys[i] = bounds.Min.X + bounds.Max.X; } - VectorizedSorts.VectorCountingSort(keys, targetIndices, subtreeCount); - - //Now that we know the target indices, copy things into position. - //Have to copy things into a temporary cache to avoid overwrites since we didn't do any shuffling during the sort. - //Note that we can now reuse the keys memory. - var subtreeCache = binBoundingBoxesScan.As(); - subtrees.CopyTo(0, subtreeCache, 0, subtreeCount); + } + else if (centroidSpan.Y > centroidSpan.Z) + { for (int i = 0; i < subtreeCount; ++i) { - var targetIndex = targetIndices[i]; - subtrees[targetIndex] = subtreeCache[i]; + ref var bounds = ref subtrees[i]; + keys[i] = bounds.Min.Y + bounds.Max.Y; } } else { - //No vectorization supported. Fall back to poopymode! 
- if (centroidSpan.X > centroidSpan.Y && centroidSpan.X > centroidSpan.Z) - { - var comparer = new BoundsComparerX(); - QuickSort.Sort(ref subtrees[0], 0, subtreeCount - 1, ref comparer); - } - else if (centroidSpan.Y > centroidSpan.Z) - { - var comparer = new BoundsComparerY(); - QuickSort.Sort(ref subtrees[0], 0, subtreeCount - 1, ref comparer); - } - else + for (int i = 0; i < subtreeCount; ++i) { - var comparer = new BoundsComparerZ(); - QuickSort.Sort(ref subtrees[0], 0, subtreeCount - 1, ref comparer); + ref var bounds = ref subtrees[i]; + keys[i] = bounds.Min.Z + bounds.Max.Z; } } - - Debug.Assert(subtreeCount <= context->MaximumBinCount || subtreeCount <= context->MicrosweepThreshold, "We're reusing the bin resources under the assumption that this is only ever called when there are less leaves than maximum bins."); - //Identify the split index by examining the SAH of very split option. - //Premerge from left to right so we have a sorta-summed area table to cheaply look up all possible child A bounds as we scan. - var boundingBoxes = subtrees.As(); - binBoundingBoxesScan[0] = boundingBoxes[0]; - int totalLeafCount = subtrees[0].LeafCount; - for (int i = 1; i < subtreeCount; ++i) + for (int i = subtreeCount; i < paddedKeyCount; ++i) { - var previousIndex = i - 1; - ref var previousScanBounds = ref binBoundingBoxesScan[previousIndex]; - ref var scanBounds = ref binBoundingBoxesScan[i]; - ref var bounds = ref boundingBoxes[i]; - scanBounds.Min = Vector4.Min(bounds.Min, previousScanBounds.Min); - scanBounds.Max = Vector4.Max(bounds.Max, previousScanBounds.Max); - totalLeafCount += subtrees[i].LeafCount; + keys[i] = float.MaxValue; } - - float bestSAH = float.MaxValue; - int bestSplit = 1; - //The split index is going to end up in child B. 
- var lastSubtreeIndex = subtreeCount - 1; - BoundingBox4 accumulatedBoundingBoxB = boundingBoxes[lastSubtreeIndex]; - Unsafe.SkipInit(out BoundingBox4 bestBoundsB); - int accumulatedLeafCountB = subtrees[lastSubtreeIndex].LeafCount; - int bestLeafCountB = 0; - for (int splitIndexCandidate = lastSubtreeIndex; splitIndexCandidate >= 1; --splitIndexCandidate) + VectorizedSorts.VectorCountingSort(keys, targetIndices, subtreeCount); + + //Now that we know the target indices, copy things into position. + //Have to copy things into a temporary cache to avoid overwrites since we didn't do any shuffling during the sort. + //Note that we can now reuse the keys memory. + var subtreeCache = binBoundingBoxesScan.As(); + subtrees.CopyTo(0, subtreeCache, 0, subtreeCount); + for (int i = 0; i < subtreeCount; ++i) { - var previousIndex = splitIndexCandidate - 1; - var sahCandidate = - ComputeBoundsMetric(binBoundingBoxesScan[previousIndex]) * (totalLeafCount - accumulatedLeafCountB) + - ComputeBoundsMetric(accumulatedBoundingBoxB) * accumulatedLeafCountB; - if (sahCandidate < bestSAH) - { - bestSAH = sahCandidate; - bestSplit = splitIndexCandidate; - bestBoundsB = accumulatedBoundingBoxB; - bestLeafCountB = accumulatedLeafCountB; - } - ref var bounds = ref boundingBoxes[previousIndex]; - accumulatedBoundingBoxB.Min = Vector4.Min(bounds.Min, accumulatedBoundingBoxB.Min); - accumulatedBoundingBoxB.Max = Vector4.Max(bounds.Max, accumulatedBoundingBoxB.Max); - accumulatedLeafCountB += subtrees[previousIndex].LeafCount; + var targetIndex = targetIndices[i]; + subtrees[targetIndex] = subtreeCache[i]; } - if (bestLeafCountB == 0 || bestLeafCountB == totalLeafCount || bestSAH == float.MaxValue || float.IsNaN(bestSAH) || float.IsInfinity(bestSAH)) + } + else + { + //No vectorization supported. Fall back to poopymode! + if (centroidSpan.X > centroidSpan.Y && centroidSpan.X > centroidSpan.Z) { - //Some form of major problem detected! Fall back to a degenerate split. 
- HandleMicrosweepDegeneracy(ref leaves, subtrees, nodes, metanodes, nodeIndex, parentNodeIndex, childIndexInParent, centroidMin, centroidMax, context, workerIndex); - return; + var comparer = new BoundsComparerX(); + QuickSort.Sort(ref subtrees[0], 0, subtreeCount - 1, ref comparer); } - - var bestBoundsA = binBoundingBoxesScan[bestSplit - 1]; - var subtreeCountA = bestSplit; - var subtreeCountB = subtreeCount - bestSplit; - var bestLeafCountA = totalLeafCount - bestLeafCountB; - - Debug.Assert(parentNodeIndex < 0 || Unsafe.Add(ref context->Nodes[parentNodeIndex].A, childIndexInParent).LeafCount == bestLeafCountA + bestLeafCountB); - BuildNode(bestBoundsA, bestBoundsB, bestLeafCountA, bestLeafCountB, subtrees, nodes, metanodes, nodeIndex, parentNodeIndex, childIndexInParent, subtreeCountA, subtreeCountB, ref leaves, out var aIndex, out var bIndex); - if (subtreeCountA > 1) + else if (centroidSpan.Y > centroidSpan.Z) { - var aBounds = boundingBoxes.Slice(subtreeCountA); - var initialCentroid = aBounds.Memory->Min + aBounds.Memory->Max; - BoundingBox4 centroidBoundsA; - centroidBoundsA.Min = initialCentroid; - centroidBoundsA.Max = initialCentroid; - for (int i = 1; i < subtreeCountA; ++i) - { - ref var bounds = ref aBounds[i]; - var centroid = bounds.Min + bounds.Max; - centroidBoundsA.Min = Vector4.Min(centroidBoundsA.Min, centroid); - centroidBoundsA.Max = Vector4.Max(centroidBoundsA.Max, centroid); - } - MicroSweepForBinnedBuilder(centroidBoundsA.Min, centroidBoundsA.Max, ref leaves, subtrees.Slice(subtreeCountA), nodes.Slice(1, subtreeCountA - 1), metanodes.Allocated ? 
metanodes.Slice(1, subtreeCountA - 1) : metanodes, aIndex, nodeIndex, 0, context, workerIndex); + var comparer = new BoundsComparerY(); + QuickSort.Sort(ref subtrees[0], 0, subtreeCount - 1, ref comparer); } - if (subtreeCountB > 1) + else { - var bBounds = boundingBoxes.Slice(subtreeCountA, subtreeCountB); - var initialCentroid = bBounds.Memory->Min + bBounds.Memory->Max; - BoundingBox4 centroidBoundsB; - centroidBoundsB.Min = initialCentroid; - centroidBoundsB.Max = initialCentroid; - for (int i = 1; i < subtreeCountB; ++i) - { - ref var bounds = ref bBounds[i]; - var centroid = bounds.Min + bounds.Max; - centroidBoundsB.Min = Vector4.Min(centroidBoundsB.Min, centroid); - centroidBoundsB.Max = Vector4.Max(centroidBoundsB.Max, centroid); - } - MicroSweepForBinnedBuilder(centroidBoundsB.Min, centroidBoundsB.Max, ref leaves, subtrees.Slice(subtreeCountA, subtreeCountB), nodes.Slice(subtreeCountA, subtreeCountB - 1), metanodes.Allocated ? metanodes.Slice(subtreeCountA, subtreeCountB - 1) : metanodes, bIndex, nodeIndex, 1, context, workerIndex); + var comparer = new BoundsComparerZ(); + QuickSort.Sort(ref subtrees[0], 0, subtreeCount - 1, ref comparer); } } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static int ComputeBinIndex(Vector4 centroidMin, bool useX, bool useY, Vector128 permuteMask, int axisIndex, Vector4 offsetToBinIndex, Vector4 maximumBinIndex, in BoundingBox4 box) + Debug.Assert(subtreeCount <= context->MaximumBinCount || subtreeCount <= context->MicrosweepThreshold, "We're reusing the bin resources under the assumption that this is only ever called when there are less leaves than maximum bins."); + //Identify the split index by examining the SAH of every split option. + //Premerge from left to right so we have a sorta-summed area table to cheaply look up all possible child A bounds as we scan.
+ var boundingBoxes = subtrees.As(); + binBoundingBoxesScan[0] = boundingBoxes[0]; + int totalLeafCount = subtrees[0].LeafCount; + for (int i = 1; i < subtreeCount; ++i) { - var centroid = box.Min + box.Max; - //Note the clamp against zero as well as maximumBinIndex; going negative *can* happen when the bounding box is corrupted. We'd rather not crash with an access violation. - var binIndicesForLeafContinuous = Vector4.Clamp((centroid - centroidMin) * offsetToBinIndex, Vector4.Zero, maximumBinIndex); - //Note that we don't store out any of the indices into per-bin lists here. We only *really* want two final groups for the children, - //and we can easily compute those by performing another scan. It requires recomputing the bin indices, but that's really not much of a concern. - //To extract the desired lane, we need to use a variable shuffle mask. At the time of writing, the Vector128 cross platform shuffle did not like variable masks. - if (Avx.IsSupported) - return (int)Vector128.ToScalar(Avx.PermuteVar(binIndicesForLeafContinuous.AsVector128(), permuteMask)); - else if (Vector128.IsHardwareAccelerated) - return (int)Vector128.GetElement(binIndicesForLeafContinuous.AsVector128(), axisIndex); - else - return (int)(useX ? binIndicesForLeafContinuous.X : useY ? binIndicesForLeafContinuous.Y : binIndicesForLeafContinuous.Z); + var previousIndex = i - 1; + ref var previousScanBounds = ref binBoundingBoxesScan[previousIndex]; + ref var scanBounds = ref binBoundingBoxesScan[i]; + ref var bounds = ref boundingBoxes[i]; + scanBounds.Min = Vector4.Min(bounds.Min, previousScanBounds.Min); + scanBounds.Max = Vector4.Max(bounds.Max, previousScanBounds.Max); + totalLeafCount += subtrees[i].LeafCount; } - struct SingleThreaded : IBinnedBuilderThreading + float bestSAH = float.MaxValue; + int bestSplit = 1; + //The split index is going to end up in child B. 
+ var lastSubtreeIndex = subtreeCount - 1; + BoundingBox4 accumulatedBoundingBoxB = boundingBoxes[lastSubtreeIndex]; + Unsafe.SkipInit(out BoundingBox4 bestBoundsB); + int accumulatedLeafCountB = subtrees[lastSubtreeIndex].LeafCount; + int bestLeafCountB = 0; + for (int splitIndexCandidate = lastSubtreeIndex; splitIndexCandidate >= 1; --splitIndexCandidate) { - public Buffer BinBoundingBoxes; - public Buffer BinCentroidBoundingBoxes; - public Buffer BinBoundingBoxesScan; - public Buffer BinCentroidBoundingBoxesScan; - public Buffer BinLeafCounts; - - public SingleThreaded(Buffer binAllocationBuffer, int binCapacity) - { - int start = 0; - BinBoundingBoxes = Suballocate(binAllocationBuffer, ref start, binCapacity); - BinCentroidBoundingBoxes = Suballocate(binAllocationBuffer, ref start, binCapacity); - BinBoundingBoxesScan = Suballocate(binAllocationBuffer, ref start, binCapacity); - BinCentroidBoundingBoxesScan = Suballocate(binAllocationBuffer, ref start, binCapacity); - BinLeafCounts = Suballocate(binAllocationBuffer, ref start, binCapacity); - } - - public void GetBins(int workerIndex, - out Buffer binBoundingBoxes, out Buffer binCentroidBoundingBoxes, - out Buffer binBoundingBoxesScan, out Buffer binCentroidBoundingBoxesScan, out Buffer binLeafCounts) + var previousIndex = splitIndexCandidate - 1; + var sahCandidate = + ComputeBoundsMetric(binBoundingBoxesScan[previousIndex]) * (totalLeafCount - accumulatedLeafCountB) + + ComputeBoundsMetric(accumulatedBoundingBoxB) * accumulatedLeafCountB; + if (sahCandidate < bestSAH) { - binBoundingBoxes = BinBoundingBoxes; - binCentroidBoundingBoxes = BinCentroidBoundingBoxes; - binBoundingBoxesScan = BinBoundingBoxesScan; - binCentroidBoundingBoxesScan = BinCentroidBoundingBoxesScan; - binLeafCounts = BinLeafCounts; + bestSAH = sahCandidate; + bestSplit = splitIndexCandidate; + bestBoundsB = accumulatedBoundingBoxB; + bestLeafCountB = accumulatedLeafCountB; } + ref var bounds = ref boundingBoxes[previousIndex]; + 
accumulatedBoundingBoxB.Min = Vector4.Min(bounds.Min, accumulatedBoundingBoxB.Min); + accumulatedBoundingBoxB.Max = Vector4.Max(bounds.Max, accumulatedBoundingBoxB.Max); + accumulatedLeafCountB += subtrees[previousIndex].LeafCount; } - - static Buffer Suballocate(Buffer buffer, ref int start, int count) where T : unmanaged + if (bestLeafCountB == 0 || bestLeafCountB == totalLeafCount || bestSAH == float.MaxValue || float.IsNaN(bestSAH) || float.IsInfinity(bestSAH)) { - var size = count * Unsafe.SizeOf(); - var previousStart = start; - start += size; - return buffer.Slice(previousStart, size).As(); + //Some form of major problem detected! Fall back to a degenerate split. + HandleMicrosweepDegeneracy(ref leaves, subtrees, nodes, metanodes, nodeIndex, parentNodeIndex, childIndexInParent, centroidMin, centroidMax, context, workerIndex); + return; } - /// - /// Stores resources required by a worker to dispatch and manage multithreaded work. - /// - /// - /// Some of the resources cached here are technically redundant with the storage used for workers and ends up involving an extra bin scan on a multithreaded test, - /// but the cost associated with doing so is... low. The complexity cost of trying to use the memory allocated for workers is not low. - /// - struct BinnedBuildWorkerContext + var bestBoundsA = binBoundingBoxesScan[bestSplit - 1]; + var subtreeCountA = bestSplit; + var subtreeCountB = subtreeCount - bestSplit; + var bestLeafCountA = totalLeafCount - bestLeafCountB; + + Debug.Assert(parentNodeIndex < 0 || Unsafe.Add(ref context->Nodes[parentNodeIndex].A, childIndexInParent).LeafCount == bestLeafCountA + bestLeafCountB); + BuildNode(bestBoundsA, bestBoundsB, bestLeafCountA, bestLeafCountB, subtrees, nodes, metanodes, nodeIndex, parentNodeIndex, childIndexInParent, subtreeCountA, subtreeCountB, ref leaves, out var aIndex, out var bIndex); + if (subtreeCountA > 1) { - /// - /// Bins associated with this worker for the duration of a node. 
This allocation will persist across the build. - /// - public Buffer BinBoundingBoxes; - /// - /// Centroid bound bins associated with this worker for the duration of a node. This allocation will persist across the build. - /// - public Buffer BinCentroidBoundingBoxes; - /// - /// Bins associated with this worker for use in the SAH scan. This allocation will persist across the build. - /// - public Buffer BinBoundingBoxesScan; - /// - /// Centroid bound bins associated with this worker for use in the SAH scan. This allocation will persist across the build. - /// - public Buffer BinCentroidBoundingBoxesScan; - /// - /// Bin leaf counts associated with this worker for the duration of a node. This allocation will persist across the build. - /// - public Buffer BinLeafCounts; - - public BinnedBuildWorkerContext(Buffer binAllocationBuffer, ref int binStart, int binCapacity) + var aBounds = boundingBoxes.Slice(subtreeCountA); + var initialCentroid = aBounds.Memory->Min + aBounds.Memory->Max; + BoundingBox4 centroidBoundsA; + centroidBoundsA.Min = initialCentroid; + centroidBoundsA.Max = initialCentroid; + for (int i = 1; i < subtreeCountA; ++i) { - BinBoundingBoxes = Suballocate(binAllocationBuffer, ref binStart, binCapacity); - BinCentroidBoundingBoxes = Suballocate(binAllocationBuffer, ref binStart, binCapacity); - BinBoundingBoxesScan = Suballocate(binAllocationBuffer, ref binStart, binCapacity); - BinCentroidBoundingBoxesScan = Suballocate(binAllocationBuffer, ref binStart, binCapacity); - BinLeafCounts = Suballocate(binAllocationBuffer, ref binStart, binCapacity); + ref var bounds = ref aBounds[i]; + var centroid = bounds.Min + bounds.Max; + centroidBoundsA.Min = Vector4.Min(centroidBoundsA.Min, centroid); + centroidBoundsA.Max = Vector4.Max(centroidBoundsA.Max, centroid); } + MicroSweepForBinnedBuilder(centroidBoundsA.Min, centroidBoundsA.Max, ref leaves, subtrees.Slice(subtreeCountA), nodes.Slice(1, subtreeCountA - 1), metanodes.Allocated ? 
metanodes.Slice(1, subtreeCountA - 1) : metanodes, aIndex, nodeIndex, 0, context, workerIndex); } - unsafe struct MultithreadBinnedBuildContext : IBinnedBuilderThreading + if (subtreeCountB > 1) { - public TaskStack* TaskStack; - /// - /// The number of subtrees present at the root of the build. - /// - public int OriginalSubtreeCount; - /// - /// The target number of tasks that would be used for the root node. Later nodes will tend to target smaller numbers of tasks on the assumption that other parallel nodes will provide enough work to fill in the gaps. - /// - public int TopLevelTargetTaskCount; - public Buffer Workers; - - public void GetBins(int workerIndex, - out Buffer binBoundingBoxes, out Buffer binCentroidBoundingBoxes, - out Buffer binBoundingBoxesScan, out Buffer binCentroidBoundingBoxesScan, out Buffer binLeafCounts) + var bBounds = boundingBoxes.Slice(subtreeCountA, subtreeCountB); + var initialCentroid = bBounds.Memory->Min + bBounds.Memory->Max; + BoundingBox4 centroidBoundsB; + centroidBoundsB.Min = initialCentroid; + centroidBoundsB.Max = initialCentroid; + for (int i = 1; i < subtreeCountB; ++i) { - ref var worker = ref Workers[workerIndex]; - binBoundingBoxes = worker.BinBoundingBoxes; - binCentroidBoundingBoxes = worker.BinCentroidBoundingBoxes; - binBoundingBoxesScan = worker.BinBoundingBoxesScan; - binCentroidBoundingBoxesScan = worker.BinCentroidBoundingBoxesScan; - binLeafCounts = worker.BinLeafCounts; + ref var bounds = ref bBounds[i]; + var centroid = bounds.Min + bounds.Max; + centroidBoundsB.Min = Vector4.Min(centroidBoundsB.Min, centroid); + centroidBoundsB.Max = Vector4.Max(centroidBoundsB.Max, centroid); } + MicroSweepForBinnedBuilder(centroidBoundsB.Min, centroidBoundsB.Max, ref leaves, subtrees.Slice(subtreeCountA, subtreeCountB), nodes.Slice(subtreeCountA, subtreeCountB - 1), metanodes.Allocated ? 
metanodes.Slice(subtreeCountA, subtreeCountB - 1) : metanodes, bIndex, nodeIndex, 1, context, workerIndex); + } + } - public int GetTargetTaskCountForInnerLoop(int subtreeCount) - { - return (int)float.Ceiling(TopLevelTargetTaskCount * (float)subtreeCount / OriginalSubtreeCount); - } - public int GetTargetTaskCountForNodes(int subtreeCount) - { - return (int)float.Ceiling(TargetTaskCountMultiplierForNodePushOverInnerLoop * TopLevelTargetTaskCount * (float)subtreeCount / OriginalSubtreeCount); - } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static int ComputeBinIndex(Vector4 centroidMin, bool useX, bool useY, Vector128 permuteMask, int axisIndex, Vector4 offsetToBinIndex, Vector4 maximumBinIndex, in BoundingBox4 box) + { + var centroid = box.Min + box.Max; + //Note the clamp against zero as well as maximumBinIndex; going negative *can* happen when the bounding box is corrupted. We'd rather not crash with an access violation. + var binIndicesForLeafContinuous = Vector4.Clamp((centroid - centroidMin) * offsetToBinIndex, Vector4.Zero, maximumBinIndex); + //Note that we don't store out any of the indices into per-bin lists here. We only *really* want two final groups for the children, + //and we can easily compute those by performing another scan. It requires recomputing the bin indices, but that's really not much of a concern. + //To extract the desired lane, we need to use a variable shuffle mask. At the time of writing, the Vector128 cross platform shuffle did not like variable masks. + if (Avx.IsSupported) + return (int)Vector128.ToScalar(Avx.PermuteVar(binIndicesForLeafContinuous.AsVector128(), permuteMask)); + else if (Vector128.IsHardwareAccelerated) + return (int)Vector128.GetElement(binIndicesForLeafContinuous.AsVector128(), axisIndex); + else + return (int)(useX ? binIndicesForLeafContinuous.X : useY ? 
binIndicesForLeafContinuous.Y : binIndicesForLeafContinuous.Z); + } + + struct SingleThreaded : IBinnedBuilderThreading + { + public Buffer BinBoundingBoxes; + public Buffer BinCentroidBoundingBoxes; + public Buffer BinBoundingBoxesScan; + public Buffer BinCentroidBoundingBoxesScan; + public Buffer BinLeafCounts; + + public SingleThreaded(Buffer binAllocationBuffer, int binCapacity) + { + int start = 0; + BinBoundingBoxes = Suballocate(binAllocationBuffer, ref start, binCapacity); + BinCentroidBoundingBoxes = Suballocate(binAllocationBuffer, ref start, binCapacity); + BinBoundingBoxesScan = Suballocate(binAllocationBuffer, ref start, binCapacity); + BinCentroidBoundingBoxesScan = Suballocate(binAllocationBuffer, ref start, binCapacity); + BinLeafCounts = Suballocate(binAllocationBuffer, ref start, binCapacity); } - const int MinimumSubtreesPerThreadForCentroidPrepass = 1024; - const int MinimumSubtreesPerThreadForBinning = 1024; - const int MinimumSubtreesPerThreadForPartitioning = 1024; - const int MinimumSubtreesPerThreadForNodeJob = 256; - const int TargetTaskCountMultiplierForNodePushOverInnerLoop = 8; + public void GetBins(int workerIndex, + out Buffer binBoundingBoxes, out Buffer binCentroidBoundingBoxes, + out Buffer binBoundingBoxesScan, out Buffer binCentroidBoundingBoxesScan, out Buffer binLeafCounts) + { + binBoundingBoxes = BinBoundingBoxes; + binCentroidBoundingBoxes = BinCentroidBoundingBoxes; + binBoundingBoxesScan = BinBoundingBoxesScan; + binCentroidBoundingBoxesScan = BinCentroidBoundingBoxesScan; + binLeafCounts = BinLeafCounts; + } + } + + static Buffer Suballocate(Buffer buffer, ref int start, int count) where T : unmanaged + { + var size = count * Unsafe.SizeOf(); + var previousStart = start; + start += size; + return buffer.Slice(previousStart, size).As(); + } + + /// + /// Stores resources required by a worker to dispatch and manage multithreaded work. 
+ /// + /// + /// Some of the resources cached here are technically redundant with the storage used for workers and ends up involving an extra bin scan on a multithreaded test, + /// but the cost associated with doing so is... low. The complexity cost of trying to use the memory allocated for workers is not low. + /// + struct BinnedBuildWorkerContext + { + /// + /// Bins associated with this worker for the duration of a node. This allocation will persist across the build. + /// + public Buffer BinBoundingBoxes; /// - /// Random value stored in the upper 32 bits of the job tag submitted for internal multithreading operations. + /// Centroid bound bins associated with this worker for the duration of a node. This allocation will persist across the build. /// - /// Other systems using the same task stack may want to use their own filtering approaches. By using a very specific and unique signature, those other systems are less likely to accidentally collide. - const ulong JobFilterTagHeader = 0xB0A1BF32ul << 32; + public Buffer BinCentroidBoundingBoxes; + /// + /// Bins associated with this worker for use in the SAH scan. This allocation will persist across the build. + /// + public Buffer BinBoundingBoxesScan; + /// + /// Centroid bound bins associated with this worker for use in the SAH scan. This allocation will persist across the build. + /// + public Buffer BinCentroidBoundingBoxesScan; + /// + /// Bin leaf counts associated with this worker for the duration of a node. This allocation will persist across the build. 
+ /// + public Buffer BinLeafCounts; - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static BoundingBox4 ComputeCentroidBounds(Buffer bounds) + public BinnedBuildWorkerContext(Buffer binAllocationBuffer, ref int binStart, int binCapacity) { - BoundingBox4 centroidBounds; - centroidBounds.Min = new Vector4(float.MaxValue); - centroidBounds.Max = new Vector4(float.MinValue); - for (int i = 0; i < bounds.Length; ++i) - { - ref var box = ref bounds[i]; - //Note that centroids never bother scaling by 0.5. It's fine as long as we're consistent. - var centroid = box.Min + box.Max; - centroidBounds.Min = Vector4.Min(centroidBounds.Min, centroid); - centroidBounds.Max = Vector4.Max(centroidBounds.Max, centroid); - } - return centroidBounds; + BinBoundingBoxes = Suballocate(binAllocationBuffer, ref binStart, binCapacity); + BinCentroidBoundingBoxes = Suballocate(binAllocationBuffer, ref binStart, binCapacity); + BinBoundingBoxesScan = Suballocate(binAllocationBuffer, ref binStart, binCapacity); + BinCentroidBoundingBoxesScan = Suballocate(binAllocationBuffer, ref binStart, binCapacity); + BinLeafCounts = Suballocate(binAllocationBuffer, ref binStart, binCapacity); } + } + unsafe struct MultithreadBinnedBuildContext : IBinnedBuilderThreading + { + public TaskStack* TaskStack; + /// + /// The number of subtrees present at the root of the build. + /// + public int OriginalSubtreeCount; + /// + /// The target number of tasks that would be used for the root node. Later nodes will tend to target smaller numbers of tasks on the assumption that other parallel nodes will provide enough work to fill in the gaps. 
+ /// + public int TopLevelTargetTaskCount; + public Buffer Workers; - struct SharedTaskData + public void GetBins(int workerIndex, + out Buffer binBoundingBoxes, out Buffer binCentroidBoundingBoxes, + out Buffer binBoundingBoxesScan, out Buffer binCentroidBoundingBoxesScan, out Buffer binLeafCounts) { - public int WorkerCount; - public int TaskCount; + ref var worker = ref Workers[workerIndex]; + binBoundingBoxes = worker.BinBoundingBoxes; + binCentroidBoundingBoxes = worker.BinCentroidBoundingBoxes; + binBoundingBoxesScan = worker.BinBoundingBoxesScan; + binCentroidBoundingBoxesScan = worker.BinCentroidBoundingBoxesScan; + binLeafCounts = worker.BinLeafCounts; + } - public int SubtreeStartIndex; - public int SubtreeCount; + public int GetTargetTaskCountForInnerLoop(int subtreeCount) + { + return (int)float.Ceiling(TopLevelTargetTaskCount * (float)subtreeCount / OriginalSubtreeCount); + } + public int GetTargetTaskCountForNodes(int subtreeCount) + { + return (int)float.Ceiling(TargetTaskCountMultiplierForNodePushOverInnerLoop * TopLevelTargetTaskCount * (float)subtreeCount / OriginalSubtreeCount); + } + } - public int SlotsPerTaskBase; - public int SlotRemainder; - public bool TaskCountFitsInWorkerCount; + const int MinimumSubtreesPerThreadForCentroidPrepass = 1024; + const int MinimumSubtreesPerThreadForBinning = 1024; + const int MinimumSubtreesPerThreadForPartitioning = 1024; + const int MinimumSubtreesPerThreadForNodeJob = 256; + const int TargetTaskCountMultiplierForNodePushOverInnerLoop = 8; + /// + /// Random value stored in the upper 32 bits of the job tag submitted for internal multithreading operations. + /// + /// Other systems using the same task stack may want to use their own filtering approaches. By using a very specific and unique signature, those other systems are less likely to accidentally collide. 
+ const ulong JobFilterTagHeader = 0xB0A1BF32ul << 32; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static BoundingBox4 ComputeCentroidBounds(Buffer bounds) + { + BoundingBox4 centroidBounds; + centroidBounds.Min = new Vector4(float.MaxValue); + centroidBounds.Max = new Vector4(float.MinValue); + for (int i = 0; i < bounds.Length; ++i) + { + ref var box = ref bounds[i]; + //Note that centroids never bother scaling by 0.5. It's fine as long as we're consistent. + var centroid = box.Min + box.Max; + centroidBounds.Min = Vector4.Min(centroidBounds.Min, centroid); + centroidBounds.Max = Vector4.Max(centroidBounds.Max, centroid); + } + return centroidBounds; + } - public SharedTaskData(int workerCount, int subtreeStartIndex, int slotCount, - int minimumSlotsPerTask, int targetTaskCount) - { - WorkerCount = workerCount; - var taskSize = int.Max(minimumSlotsPerTask, slotCount / targetTaskCount); - TaskCount = (slotCount + taskSize - 1) / taskSize; - SubtreeStartIndex = subtreeStartIndex; - SubtreeCount = slotCount; - SlotsPerTaskBase = slotCount / TaskCount; - SlotRemainder = slotCount - TaskCount * SlotsPerTaskBase; - TaskCountFitsInWorkerCount = TaskCount <= WorkerCount; - } + struct SharedTaskData + { + public int WorkerCount; + public int TaskCount; - public void GetSlotInterval(long taskId, out int start, out int count) - { - var remainderedTaskCount = int.Min(SlotRemainder, (int)taskId); - var earlySlotCount = (SlotsPerTaskBase + 1) * remainderedTaskCount; - var lateSlotCount = SlotsPerTaskBase * (taskId - remainderedTaskCount); - start = SubtreeStartIndex + (int)(earlySlotCount + lateSlotCount); - count = taskId >= SlotRemainder ? SlotsPerTaskBase : SlotsPerTaskBase + 1; - } - } + public int SubtreeStartIndex; + public int SubtreeCount; - struct CentroidPrepassTaskContext - { - public SharedTaskData TaskData; - /// - /// Stores per-worker prepass bounds accumulated over multiple tasks. 
If there are less tasks than workers, then only the lower contiguous region of these bounds are used. - /// This allocation is ephemeral; it is allocated from the current worker when needed. - /// Note that the allocation occurs on the loop dispatching thread: the workers that help with the loop do not have to allocate anything themselves. - /// - public Buffer PrepassWorkers; - /// - /// Buffer containing the bounding boxes of all subtrees in the node. - /// - public Buffer Bounds; - - public CentroidPrepassTaskContext(BufferPool pool, SharedTaskData taskData, Buffer bounds) - { - TaskData = taskData; - pool.Take(int.Min(taskData.WorkerCount, taskData.TaskCount), out PrepassWorkers); - Debug.Assert(PrepassWorkers.Length >= 2); - Bounds = bounds; - } + public int SlotsPerTaskBase; + public int SlotRemainder; + public bool TaskCountFitsInWorkerCount; - public void Dispose(BufferPool pool) => pool.Return(ref PrepassWorkers); + public SharedTaskData(int workerCount, int subtreeStartIndex, int slotCount, + int minimumSlotsPerTask, int targetTaskCount) + { + WorkerCount = workerCount; + var taskSize = int.Max(minimumSlotsPerTask, slotCount / targetTaskCount); + TaskCount = (slotCount + taskSize - 1) / taskSize; + SubtreeStartIndex = subtreeStartIndex; + SubtreeCount = slotCount; + SlotsPerTaskBase = slotCount / TaskCount; + SlotRemainder = slotCount - TaskCount * SlotsPerTaskBase; + TaskCountFitsInWorkerCount = TaskCount <= WorkerCount; } - unsafe static void CentroidPrepassWorker(long taskId, void* untypedContext, int workerIndex, IThreadDispatcher dispatcher) + + public void GetSlotInterval(long taskId, out int start, out int count) { - ref var context = ref *(CentroidPrepassTaskContext*)untypedContext; - Debug.Assert(context.TaskData.WorkerCount > 1 && context.TaskData.TaskCount > 1 && context.TaskData.WorkerCount < 100); - context.TaskData.GetSlotInterval(taskId, out var start, out var count); - var centroidBounds = ComputeCentroidBounds(context.Bounds.Slice(start, 
count)); - if (context.TaskData.TaskCountFitsInWorkerCount) - { - //There were less tasks than workers; directly write into the slot without bothering to merge. - context.PrepassWorkers[(int)taskId] = centroidBounds; - } - else - { - ref var workerBounds = ref context.PrepassWorkers[workerIndex]; - workerBounds.Min = Vector4.Min(workerBounds.Min, centroidBounds.Min); - workerBounds.Max = Vector4.Max(workerBounds.Max, centroidBounds.Max); - } + var remainderedTaskCount = int.Min(SlotRemainder, (int)taskId); + var earlySlotCount = (SlotsPerTaskBase + 1) * remainderedTaskCount; + var lateSlotCount = SlotsPerTaskBase * (taskId - remainderedTaskCount); + start = SubtreeStartIndex + (int)(earlySlotCount + lateSlotCount); + count = taskId >= SlotRemainder ? SlotsPerTaskBase : SlotsPerTaskBase + 1; } + } - unsafe static BoundingBox4 MultithreadedCentroidPrepass(MultithreadBinnedBuildContext* context, Buffer bounds, in SharedTaskData taskData, int workerIndex, IThreadDispatcher dispatcher) + struct CentroidPrepassTaskContext + { + public SharedTaskData TaskData; + /// + /// Stores per-worker prepass bounds accumulated over multiple tasks. If there are less tasks than workers, then only the lower contiguous region of these bounds are used. + /// This allocation is ephemeral; it is allocated from the current worker when needed. + /// Note that the allocation occurs on the loop dispatching thread: the workers that help with the loop do not have to allocate anything themselves. + /// + public Buffer PrepassWorkers; + /// + /// Buffer containing the bounding boxes of all subtrees in the node. 
+ /// + public Buffer Bounds; + + public CentroidPrepassTaskContext(BufferPool pool, SharedTaskData taskData, Buffer bounds) { - ref var worker = ref context->Workers[workerIndex]; - var workerPool = dispatcher.WorkerPools[workerIndex]; - var taskContext = new CentroidPrepassTaskContext(workerPool, taskData, bounds); - var taskCount = taskContext.TaskData.TaskCount; - //Don't bother initializing more slots than we have tasks. Note that this requires special handling on the task level; - //if we have less tasks than workers, then the task needs to distinguish that fact. - var activeWorkerCount = int.Min(taskContext.TaskData.WorkerCount, taskCount); - if (taskCount > taskContext.TaskData.WorkerCount) - { - //Potentially multiple tasks per worker; we must preinitialize slots. - for (int i = 0; i < activeWorkerCount; ++i) - { - ref var workerBounds = ref taskContext.PrepassWorkers[i]; - workerBounds.Min = new Vector4(float.MaxValue); - workerBounds.Max = new Vector4(float.MinValue); - } - } - Debug.Assert(taskContext.TaskData.TaskCount > 0 && taskContext.TaskData.WorkerCount > 0); - //We only want the inner multithreading to work on small, non-recursive jobs. - //Diving into a node at this point would stall the current node and favor more (and smaller) nodes. - //(Note: the centroid prepass only runs at the root, so we don't expect there to be any competition from other nodes *in this tree*, - //but it's possible that the same taskstack is used from multiple binned builds. - //Technically, there's potential interference from other user tasks that have nothing to do with binned building, but... not too concerned at this point.) 
- var tagValue = (uint)workerIndex | JobFilterTagHeader; - var jobFilter = new EqualTagFilter(tagValue); - context->TaskStack->For(&CentroidPrepassWorker, &taskContext, 0, taskCount, workerIndex, dispatcher, ref jobFilter, tagValue); - - var centroidBounds = taskContext.PrepassWorkers[0]; - for (int i = 1; i < activeWorkerCount; ++i) - { - ref var workerBounds = ref taskContext.PrepassWorkers[i]; - centroidBounds.Min = Vector4.Min(workerBounds.Min, centroidBounds.Min); - centroidBounds.Max = Vector4.Max(workerBounds.Max, centroidBounds.Max); - } - taskContext.Dispose(workerPool); - return centroidBounds; + TaskData = taskData; + pool.Take(int.Min(taskData.WorkerCount, taskData.TaskCount), out PrepassWorkers); + Debug.Assert(PrepassWorkers.Length >= 2); + Bounds = bounds; } - struct BinSubtreesWorkerContext + public void Dispose(BufferPool pool) => pool.Return(ref PrepassWorkers); + } + unsafe static void CentroidPrepassWorker(long taskId, void* untypedContext, int workerIndex, IThreadDispatcher dispatcher) + { + ref var context = ref *(CentroidPrepassTaskContext*)untypedContext; + Debug.Assert(context.TaskData.WorkerCount > 1 && context.TaskData.TaskCount > 1 && context.TaskData.WorkerCount < 100); + context.TaskData.GetSlotInterval(taskId, out var start, out var count); + var centroidBounds = ComputeCentroidBounds(context.Bounds.Slice(start, count)); + if (context.TaskData.TaskCountFitsInWorkerCount) + { + //There were less tasks than workers; directly write into the slot without bothering to merge. 
+ context.PrepassWorkers[(int)taskId] = centroidBounds; + } + else { - public Buffer BinBoundingBoxes; - public Buffer BinCentroidBoundingBoxes; - public Buffer BinLeafCounts; + ref var workerBounds = ref context.PrepassWorkers[workerIndex]; + workerBounds.Min = Vector4.Min(workerBounds.Min, centroidBounds.Min); + workerBounds.Max = Vector4.Max(workerBounds.Max, centroidBounds.Max); } - unsafe struct BinSubtreesTaskContext + } + + unsafe static BoundingBox4 MultithreadedCentroidPrepass(MultithreadBinnedBuildContext* context, Buffer bounds, in SharedTaskData taskData, int workerIndex, IThreadDispatcher dispatcher) + { + ref var worker = ref context->Workers[workerIndex]; + var workerPool = dispatcher.WorkerPools[workerIndex]; + var taskContext = new CentroidPrepassTaskContext(workerPool, taskData, bounds); + var taskCount = taskContext.TaskData.TaskCount; + //Don't bother initializing more slots than we have tasks. Note that this requires special handling on the task level; + //if we have less tasks than workers, then the task needs to distinguish that fact. + var activeWorkerCount = int.Min(taskContext.TaskData.WorkerCount, taskCount); + if (taskCount > taskContext.TaskData.WorkerCount) { - public SharedTaskData TaskData; - /// - /// Bins associated with any workers that end up contributing to this worker's dispatch of a binning loop. If there are less tasks than workers, then only the lower contiguous region of these bounds are used. - /// This allocation is ephemeral; it is allocated from the current worker when needed. - /// Note that the allocation occurs on the loop dispatching thread: the workers that help with the loop do not have to allocate anything themselves. - /// - public Buffer BinSubtreesWorkers; - /// - /// Whether a given worker contributed to the subtree binning process. If this worker did not contribute, there's no reason to merge its bins. - /// This allocation is ephemeral; it is allocated from the current worker when needed. 
- /// Note that the allocation occurs on the loop dispatching thread: the workers that help with the loop do not have to allocate anything themselves. - /// - public Buffer WorkerHelpedWithBinning; - - /// - /// Buffer containing all subtrees in this node. - /// - public Buffer Subtrees; - - /// - /// Stores the bin indices of all subtrees in the node. - /// - public Buffer BinIndices; - - public int BinCount; - public bool UseX, UseY; - public Vector128 PermuteMask; - public int AxisIndex; - public Vector4 CentroidBoundsMin; - public Vector4 OffsetToBinIndex; - public Vector4 MaximumBinIndex; - - public BinSubtreesTaskContext(BufferPool pool, SharedTaskData taskData, Buffer subtrees, Buffer binIndices, - int binCount, bool useX, bool useY, Vector128 permuteMask, int axisIndex, - Vector4 centroidBoundsMin, Vector4 offsetToBinIndex, Vector4 maximumBinIndex) + //Potentially multiple tasks per worker; we must preinitialize slots. + for (int i = 0; i < activeWorkerCount; ++i) { - TaskData = taskData; - Subtrees = subtrees; - BinIndices = binIndices; - BinCount = binCount; - UseX = useX; - UseY = useY; - PermuteMask = permuteMask; - AxisIndex = axisIndex; - CentroidBoundsMin = centroidBoundsMin; - OffsetToBinIndex = offsetToBinIndex; - MaximumBinIndex = maximumBinIndex; - var effectiveWorkerCount = int.Min(taskData.WorkerCount, taskData.TaskCount); - //Pull one allocation from the pool instead of 1 + workerCount * 2. Slight reduction in overhead. Note that this means we only need to return one buffer of the associated id at the end! 
- var allocationSize = (sizeof(BinSubtreesWorkerContext) + (sizeof(BoundingBox4) * 2 + sizeof(int)) * binCount + sizeof(bool) * taskData.WorkerCount) * effectiveWorkerCount; - pool.Take(allocationSize, out var allocation); - int start = 0; - BinSubtreesWorkers = Suballocate(allocation, ref start, effectiveWorkerCount); - for (int i = 0; i < effectiveWorkerCount; ++i) - { - ref var worker = ref BinSubtreesWorkers[i]; - worker.BinBoundingBoxes = Suballocate(allocation, ref start, BinCount); - worker.BinCentroidBoundingBoxes = Suballocate(allocation, ref start, BinCount); - worker.BinLeafCounts = Suballocate(allocation, ref start, BinCount); - } - WorkerHelpedWithBinning = Suballocate(allocation, ref start, effectiveWorkerCount); - WorkerHelpedWithBinning.Clear(0, effectiveWorkerCount); + ref var workerBounds = ref taskContext.PrepassWorkers[i]; + workerBounds.Min = new Vector4(float.MaxValue); + workerBounds.Max = new Vector4(float.MinValue); } - public void Dispose(BufferPool pool) => pool.Return(ref BinSubtreesWorkers); //Only need to return the main buffer because all the other allocations share the same id! } - //these type-level booleans let the compiler avoid branching in the binning loop. The bin indices buffer is not guaranteed to exist. - //i apologize + Debug.Assert(taskContext.TaskData.TaskCount > 0 && taskContext.TaskData.WorkerCount > 0); + //We only want the inner multithreading to work on small, non-recursive jobs. + //Diving into a node at this point would stall the current node and favor more (and smaller) nodes. + //(Note: the centroid prepass only runs at the root, so we don't expect there to be any competition from other nodes *in this tree*, + //but it's possible that the same taskstack is used from multiple binned builds. + //Technically, there's potential interference from other user tasks that have nothing to do with binned building, but... not too concerned at this point.) 
+ var tagValue = (uint)workerIndex | JobFilterTagHeader; + var jobFilter = new EqualTagFilter(tagValue); + context->TaskStack->For(&CentroidPrepassWorker, &taskContext, 0, taskCount, workerIndex, dispatcher, ref jobFilter, tagValue); + + var centroidBounds = taskContext.PrepassWorkers[0]; + for (int i = 1; i < activeWorkerCount; ++i) + { + ref var workerBounds = ref taskContext.PrepassWorkers[i]; + centroidBounds.Min = Vector4.Min(workerBounds.Min, centroidBounds.Min); + centroidBounds.Max = Vector4.Max(workerBounds.Max, centroidBounds.Max); + } + taskContext.Dispose(workerPool); + return centroidBounds; + } + + struct BinSubtreesWorkerContext + { + public Buffer BinBoundingBoxes; + public Buffer BinCentroidBoundingBoxes; + public Buffer BinLeafCounts; + } + unsafe struct BinSubtreesTaskContext + { + public SharedTaskData TaskData; /// - /// Marks a call as requiring the bin indices to be written to the binIndices buffer. + /// Bins associated with any workers that end up contributing to this worker's dispatch of a binning loop. If there are less tasks than workers, then only the lower contiguous region of these bounds are used. + /// This allocation is ephemeral; it is allocated from the current worker when needed. + /// Note that the allocation occurs on the loop dispatching thread: the workers that help with the loop do not have to allocate anything themselves. /// - private struct DoWriteBinIndices { } + public Buffer BinSubtreesWorkers; /// - /// Marks a call as not allowing the bin indices to be written to the binIndices buffer. + /// Whether a given worker contributed to the subtree binning process. If this worker did not contribute, there's no reason to merge its bins. + /// This allocation is ephemeral; it is allocated from the current worker when needed. + /// Note that the allocation occurs on the loop dispatching thread: the workers that help with the loop do not have to allocate anything themselves. 
/// - private struct DoNotWriteBinIndices { } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - static void BinSubtrees(Vector4 centroidBoundsMin, - bool useX, bool useY, Vector128 permuteMask, int axisIndex, Vector4 offsetToBinIndex, Vector4 maximumBinIndex, - Buffer subtrees, Buffer binBoundingBoxes, Buffer binCentroidBoundingBoxes, Buffer binLeafCounts, Buffer binIndices) - where TShouldWriteBinIndices : unmanaged + public Buffer WorkerHelpedWithBinning; + + /// + /// Buffer containing all subtrees in this node. + /// + public Buffer Subtrees; + + /// + /// Stores the bin indices of all subtrees in the node. + /// + public Buffer BinIndices; + + public int BinCount; + public bool UseX, UseY; + public Vector128 PermuteMask; + public int AxisIndex; + public Vector4 CentroidBoundsMin; + public Vector4 OffsetToBinIndex; + public Vector4 MaximumBinIndex; + + public BinSubtreesTaskContext(BufferPool pool, SharedTaskData taskData, Buffer subtrees, Buffer binIndices, + int binCount, bool useX, bool useY, Vector128 permuteMask, int axisIndex, + Vector4 centroidBoundsMin, Vector4 offsetToBinIndex, Vector4 maximumBinIndex) + { + TaskData = taskData; + Subtrees = subtrees; + BinIndices = binIndices; + BinCount = binCount; + UseX = useX; + UseY = useY; + PermuteMask = permuteMask; + AxisIndex = axisIndex; + CentroidBoundsMin = centroidBoundsMin; + OffsetToBinIndex = offsetToBinIndex; + MaximumBinIndex = maximumBinIndex; + var effectiveWorkerCount = int.Min(taskData.WorkerCount, taskData.TaskCount); + //Pull one allocation from the pool instead of 1 + workerCount * 2. Slight reduction in overhead. Note that this means we only need to return one buffer of the associated id at the end! 
+ var allocationSize = (sizeof(BinSubtreesWorkerContext) + (sizeof(BoundingBox4) * 2 + sizeof(int)) * binCount + sizeof(bool) * taskData.WorkerCount) * effectiveWorkerCount; + pool.Take(allocationSize, out var allocation); + int start = 0; + BinSubtreesWorkers = Suballocate(allocation, ref start, effectiveWorkerCount); + for (int i = 0; i < effectiveWorkerCount; ++i) + { + ref var worker = ref BinSubtreesWorkers[i]; + worker.BinBoundingBoxes = Suballocate(allocation, ref start, BinCount); + worker.BinCentroidBoundingBoxes = Suballocate(allocation, ref start, BinCount); + worker.BinLeafCounts = Suballocate(allocation, ref start, BinCount); + } + WorkerHelpedWithBinning = Suballocate(allocation, ref start, effectiveWorkerCount); + WorkerHelpedWithBinning.Clear(0, effectiveWorkerCount); + } + public void Dispose(BufferPool pool) => pool.Return(ref BinSubtreesWorkers); //Only need to return the main buffer because all the other allocations share the same id! + } + //these type-level booleans let the compiler avoid branching in the binning loop. The bin indices buffer is not guaranteed to exist. + //i apologize + /// + /// Marks a call as requiring the bin indices to be written to the binIndices buffer. + /// + private struct DoWriteBinIndices { } + /// + /// Marks a call as not allowing the bin indices to be written to the binIndices buffer. + /// + private struct DoNotWriteBinIndices { } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + static void BinSubtrees(Vector4 centroidBoundsMin, + bool useX, bool useY, Vector128 permuteMask, int axisIndex, Vector4 offsetToBinIndex, Vector4 maximumBinIndex, + Buffer subtrees, Buffer binBoundingBoxes, Buffer binCentroidBoundingBoxes, Buffer binLeafCounts, Buffer binIndices) + where TShouldWriteBinIndices : unmanaged + { + //Note that we don't store out any of the indices into per-bin lists here. We only *really* want two final groups for the children, + //and we can easily compute those by performing another scan. 
It requires recomputing the bin indices, but that's really not much of a concern. + for (int i = 0; i < subtrees.Length; ++i) + { + ref var subtree = ref subtrees[i]; + ref var box = ref Unsafe.As(ref subtree); + var binIndex = ComputeBinIndex(centroidBoundsMin, useX, useY, permuteMask, axisIndex, offsetToBinIndex, maximumBinIndex, box); + if (typeof(TShouldWriteBinIndices) == typeof(DoWriteBinIndices)) + binIndices[i] = (byte)binIndex; + ref var binBounds = ref binBoundingBoxes[binIndex]; + binBounds.Min = Vector4.Min(binBounds.Min, box.Min); + binBounds.Max = Vector4.Max(binBounds.Max, box.Max); + //The binning phase also keeps track of *centroid* bounding boxes so that we don't have to do a dedicated centroid prepass for each node. + //(A centroid prepass would require touching every single subtree again, and, for large trees, that's a lot of uncached (or distant) memory accesses.) + var centroid = box.Min + box.Max; + ref var binCentroidBounds = ref binCentroidBoundingBoxes[binIndex]; + binCentroidBounds.Min = Vector4.Min(binCentroidBounds.Min, centroid); + binCentroidBounds.Max = Vector4.Max(binCentroidBounds.Max, centroid); + binLeafCounts[binIndex] += subtree.LeafCount; + } + } + unsafe static void BinSubtreesWorker(long taskId, void* untypedContext, int workerIndex, IThreadDispatcher dispatcher) + { + ref var context = ref *(BinSubtreesTaskContext*)untypedContext; + Debug.Assert(context.TaskData.WorkerCount > 1 && context.TaskData.TaskCount > 1 && context.TaskData.WorkerCount < 100); + //Note that if we have more workers than tasks, we use the task id to index into the caches (and initialize the data here rather then before dispatching). + var effectiveWorkerIndex = context.TaskData.TaskCountFitsInWorkerCount ? 
(int)taskId : workerIndex; + ref var worker = ref context.BinSubtreesWorkers[effectiveWorkerIndex]; + context.WorkerHelpedWithBinning[effectiveWorkerIndex] = true; + if (context.TaskData.TaskCountFitsInWorkerCount) { - //Note that we don't store out any of the indices into per-bin lists here. We only *really* want two final groups for the children, - //and we can easily compute those by performing another scan. It requires recomputing the bin indices, but that's really not much of a concern. - for (int i = 0; i < subtrees.Length; ++i) + for (int i = 0; i < context.BinCount; ++i) { - ref var subtree = ref subtrees[i]; - ref var box = ref Unsafe.As(ref subtree); - var binIndex = ComputeBinIndex(centroidBoundsMin, useX, useY, permuteMask, axisIndex, offsetToBinIndex, maximumBinIndex, box); - if (typeof(TShouldWriteBinIndices) == typeof(DoWriteBinIndices)) - binIndices[i] = (byte)binIndex; - ref var binBounds = ref binBoundingBoxes[binIndex]; - binBounds.Min = Vector4.Min(binBounds.Min, box.Min); - binBounds.Max = Vector4.Max(binBounds.Max, box.Max); - //The binning phase also keeps track of *centroid* bounding boxes so that we don't have to do a dedicated centroid prepass for each node. - //(A centroid prepass would require touching every single subtree again, and, for large trees, that's a lot of uncached (or distant) memory accesses.) 
- var centroid = box.Min + box.Max; - ref var binCentroidBounds = ref binCentroidBoundingBoxes[binIndex]; - binCentroidBounds.Min = Vector4.Min(binCentroidBounds.Min, centroid); - binCentroidBounds.Max = Vector4.Max(binCentroidBounds.Max, centroid); - binLeafCounts[binIndex] += subtree.LeafCount; + ref var binBounds = ref worker.BinBoundingBoxes[i]; + binBounds.Min = new Vector4(float.MaxValue); + binBounds.Max = new Vector4(float.MinValue); + ref var binCentroidBounds = ref worker.BinCentroidBoundingBoxes[i]; + binCentroidBounds.Min = new Vector4(float.MaxValue); + binCentroidBounds.Max = new Vector4(float.MinValue); + worker.BinLeafCounts[i] = 0; } } - unsafe static void BinSubtreesWorker(long taskId, void* untypedContext, int workerIndex, IThreadDispatcher dispatcher) + context.TaskData.GetSlotInterval(taskId, out var start, out var count); + //We always write bin indices, because threading always has a bufferpool available to allocate bin indices from. + Debug.Assert(context.BinIndices.Allocated); + BinSubtrees(context.CentroidBoundsMin, context.UseX, context.UseY, context.PermuteMask, context.AxisIndex, context.OffsetToBinIndex, context.MaximumBinIndex, + context.Subtrees.Slice(start, count), worker.BinBoundingBoxes, worker.BinCentroidBoundingBoxes, worker.BinLeafCounts, context.BinIndices.Slice(start, count)); + } + + unsafe static void MultithreadedBinSubtrees(MultithreadBinnedBuildContext* context, + Vector4 centroidBoundsMin, bool useX, bool useY, Vector128 permuteMask, int axisIndex, Vector4 offsetToBinIndex, Vector4 maximumBinIndex, + Buffer subtrees, Buffer subtreeBinIndices, int binCount, in SharedTaskData taskData, int workerIndex, IThreadDispatcher dispatcher) + { + ref var worker = ref context->Workers[workerIndex]; + var workerPool = dispatcher.WorkerPools[workerIndex]; + var taskContext = new BinSubtreesTaskContext( + workerPool, taskData, subtrees, subtreeBinIndices, binCount, useX, useY, permuteMask, axisIndex, centroidBoundsMin, 
offsetToBinIndex, maximumBinIndex); + + //Don't bother initializing more slots than we have tasks. Note that this requires special handling on the task level; + //if we have less tasks than workers, then the task needs to distinguish that fact. + var activeWorkerCount = int.Min(context->Workers.Length, taskContext.TaskData.TaskCount); + if (!taskContext.TaskData.TaskCountFitsInWorkerCount) { - ref var context = ref *(BinSubtreesTaskContext*)untypedContext; - Debug.Assert(context.TaskData.WorkerCount > 1 && context.TaskData.TaskCount > 1 && context.TaskData.WorkerCount < 100); - //Note that if we have more workers than tasks, we use the task id to index into the caches (and initialize the data here rather then before dispatching). - var effectiveWorkerIndex = context.TaskData.TaskCountFitsInWorkerCount ? (int)taskId : workerIndex; - ref var worker = ref context.BinSubtreesWorkers[effectiveWorkerIndex]; - context.WorkerHelpedWithBinning[effectiveWorkerIndex] = true; - if (context.TaskData.TaskCountFitsInWorkerCount) + //If there are more tasks than workers, then we need to preinitialize all the worker caches. 
+ for (int cacheIndex = 0; cacheIndex < activeWorkerCount; ++cacheIndex) { - for (int i = 0; i < context.BinCount; ++i) + ref var cache = ref taskContext.BinSubtreesWorkers[cacheIndex]; + for (int i = 0; i < binCount; ++i) { - ref var binBounds = ref worker.BinBoundingBoxes[i]; + ref var binBounds = ref cache.BinBoundingBoxes[i]; binBounds.Min = new Vector4(float.MaxValue); binBounds.Max = new Vector4(float.MinValue); - ref var binCentroidBounds = ref worker.BinCentroidBoundingBoxes[i]; + ref var binCentroidBounds = ref cache.BinCentroidBoundingBoxes[i]; binCentroidBounds.Min = new Vector4(float.MaxValue); binCentroidBounds.Max = new Vector4(float.MinValue); - worker.BinLeafCounts[i] = 0; + cache.BinLeafCounts[i] = 0; } } - context.TaskData.GetSlotInterval(taskId, out var start, out var count); - //We always write bin indices, because threading always has a bufferpool available to allocate bin indices from. - Debug.Assert(context.BinIndices.Allocated); - BinSubtrees(context.CentroidBoundsMin, context.UseX, context.UseY, context.PermuteMask, context.AxisIndex, context.OffsetToBinIndex, context.MaximumBinIndex, - context.Subtrees.Slice(start, count), worker.BinBoundingBoxes, worker.BinCentroidBoundingBoxes, worker.BinLeafCounts, context.BinIndices.Slice(start, count)); } - unsafe static void MultithreadedBinSubtrees(MultithreadBinnedBuildContext* context, - Vector4 centroidBoundsMin, bool useX, bool useY, Vector128 permuteMask, int axisIndex, Vector4 offsetToBinIndex, Vector4 maximumBinIndex, - Buffer subtrees, Buffer subtreeBinIndices, int binCount, in SharedTaskData taskData, int workerIndex, IThreadDispatcher dispatcher) + //We only want the inner multithreading to work on small, non-recursive jobs. + //Diving into a node at this point would stall the current node and favor more (and smaller) nodes. 
+ var tagValue = (uint)workerIndex | JobFilterTagHeader; + var jobFilter = new EqualTagFilter(tagValue); + context->TaskStack->For(&BinSubtreesWorker, &taskContext, 0, taskContext.TaskData.TaskCount, workerIndex, dispatcher, ref jobFilter, tagValue); + + //Unless the number of threads and bins is really huge, there's no value in attempting to multithread the final compression. + //(Parallel reduction is an option, but even then... I suspect the single threaded version will be faster. And it's way simpler.) + //Note that we have a separate merging target from the caches; that just makes resource management easier. + //We can dispose the worker stuff immediately after this merge. + //(Consider what happens in the case where the single threaded path is used: you need an allocation! would you allocate a bunch of multithreaded workers for it? + //That's not an irrelevant case, either. *Most* nodes will be too small to warrant internal multithreading.) + ref var cache0 = ref taskContext.BinSubtreesWorkers[0]; + cache0.BinBoundingBoxes.CopyTo(0, worker.BinBoundingBoxes, 0, cache0.BinBoundingBoxes.Length); + cache0.BinCentroidBoundingBoxes.CopyTo(0, worker.BinCentroidBoundingBoxes, 0, cache0.BinCentroidBoundingBoxes.Length); + cache0.BinLeafCounts.CopyTo(0, worker.BinLeafCounts, 0, cache0.BinLeafCounts.Length); + for (int cacheIndex = 1; cacheIndex < activeWorkerCount; ++cacheIndex) { - ref var worker = ref context->Workers[workerIndex]; - var workerPool = dispatcher.WorkerPools[workerIndex]; - var taskContext = new BinSubtreesTaskContext( - workerPool, taskData, subtrees, subtreeBinIndices, binCount, useX, useY, permuteMask, axisIndex, centroidBoundsMin, offsetToBinIndex, maximumBinIndex); - - //Don't bother initializing more slots than we have tasks. Note that this requires special handling on the task level; - //if we have less tasks than workers, then the task needs to distinguish that fact. 
- var activeWorkerCount = int.Min(context->Workers.Length, taskContext.TaskData.TaskCount); - if (!taskContext.TaskData.TaskCountFitsInWorkerCount) + //Only bother merging from workers that actually did anything. + if (taskContext.WorkerHelpedWithBinning[cacheIndex]) { - //If there are more tasks than workers, then we need to preinitialize all the worker caches. - for (int cacheIndex = 0; cacheIndex < activeWorkerCount; ++cacheIndex) + ref var cache = ref taskContext.BinSubtreesWorkers[cacheIndex]; + for (int binIndex = 0; binIndex < binCount; ++binIndex) { - ref var cache = ref taskContext.BinSubtreesWorkers[cacheIndex]; - for (int i = 0; i < binCount; ++i) - { - ref var binBounds = ref cache.BinBoundingBoxes[i]; - binBounds.Min = new Vector4(float.MaxValue); - binBounds.Max = new Vector4(float.MinValue); - ref var binCentroidBounds = ref cache.BinCentroidBoundingBoxes[i]; - binCentroidBounds.Min = new Vector4(float.MaxValue); - binCentroidBounds.Max = new Vector4(float.MinValue); - cache.BinLeafCounts[i] = 0; - } + ref var b0 = ref worker.BinBoundingBoxes[binIndex]; + ref var bi = ref cache.BinBoundingBoxes[binIndex]; + b0.Min = Vector4.Min(b0.Min, bi.Min); + b0.Max = Vector4.Max(b0.Max, bi.Max); + ref var bc0 = ref worker.BinCentroidBoundingBoxes[binIndex]; + ref var bci = ref cache.BinCentroidBoundingBoxes[binIndex]; + bc0.Min = Vector4.Min(bc0.Min, bci.Min); + bc0.Max = Vector4.Max(bc0.Max, bci.Max); + worker.BinLeafCounts[binIndex] += cache.BinLeafCounts[binIndex]; } } - - //We only want the inner multithreading to work on small, non-recursive jobs. - //Diving into a node at this point would stall the current node and favor more (and smaller) nodes. 
- var tagValue = (uint)workerIndex | JobFilterTagHeader; - var jobFilter = new EqualTagFilter(tagValue); - context->TaskStack->For(&BinSubtreesWorker, &taskContext, 0, taskContext.TaskData.TaskCount, workerIndex, dispatcher, ref jobFilter, tagValue); - - //Unless the number of threads and bins is really huge, there's no value in attempting to multithread the final compression. - //(Parallel reduction is an option, but even then... I suspect the single threaded version will be faster. And it's way simpler.) - //Note that we have a separate merging target from the caches; that just makes resource management easier. - //We can dispose the worker stuff immediately after this merge. - //(Consider what happens in the case where the single threaded path is used: you need an allocation! would you allocate a bunch of multithreaded workers for it? - //That's not an irrelevant case, either. *Most* nodes will be too small to warrant internal multithreading.) - ref var cache0 = ref taskContext.BinSubtreesWorkers[0]; - cache0.BinBoundingBoxes.CopyTo(0, worker.BinBoundingBoxes, 0, cache0.BinBoundingBoxes.Length); - cache0.BinCentroidBoundingBoxes.CopyTo(0, worker.BinCentroidBoundingBoxes, 0, cache0.BinCentroidBoundingBoxes.Length); - cache0.BinLeafCounts.CopyTo(0, worker.BinLeafCounts, 0, cache0.BinLeafCounts.Length); - for (int cacheIndex = 1; cacheIndex < activeWorkerCount; ++cacheIndex) - { - //Only bother merging from workers that actually did anything. 
- if (taskContext.WorkerHelpedWithBinning[cacheIndex]) - { - ref var cache = ref taskContext.BinSubtreesWorkers[cacheIndex]; - for (int binIndex = 0; binIndex < binCount; ++binIndex) - { - ref var b0 = ref worker.BinBoundingBoxes[binIndex]; - ref var bi = ref cache.BinBoundingBoxes[binIndex]; - b0.Min = Vector4.Min(b0.Min, bi.Min); - b0.Max = Vector4.Max(b0.Max, bi.Max); - ref var bc0 = ref worker.BinCentroidBoundingBoxes[binIndex]; - ref var bci = ref cache.BinCentroidBoundingBoxes[binIndex]; - bc0.Min = Vector4.Min(bc0.Min, bci.Min); - bc0.Max = Vector4.Max(bc0.Max, bci.Max); - worker.BinLeafCounts[binIndex] += cache.BinLeafCounts[binIndex]; - } - } - } - taskContext.Dispose(workerPool); } + taskContext.Dispose(workerPool); + } - [StructLayout(LayoutKind.Explicit, Size = 264)] - struct PartitionCounters - { - //Padding to avoid shared cache lines. - [FieldOffset(128)] - public int SubtreeCountA; - [FieldOffset(134)] - public int SubtreeCountB; - } + [StructLayout(LayoutKind.Explicit, Size = 264)] + struct PartitionCounters + { + //Padding to avoid shared cache lines. + [FieldOffset(128)] + public int SubtreeCountA; + [FieldOffset(134)] + public int SubtreeCountB; + } + + struct PartitionTaskContext + { + public SharedTaskData TaskData; - struct PartitionTaskContext + /// + /// Buffer containing all subtrees in this node. + /// + public Buffer Subtrees; + /// + /// Buffer that will contain the partitioned subtrees pulled from . + /// + public Buffer SubtreesNext; + /// + /// Buffer containing bin indices for all subtrees in the node (encoded with one byte per subtree). + /// + public Buffer BinIndices; + public int BinSplitIndex; + + public PartitionCounters Counters; + + public PartitionTaskContext(SharedTaskData taskData, Buffer subtrees, Buffer subtreesNext, Buffer binIndices, int binSplitIndex) { - public SharedTaskData TaskData; - - /// - /// Buffer containing all subtrees in this node. 
- /// - public Buffer Subtrees; - /// - /// Buffer that will contain the partitioned subtrees pulled from . - /// - public Buffer SubtreesNext; - /// - /// Buffer containing bin indices for all subtrees in the node (encoded with one byte per subtree). - /// - public Buffer BinIndices; - public int BinSplitIndex; - - public PartitionCounters Counters; - - public PartitionTaskContext(SharedTaskData taskData, Buffer subtrees, Buffer subtreesNext, Buffer binIndices, int binSplitIndex) - { - TaskData = taskData; - Subtrees = subtrees; - SubtreesNext = subtreesNext; - BinIndices = binIndices; - BinSplitIndex = binSplitIndex; + TaskData = taskData; + Subtrees = subtrees; + SubtreesNext = subtreesNext; + BinIndices = binIndices; + BinSplitIndex = binSplitIndex; - Counters = new PartitionCounters(); - } + Counters = new PartitionCounters(); } + } - unsafe static void PartitionSubtreesWorker(long taskId, void* untypedContext, int workerIndex, IThreadDispatcher dispatcher) + unsafe static void PartitionSubtreesWorker(long taskId, void* untypedContext, int workerIndex, IThreadDispatcher dispatcher) + { + ref var context = ref *(PartitionTaskContext*)untypedContext; + Buffer binIndices = context.BinIndices; + context.TaskData.GetSlotInterval(taskId, out var start, out var count); + //We don't really want to trigger interlocked operation for *every single subtree*, but we also don't want to allocate a bunch of memory. + //Compromise! Stackalloc enough memory to cover sub-batches of the worker's subtrees, and do interlocked operations at the end of each batch. + //Note that the main limit to the batch size is the amount of memory in cache. 
+ const int batchSize = 16384; + byte* slotBelongsToA = stackalloc byte[batchSize]; + + var batchCount = (count + batchSize - 1) / batchSize; + var boundingBoxes = context.Subtrees.As(); + var subtrees = context.Subtrees; + var subtreesNext = context.SubtreesNext; + var splitIndexBundle = new Vector((byte)context.BinSplitIndex); + for (int batchIndex = 0; batchIndex < batchCount; ++batchIndex) { - ref var context = ref *(PartitionTaskContext*)untypedContext; - Buffer binIndices = context.BinIndices; - context.TaskData.GetSlotInterval(taskId, out var start, out var count); - //We don't really want to trigger interlocked operation for *every single subtree*, but we also don't want to allocate a bunch of memory. - //Compromise! Stackalloc enough memory to cover sub-batches of the worker's subtrees, and do interlocked operations at the end of each batch. - //Note that the main limit to the batch size is the amount of memory in cache. - const int batchSize = 16384; - byte* slotBelongsToA = stackalloc byte[batchSize]; - - var batchCount = (count + batchSize - 1) / batchSize; - var boundingBoxes = context.Subtrees.As(); - var subtrees = context.Subtrees; - var subtreesNext = context.SubtreesNext; - var splitIndexBundle = new Vector((byte)context.BinSplitIndex); - for (int batchIndex = 0; batchIndex < batchCount; ++batchIndex) - { - var localCountA = 0; - var batchStart = start + batchIndex * batchSize; - var countInBatch = int.Min(start + count - batchStart, batchSize); + var localCountA = 0; + var batchStart = start + batchIndex * batchSize; + var countInBatch = int.Min(start + count - batchStart, batchSize); - int scalarLoopStartIndex; - if (Vector.IsSupported) - { - //Note that the original data is loaded as bytes, but we need wider storage to handle the counts- which could conceivably go up to batchSize. 
- Vector localCountABundle = Vector.Zero; - scalarLoopStartIndex = (countInBatch / Vector.Count) * Vector.Count; - for (int indexInBatch = 0; indexInBatch < scalarLoopStartIndex; indexInBatch += Vector.Count) - { - var subtreeIndex = indexInBatch + batchStart; - var binIndicesBundle = *(Vector*)(binIndices.Memory + subtreeIndex); - var belongsToABundle = Vector.LessThan(binIndicesBundle, splitIndexBundle); - *(Vector*)(slotBelongsToA + indexInBatch) = belongsToABundle; - var increment = Vector.BitwiseAnd(belongsToABundle, Vector.One); - Vector.Widen(increment, out var low, out var high); - localCountABundle += low + high; - } - localCountA = Vector.Sum(localCountABundle); - } - else - scalarLoopStartIndex = 0; - for (int indexInBatch = scalarLoopStartIndex; indexInBatch < countInBatch; ++indexInBatch) + int scalarLoopStartIndex; + if (Vector.IsSupported) + { + //Note that the original data is loaded as bytes, but we need wider storage to handle the counts- which could conceivably go up to batchSize. + Vector localCountABundle = Vector.Zero; + scalarLoopStartIndex = (countInBatch / Vector.Count) * Vector.Count; + for (int indexInBatch = 0; indexInBatch < scalarLoopStartIndex; indexInBatch += Vector.Count) { var subtreeIndex = indexInBatch + batchStart; - var binIndex = binIndices[subtreeIndex]; - var belongsToA = binIndex < context.BinSplitIndex; - slotBelongsToA[indexInBatch] = belongsToA ? 
(byte)0xFF : (byte)0; - if (belongsToA) ++localCountA; + var binIndicesBundle = *(Vector*)(binIndices.Memory + subtreeIndex); + var belongsToABundle = Vector.LessThan(binIndicesBundle, splitIndexBundle); + *(Vector*)(slotBelongsToA + indexInBatch) = belongsToABundle; + var increment = Vector.BitwiseAnd(belongsToABundle, Vector.One); + Vector.Widen(increment, out var low, out var high); + localCountABundle += low + high; } + localCountA = Vector.Sum(localCountABundle); + } + else + scalarLoopStartIndex = 0; + for (int indexInBatch = scalarLoopStartIndex; indexInBatch < countInBatch; ++indexInBatch) + { + var subtreeIndex = indexInBatch + batchStart; + var binIndex = binIndices[subtreeIndex]; + var belongsToA = binIndex < context.BinSplitIndex; + slotBelongsToA[indexInBatch] = belongsToA ? (byte)0xFF : (byte)0; + if (belongsToA) ++localCountA; + } - var localCountB = countInBatch - localCountA; - var startIndexA = Interlocked.Add(ref context.Counters.SubtreeCountA, localCountA) - localCountA; - var startIndexB = subtrees.Length - Interlocked.Add(ref context.Counters.SubtreeCountB, localCountB); - - int recountA = 0; - int recountB = 0; - for (int indexInBatch = 0; indexInBatch < countInBatch; ++indexInBatch) - { - var targetIndex = slotBelongsToA[indexInBatch] != 0 ? startIndexA + recountA++ : startIndexB + recountB++; - subtreesNext[targetIndex] = subtrees[batchStart + indexInBatch]; - } + var localCountB = countInBatch - localCountA; + var startIndexA = Interlocked.Add(ref context.Counters.SubtreeCountA, localCountA) - localCountA; + var startIndexB = subtrees.Length - Interlocked.Add(ref context.Counters.SubtreeCountB, localCountB); + int recountA = 0; + int recountB = 0; + for (int indexInBatch = 0; indexInBatch < countInBatch; ++indexInBatch) + { + var targetIndex = slotBelongsToA[indexInBatch] != 0 ? 
startIndexA + recountA++ : startIndexB + recountB++; + subtreesNext[targetIndex] = subtrees[batchStart + indexInBatch]; } - } - unsafe static (int subtreeCountA, int subtreeCountB) MultithreadedPartition(MultithreadBinnedBuildContext* context, - Buffer subtrees, Buffer subtreesNext, Buffer binIndices, int binSplitIndex, in SharedTaskData taskData, int workerIndex, IThreadDispatcher dispatcher) - { - ref var worker = ref context->Workers[workerIndex]; - var workerPool = dispatcher.WorkerPools[workerIndex]; - var taskContext = new PartitionTaskContext(taskData, subtrees, subtreesNext, binIndices, binSplitIndex); - //We only want the inner multithreading to work on small, non-recursive jobs. - //Diving into a node at this point would stall the current node and favor more (and smaller) nodes. - var tagValue = (uint)workerIndex | JobFilterTagHeader; - var jobFilter = new EqualTagFilter(tagValue); - context->TaskStack->For(&PartitionSubtreesWorker, &taskContext, 0, taskContext.TaskData.TaskCount, workerIndex, dispatcher, ref jobFilter, tagValue); - return (taskContext.Counters.SubtreeCountA, taskContext.Counters.SubtreeCountB); } + } + + unsafe static (int subtreeCountA, int subtreeCountB) MultithreadedPartition(MultithreadBinnedBuildContext* context, + Buffer subtrees, Buffer subtreesNext, Buffer binIndices, int binSplitIndex, in SharedTaskData taskData, int workerIndex, IThreadDispatcher dispatcher) + { + ref var worker = ref context->Workers[workerIndex]; + var workerPool = dispatcher.WorkerPools[workerIndex]; + var taskContext = new PartitionTaskContext(taskData, subtrees, subtreesNext, binIndices, binSplitIndex); + //We only want the inner multithreading to work on small, non-recursive jobs. + //Diving into a node at this point would stall the current node and favor more (and smaller) nodes. 
+ var tagValue = (uint)workerIndex | JobFilterTagHeader; + var jobFilter = new EqualTagFilter(tagValue); + context->TaskStack->For(&PartitionSubtreesWorker, &taskContext, 0, taskContext.TaskData.TaskCount, workerIndex, dispatcher, ref jobFilter, tagValue); + return (taskContext.Counters.SubtreeCountA, taskContext.Counters.SubtreeCountB); + } - unsafe struct NodePushTaskContext - where TLeaves : unmanaged where TThreading : unmanaged, IBinnedBuilderThreading + unsafe struct NodePushTaskContext + where TLeaves : unmanaged where TThreading : unmanaged, IBinnedBuilderThreading + { + public Context* Context; + public int NodeIndex; + public int ParentNodeIndex; + public BoundingBox4 CentroidBounds; + //Subtree region start index, subtree count, and usePongBuffer status are all encoded into the task id. + } + unsafe static void BinnedBuilderNodeWorker(long taskId, void* context, int workerIndex, IThreadDispatcher dispatcher) + where TLeaves : unmanaged where TThreading : unmanaged, IBinnedBuilderThreading + { + var subtreeRegionStartIndex = (int)taskId; + var subtreeCount = (int)((taskId >> 32) & 0x7FFF_FFFF); + var usePongBuffer = (ulong)taskId >= (1UL << 63); + var nodePushContext = (NodePushTaskContext*)context; + //Note that child index is always 1 because we only ever push child B. + BinnedBuildNode(usePongBuffer, subtreeRegionStartIndex, nodePushContext->NodeIndex, subtreeCount, nodePushContext->ParentNodeIndex, 1, nodePushContext->CentroidBounds, nodePushContext->Context, workerIndex, dispatcher); + } + + private static unsafe void BuildNodeForDegeneracy( + Buffer subtrees, Buffer nodes, Buffer metanodes, int nodeIndex, int parentNodeIndex, int childIndexInParent, + Context* context, out int subtreeCountA, out int subtreeCountB, out int aIndex, out int bIndex) + where TLeaves : unmanaged + where TThreading : unmanaged, IBinnedBuilderThreading + { + //This shouldn't happen unless something is badly wrong with the input; no point in optimizing it. 
+ subtreeCountA = subtrees.Length / 2; + subtreeCountB = subtrees.Length - subtreeCountA; + BoundingBox4 boundsA, boundsB; + boundsA.Min = new Vector4(float.MaxValue); + boundsA.Max = new Vector4(float.MinValue); + boundsB.Min = new Vector4(float.MaxValue); + boundsB.Max = new Vector4(float.MinValue); + int leafCountA = 0, leafCountB = 0; + var boundingBoxes = subtrees.As(); + for (int i = 0; i < subtreeCountA; ++i) { - public Context* Context; - public int NodeIndex; - public int ParentNodeIndex; - public BoundingBox4 CentroidBounds; - //Subtree region start index, subtree count, and usePongBuffer status are all encoded into the task id. + ref var bounds = ref boundingBoxes[i]; + boundsA.Min = Vector4.Min(bounds.Min, boundsA.Min); + boundsA.Max = Vector4.Max(bounds.Max, boundsA.Max); + leafCountA += subtrees[i].LeafCount; } - unsafe static void BinnedBuilderNodeWorker(long taskId, void* context, int workerIndex, IThreadDispatcher dispatcher) - where TLeaves : unmanaged where TThreading : unmanaged, IBinnedBuilderThreading + for (int i = subtreeCountA; i < subtrees.Length; ++i) { - var subtreeRegionStartIndex = (int)taskId; - var subtreeCount = (int)((taskId >> 32) & 0x7FFF_FFFF); - var usePongBuffer = (ulong)taskId >= (1UL << 63); - var nodePushContext = (NodePushTaskContext*)context; - //Note that child index is always 1 because we only ever push child B. - BinnedBuildNode(usePongBuffer, subtreeRegionStartIndex, nodePushContext->NodeIndex, subtreeCount, nodePushContext->ParentNodeIndex, 1, nodePushContext->CentroidBounds, nodePushContext->Context, workerIndex, dispatcher); + ref var bounds = ref boundingBoxes[i]; + boundsB.Min = Vector4.Min(bounds.Min, boundsB.Min); + boundsB.Max = Vector4.Max(bounds.Max, boundsB.Max); + leafCountB += subtrees[i].LeafCount; } + Debug.Assert(parentNodeIndex < 0 || Unsafe.Add(ref context->Nodes[parentNodeIndex].A, childIndexInParent).LeafCount == leafCountA + leafCountB); + //Note that we just use the bounds as centroid bounds. 
This is a degenerate situation anyway. + BuildNode(boundsA, boundsB, leafCountA, leafCountB, subtrees, nodes, metanodes, nodeIndex, parentNodeIndex, childIndexInParent, subtreeCountA, subtreeCountB, ref context->Leaves, out aIndex, out bIndex); + } - private static unsafe void BuildNodeForDegeneracy( - Buffer subtrees, Buffer nodes, Buffer metanodes, int nodeIndex, int parentNodeIndex, int childIndexInParent, - Context* context, out int subtreeCountA, out int subtreeCountB, out int aIndex, out int bIndex) - where TLeaves : unmanaged - where TThreading : unmanaged, IBinnedBuilderThreading - { - //This shouldn't happen unless something is badly wrong with the input; no point in optimizing it. - subtreeCountA = subtrees.Length / 2; - subtreeCountB = subtrees.Length - subtreeCountA; - BoundingBox4 boundsA, boundsB; - boundsA.Min = new Vector4(float.MaxValue); - boundsA.Max = new Vector4(float.MinValue); - boundsB.Min = new Vector4(float.MaxValue); - boundsB.Max = new Vector4(float.MinValue); - int leafCountA = 0, leafCountB = 0; - var boundingBoxes = subtrees.As(); - for (int i = 0; i < subtreeCountA; ++i) - { - ref var bounds = ref boundingBoxes[i]; - boundsA.Min = Vector4.Min(bounds.Min, boundsA.Min); - boundsA.Max = Vector4.Max(bounds.Max, boundsA.Max); - leafCountA += subtrees[i].LeafCount; - } - for (int i = subtreeCountA; i < subtrees.Length; ++i) - { - ref var bounds = ref boundingBoxes[i]; - boundsB.Min = Vector4.Min(bounds.Min, boundsB.Min); - boundsB.Max = Vector4.Max(bounds.Max, boundsB.Max); - leafCountB += subtrees[i].LeafCount; - } - Debug.Assert(parentNodeIndex < 0 || Unsafe.Add(ref context->Nodes[parentNodeIndex].A, childIndexInParent).LeafCount == leafCountA + leafCountB); - //Note that we just use the bounds as centroid bounds. This is a degenerate situation anyway. 
- BuildNode(boundsA, boundsB, leafCountA, leafCountB, subtrees, nodes, metanodes, nodeIndex, parentNodeIndex, childIndexInParent, subtreeCountA, subtreeCountB, ref context->Leaves, out aIndex, out bIndex); - } + // Note that degenerate nodes are those which are assumed to have zero-sized centroid bound spans. + // There are other possibilities--NaNs, infinities--which also flow into this, but the usual case is overlapping geometry. + // We pass this along recursively to trigger the degeneracy case at each level. + static unsafe void HandleDegeneracy(Buffer subtrees, Buffer boundingBoxes, Buffer nodes, Buffer metanodes, + bool usePongBuffer, int subtreeRegionStartIndex, int nodeIndex, int subtreeCount, int parentNodeIndex, int childIndexInParent, + BoundingBox4 centroidBounds, Context* context, int workerIndex, IThreadDispatcher dispatcher) + where TLeaves : unmanaged where TThreading : unmanaged, IBinnedBuilderThreading + { - // Note that degenerate nodes are those which are assumed to have zero-sized centroid bound spans. - // There are other possibilities--NaNs, infinities--which also flow into this, but the usual case is overlapping geometry. - // We pass this along recursively to trigger the degeneracy case at each level. 
- static unsafe void HandleDegeneracy(Buffer subtrees, Buffer boundingBoxes, Buffer nodes, Buffer metanodes, - bool usePongBuffer, int subtreeRegionStartIndex, int nodeIndex, int subtreeCount, int parentNodeIndex, int childIndexInParent, - BoundingBox4 centroidBounds, Context* context, int workerIndex, IThreadDispatcher dispatcher) - where TLeaves : unmanaged where TThreading : unmanaged, IBinnedBuilderThreading - { + BuildNodeForDegeneracy(subtrees, nodes, metanodes, nodeIndex, parentNodeIndex, childIndexInParent, context, out var subtreeCountA, out var subtreeCountB, out var aIndex, out var bIndex); + if (subtreeCountA > 1) + BinnedBuildNode(usePongBuffer, subtreeRegionStartIndex, aIndex, subtreeCountA, nodeIndex, 0, centroidBounds, context, workerIndex, dispatcher); + if (subtreeCountB > 1) + BinnedBuildNode(usePongBuffer, subtreeRegionStartIndex + subtreeCountA, bIndex, subtreeCountB, nodeIndex, 1, centroidBounds, context, workerIndex, dispatcher); + } - BuildNodeForDegeneracy(subtrees, nodes, metanodes, nodeIndex, parentNodeIndex, childIndexInParent, context, out var subtreeCountA, out var subtreeCountB, out var aIndex, out var bIndex); - if (subtreeCountA > 1) - BinnedBuildNode(usePongBuffer, subtreeRegionStartIndex, aIndex, subtreeCountA, nodeIndex, 0, centroidBounds, context, workerIndex, dispatcher); - if (subtreeCountB > 1) - BinnedBuildNode(usePongBuffer, subtreeRegionStartIndex + subtreeCountA, bIndex, subtreeCountB, nodeIndex, 1, centroidBounds, context, workerIndex, dispatcher); - } + + static unsafe void HandleMicrosweepDegeneracy(ref TLeaves leaves, + Buffer subtrees, Buffer nodes, Buffer metanodes, int nodeIndex, int parentNodeIndex, int childIndexInParent, Vector4 centroidMin, Vector4 centroidMax, Context* context, int workerIndex) + where TLeaves : unmanaged where TThreading : unmanaged, IBinnedBuilderThreading + { + BuildNodeForDegeneracy(subtrees, nodes, metanodes, nodeIndex, parentNodeIndex, childIndexInParent, context, out var subtreeCountA, 
out var subtreeCountB, out var aIndex, out var bIndex); + if (subtreeCountA > 1) + MicroSweepForBinnedBuilder(centroidMin, centroidMax, ref leaves, subtrees.Slice(subtreeCountA), nodes.Slice(1, subtreeCountA - 1), metanodes.Allocated ? metanodes.Slice(1, subtreeCountA - 1) : metanodes, aIndex, nodeIndex, 0, context, workerIndex); + if (subtreeCountB > 1) + MicroSweepForBinnedBuilder(centroidMin, centroidMax, ref leaves, subtrees.Slice(subtreeCountA, subtreeCountB), nodes.Slice(subtreeCountA, subtreeCountB - 1), metanodes.Allocated ? metanodes.Slice(subtreeCountA, subtreeCountB - 1) : metanodes, bIndex, nodeIndex, 1, context, workerIndex); + } - static unsafe void HandleMicrosweepDegeneracy(ref TLeaves leaves, - Buffer subtrees, Buffer nodes, Buffer metanodes, int nodeIndex, int parentNodeIndex, int childIndexInParent, Vector4 centroidMin, Vector4 centroidMax, Context* context, int workerIndex) - where TLeaves : unmanaged where TThreading : unmanaged, IBinnedBuilderThreading + static unsafe void BinnedBuildNode( + bool usePongBuffer, int subtreeRegionStartIndex, int nodeIndex, int subtreeCount, int parentNodeIndex, int childIndexInParent, + BoundingBox4 centroidBounds, Context* context, int workerIndex, IThreadDispatcher dispatcher) + where TLeaves : unmanaged where TThreading : unmanaged, IBinnedBuilderThreading + { + var subtrees = (usePongBuffer ? context->SubtreesPong : context->SubtreesPing).Slice(subtreeRegionStartIndex, subtreeCount); + var subtreeBinIndices = context->BinIndices.Allocated ? context->BinIndices.Slice(subtreeRegionStartIndex, subtreeCount) : default; + //leaf counts, indices, and bounds are packed together, but it's useful to have a bounds-only representation so that the merging processes don't have to worry about dealing with the fourth lanes. + var boundingBoxes = subtrees.As(); + var nodeCount = subtreeCount - 1; + var nodes = context->Nodes.Slice(nodeIndex, nodeCount); + var metanodes = context->Metanodes.Allocated ? 
context->Metanodes.Slice(nodeIndex, nodeCount) : context->Metanodes; + if (subtreeCount == 2) { - BuildNodeForDegeneracy(subtrees, nodes, metanodes, nodeIndex, parentNodeIndex, childIndexInParent, context, out var subtreeCountA, out var subtreeCountB, out var aIndex, out var bIndex); - if (subtreeCountA > 1) - MicroSweepForBinnedBuilder(centroidMin, centroidMax, ref leaves, subtrees.Slice(subtreeCountA), nodes.Slice(1, subtreeCountA - 1), metanodes.Allocated ? metanodes.Slice(1, subtreeCountA - 1) : metanodes, aIndex, nodeIndex, 0, context, workerIndex); - if (subtreeCountB > 1) - MicroSweepForBinnedBuilder(centroidMin, centroidMax, ref leaves, subtrees.Slice(subtreeCountA, subtreeCountB), nodes.Slice(subtreeCountA, subtreeCountB - 1), metanodes.Allocated ? metanodes.Slice(subtreeCountA, subtreeCountB - 1) : metanodes, bIndex, nodeIndex, 1, context, workerIndex); + Debug.Assert(parentNodeIndex < 0 || Unsafe.Add(ref context->Nodes[parentNodeIndex].A, childIndexInParent).LeafCount == subtrees[0].LeafCount + subtrees[1].LeafCount); + BuildNode(boundingBoxes[0], boundingBoxes[1], subtrees[0].LeafCount, subtrees[1].LeafCount, subtrees, nodes, metanodes, nodeIndex, parentNodeIndex, childIndexInParent, 1, 1, ref context->Leaves, out _, out _); + return; } - - - static unsafe void BinnedBuildNode( - bool usePongBuffer, int subtreeRegionStartIndex, int nodeIndex, int subtreeCount, int parentNodeIndex, int childIndexInParent, - BoundingBox4 centroidBounds, Context* context, int workerIndex, IThreadDispatcher dispatcher) - where TLeaves : unmanaged where TThreading : unmanaged, IBinnedBuilderThreading + var targetTaskCount = typeof(TThreading) == typeof(SingleThreaded) ? 1 : + ((MultithreadBinnedBuildContext*)Unsafe.AsPointer(ref context->Threading))->GetTargetTaskCountForInnerLoop(subtreeCount); + if (nodeIndex == 0) { - var subtrees = (usePongBuffer ? 
context->SubtreesPong : context->SubtreesPing).Slice(subtreeRegionStartIndex, subtreeCount); - var subtreeBinIndices = context->BinIndices.Allocated ? context->BinIndices.Slice(subtreeRegionStartIndex, subtreeCount) : default; - //leaf counts, indices, and bounds are packed together, but it's useful to have a bounds-only representation so that the merging processes don't have to worry about dealing with the fourth lanes. - var boundingBoxes = subtrees.As(); - var nodeCount = subtreeCount - 1; - var nodes = context->Nodes.Slice(nodeIndex, nodeCount); - var metanodes = context->Metanodes.Allocated ? context->Metanodes.Slice(nodeIndex, nodeCount) : context->Metanodes; - if (subtreeCount == 2) - { - Debug.Assert(parentNodeIndex < 0 || Unsafe.Add(ref context->Nodes[parentNodeIndex].A, childIndexInParent).LeafCount == subtrees[0].LeafCount + subtrees[1].LeafCount); - BuildNode(boundingBoxes[0], boundingBoxes[1], subtrees[0].LeafCount, subtrees[1].LeafCount, subtrees, nodes, metanodes, nodeIndex, parentNodeIndex, childIndexInParent, 1, 1, ref context->Leaves, out _, out _); - return; - } - var targetTaskCount = typeof(TThreading) == typeof(SingleThreaded) ? 1 : - ((MultithreadBinnedBuildContext*)Unsafe.AsPointer(ref context->Threading))->GetTargetTaskCountForInnerLoop(subtreeCount); - if (nodeIndex == 0) + //The first node doesn't have a parent, and so isn't given centroid bounds. We have to compute them. + var useST = true; + if (typeof(TThreading) != typeof(SingleThreaded)) { - //The first node doesn't have a parent, and so isn't given centroid bounds. We have to compute them. 
- var useST = true; - if (typeof(TThreading) != typeof(SingleThreaded)) - { - var mtContext = (MultithreadBinnedBuildContext*)Unsafe.AsPointer(ref context->Threading); - var taskData = new SharedTaskData(mtContext->Workers.Length, 0, subtrees.Length, MinimumSubtreesPerThreadForCentroidPrepass, mtContext->GetTargetTaskCountForInnerLoop(subtreeCount)); - if (taskData.TaskCount > 1) - { - centroidBounds = MultithreadedCentroidPrepass( - mtContext, boundingBoxes, taskData, workerIndex, dispatcher); - useST = false; - } - } - if (useST) + var mtContext = (MultithreadBinnedBuildContext*)Unsafe.AsPointer(ref context->Threading); + var taskData = new SharedTaskData(mtContext->Workers.Length, 0, subtrees.Length, MinimumSubtreesPerThreadForCentroidPrepass, mtContext->GetTargetTaskCountForInnerLoop(subtreeCount)); + if (taskData.TaskCount > 1) { - centroidBounds = ComputeCentroidBounds(boundingBoxes); + centroidBounds = MultithreadedCentroidPrepass( + mtContext, boundingBoxes, taskData, workerIndex, dispatcher); + useST = false; } } - var centroidSpan = centroidBounds.Max - centroidBounds.Min; - var axisIsDegenerate = Vector128.LessThanOrEqual(centroidSpan.AsVector128(), Vector128.Create(1e-12f)); - if ((Vector128.ExtractMostSignificantBits(axisIsDegenerate) & 0b111) == 0b111) + if (useST) { - //This node is completely degenerate; there is no 'good' ordering of the children. Pick a split in the middle and shrug. - //This shouldn't happen unless something is badly wrong with the input; no point in optimizing it. 
- HandleDegeneracy(subtrees, boundingBoxes, nodes, metanodes, usePongBuffer, subtreeRegionStartIndex, nodeIndex, subtreeCount, parentNodeIndex, childIndexInParent, centroidBounds, context, workerIndex, dispatcher); - return; + centroidBounds = ComputeCentroidBounds(boundingBoxes); } + } + var centroidSpan = centroidBounds.Max - centroidBounds.Min; + var axisIsDegenerate = Vector128.LessThanOrEqual(centroidSpan.AsVector128(), Vector128.Create(1e-12f)); + if ((Vector128.ExtractMostSignificantBits(axisIsDegenerate) & 0b111) == 0b111) + { + //This node is completely degenerate; there is no 'good' ordering of the children. Pick a split in the middle and shrug. + //This shouldn't happen unless something is badly wrong with the input; no point in optimizing it. + HandleDegeneracy(subtrees, boundingBoxes, nodes, metanodes, usePongBuffer, subtreeRegionStartIndex, nodeIndex, subtreeCount, parentNodeIndex, childIndexInParent, centroidBounds, context, workerIndex, dispatcher); + return; + } - //Note that we don't bother even trying to internally multithread microsweeps. They *should* be small, and should only show up deeper in the recursion process. - if (subtreeCount <= context->MicrosweepThreshold) - { - MicroSweepForBinnedBuilder(centroidBounds.Min, centroidBounds.Max, ref context->Leaves, subtrees, nodes, metanodes, nodeIndex, parentNodeIndex, childIndexInParent, context, workerIndex); - return; - } + //Note that we don't bother even trying to internally multithread microsweeps. They *should* be small, and should only show up deeper in the recursion process. 
+ if (subtreeCount <= context->MicrosweepThreshold) + { + MicroSweepForBinnedBuilder(centroidBounds.Min, centroidBounds.Max, ref context->Leaves, subtrees, nodes, metanodes, nodeIndex, parentNodeIndex, childIndexInParent, context, workerIndex); + return; + } - var useX = centroidSpan.X > centroidSpan.Y && centroidSpan.X > centroidSpan.Z; - var useY = centroidSpan.Y > centroidSpan.Z; - //These will be used conditionally based on what hardware acceleration is available. Pretty minor detail. - var permuteMask = Vector128.Create(useX ? 0 : useY ? 1 : 2, 0, 0, 0); - var axisIndex = useX ? 0 : useY ? 1 : 2; + var useX = centroidSpan.X > centroidSpan.Y && centroidSpan.X > centroidSpan.Z; + var useY = centroidSpan.Y > centroidSpan.Z; + //These will be used conditionally based on what hardware acceleration is available. Pretty minor detail. + var permuteMask = Vector128.Create(useX ? 0 : useY ? 1 : 2, 0, 0, 0); + var axisIndex = useX ? 0 : useY ? 1 : 2; - var binCount = int.Min(context->MaximumBinCount, int.Max((int)(subtreeCount * context->LeafToBinMultiplier), context->MinimumBinCount)); + var binCount = int.Min(context->MaximumBinCount, int.Max((int)(subtreeCount * context->LeafToBinMultiplier), context->MinimumBinCount)); - var offsetToBinIndex = new Vector4(binCount) / centroidSpan; - //Avoid letting NaNs into the offsetToBinIndex scale. - offsetToBinIndex = Vector128.ConditionalSelect(axisIsDegenerate, Vector128.Zero, offsetToBinIndex.AsVector128()).AsVector4(); + var offsetToBinIndex = new Vector4(binCount) / centroidSpan; + //Avoid letting NaNs into the offsetToBinIndex scale. 
+ offsetToBinIndex = Vector128.ConditionalSelect(axisIsDegenerate, Vector128.Zero, offsetToBinIndex.AsVector128()).AsVector4(); - var maximumBinIndex = new Vector4(binCount - 1); - context->Threading.GetBins(workerIndex, out var binBoundingBoxes, out var binCentroidBoundingBoxes, out var binBoundingBoxesScan, out var binCentroidBoundingBoxesScan, out var binLeafCounts); - Debug.Assert(binBoundingBoxes.Length >= binCount); - for (int i = 0; i < binCount; ++i) + var maximumBinIndex = new Vector4(binCount - 1); + context->Threading.GetBins(workerIndex, out var binBoundingBoxes, out var binCentroidBoundingBoxes, out var binBoundingBoxesScan, out var binCentroidBoundingBoxesScan, out var binLeafCounts); + Debug.Assert(binBoundingBoxes.Length >= binCount); + for (int i = 0; i < binCount; ++i) + { + ref var binBounds = ref binBoundingBoxes[i]; + binBounds.Min = new Vector4(float.MaxValue); + binBounds.Max = new Vector4(float.MinValue); + ref var binCentroidBounds = ref binCentroidBoundingBoxes[i]; + binCentroidBounds.Min = new Vector4(float.MaxValue); + binCentroidBounds.Max = new Vector4(float.MinValue); + binLeafCounts[i] = 0; + } + var useSTForBinning = true; + if (typeof(TThreading) != typeof(SingleThreaded)) + { + var mtContext = (MultithreadBinnedBuildContext*)Unsafe.AsPointer(ref context->Threading); + var taskData = new SharedTaskData(mtContext->Workers.Length, 0, subtrees.Length, MinimumSubtreesPerThreadForBinning, mtContext->GetTargetTaskCountForInnerLoop(subtreeCount)); + if (taskData.TaskCount > 1) { - ref var binBounds = ref binBoundingBoxes[i]; - binBounds.Min = new Vector4(float.MaxValue); - binBounds.Max = new Vector4(float.MinValue); - ref var binCentroidBounds = ref binCentroidBoundingBoxes[i]; - binCentroidBounds.Min = new Vector4(float.MaxValue); - binCentroidBounds.Max = new Vector4(float.MinValue); - binLeafCounts[i] = 0; + MultithreadedBinSubtrees( + (MultithreadBinnedBuildContext*)Unsafe.AsPointer(ref context->Threading), + centroidBounds.Min, 
useX, useY, permuteMask, axisIndex, offsetToBinIndex, maximumBinIndex, subtrees, subtreeBinIndices, binCount, taskData, workerIndex, dispatcher); + useSTForBinning = false; } - var useSTForBinning = true; - if (typeof(TThreading) != typeof(SingleThreaded)) + } + if (useSTForBinning) + { + //If the subtree bin indices buffer isn't available, then the binning process can't write to them! That'll happen if: + //single threaded execution, + //no bufferpool provided, + //tree size too large for stack allocation. + if (subtreeBinIndices.Allocated) + BinSubtrees(centroidBounds.Min, useX, useY, permuteMask, axisIndex, offsetToBinIndex, maximumBinIndex, subtrees, binBoundingBoxes, binCentroidBoundingBoxes, binLeafCounts, subtreeBinIndices); + else + BinSubtrees(centroidBounds.Min, useX, useY, permuteMask, axisIndex, offsetToBinIndex, maximumBinIndex, subtrees, binBoundingBoxes, binCentroidBoundingBoxes, binLeafCounts, subtreeBinIndices); + } + //Identify the split index by examining the SAH of very split option. + //Premerge from left to right so we have a sorta-summed area table to cheaply look up all possible child A bounds as we scan. 
+ binBoundingBoxesScan[0] = binBoundingBoxes[0]; + binCentroidBoundingBoxesScan[0] = binCentroidBoundingBoxes[0]; + int totalLeafCount = binLeafCounts[0]; + for (int i = 1; i < binCount; ++i) + { + var previousIndex = i - 1; + ref var bounds = ref binBoundingBoxes[i]; + ref var scanBounds = ref binBoundingBoxesScan[i]; + ref var previousScanBounds = ref binBoundingBoxesScan[previousIndex]; + scanBounds.Min = Vector4.Min(bounds.Min, previousScanBounds.Min); + scanBounds.Max = Vector4.Max(bounds.Max, previousScanBounds.Max); + ref var binCentroidBoundingBox = ref binCentroidBoundingBoxes[i]; + ref var binCentroidBoundingBoxScan = ref binCentroidBoundingBoxesScan[i]; + ref var previousCentroidBoundingBoxScan = ref binCentroidBoundingBoxesScan[previousIndex]; + binCentroidBoundingBoxScan.Min = Vector4.Min(binCentroidBoundingBox.Min, previousCentroidBoundingBoxScan.Min); + binCentroidBoundingBoxScan.Max = Vector4.Max(binCentroidBoundingBox.Max, previousCentroidBoundingBoxScan.Max); + totalLeafCount += binLeafCounts[i]; + } + var leftBoundsX = binBoundingBoxes[0]; + Debug.Assert( + leftBoundsX.Min.X > float.MinValue && leftBoundsX.Min.Y > float.MinValue && leftBoundsX.Min.Z > float.MinValue, + "Bin 0 should have been updated in all cases because it is aligned with the minimum bin, and the centroid span isn't degenerate."); + + float bestSAH = float.MaxValue; + int splitIndex = 1; + //The split index is going to end up in child B. 
+ var lastBinIndex = binCount - 1; + var accumulatedBoundingBoxB = binBoundingBoxes[lastBinIndex]; + var accumulatedCentroidBoundingBoxB = binCentroidBoundingBoxes[lastBinIndex]; + BoundingBox4 bestBoundingBoxB, bestCentroidBoundingBoxB; + bestBoundingBoxB = accumulatedBoundingBoxB; + bestCentroidBoundingBoxB = accumulatedCentroidBoundingBoxB; + int accumulatedLeafCountB = binLeafCounts[lastBinIndex]; + int bestLeafCountB = 0; + for (int splitIndexCandidate = lastBinIndex; splitIndexCandidate >= 1; --splitIndexCandidate) + { + var previousIndex = splitIndexCandidate - 1; + var sahCandidate = ComputeBoundsMetric(binBoundingBoxesScan[previousIndex]) * (totalLeafCount - accumulatedLeafCountB) + ComputeBoundsMetric(accumulatedBoundingBoxB) * accumulatedLeafCountB; + if (sahCandidate < bestSAH) + { + bestSAH = sahCandidate; + splitIndex = splitIndexCandidate; + bestBoundingBoxB = accumulatedBoundingBoxB; + bestLeafCountB = accumulatedLeafCountB; + bestCentroidBoundingBoxB = accumulatedCentroidBoundingBoxB; + } + ref var bounds = ref binBoundingBoxes[previousIndex]; + accumulatedBoundingBoxB.Min = Vector4.Min(bounds.Min, accumulatedBoundingBoxB.Min); + accumulatedBoundingBoxB.Max = Vector4.Max(bounds.Max, accumulatedBoundingBoxB.Max); + ref var centroidBoundsForBin = ref binCentroidBoundingBoxes[previousIndex]; + accumulatedCentroidBoundingBoxB.Min = Vector4.Min(centroidBoundsForBin.Min, accumulatedCentroidBoundingBoxB.Min); + accumulatedCentroidBoundingBoxB.Max = Vector4.Max(centroidBoundsForBin.Max, accumulatedCentroidBoundingBoxB.Max); + accumulatedLeafCountB += binLeafCounts[previousIndex]; + } + if (bestLeafCountB == 0 || bestLeafCountB == totalLeafCount || bestSAH == float.MaxValue || float.IsNaN(bestSAH) || float.IsInfinity(bestSAH)) + { + //Some form of major problem detected! Fall back to a degenerate split. 
+ HandleDegeneracy(subtrees, boundingBoxes, nodes, metanodes, usePongBuffer, subtreeRegionStartIndex, nodeIndex, subtreeCount, parentNodeIndex, childIndexInParent, centroidBounds, context, workerIndex, dispatcher); + return; + } + + var subtreeCountB = 0; + var subtreeCountA = 0; + var bestBoundingBoxA = binBoundingBoxesScan[splitIndex - 1]; + var bestCentroidBoundingBoxA = binCentroidBoundingBoxesScan[splitIndex - 1]; + + //Split the indices/bounds into two halves for the children to operate on. + if (context->SubtreesPong.Allocated) + { + Debug.Assert(subtreeBinIndices.Allocated); + //If the current buffer is pong, then write to ping, and vice versa. + var subtreesNext = (usePongBuffer ? context->SubtreesPing : context->SubtreesPong).Slice(subtreeRegionStartIndex, subtreeCount); + + var useSTForPartitioning = true; + //TODO: Note that the current multithreaded partitioning implementation is nondeterministic. + //Because of microsweeps/terminal node ordering, this can result in nondeterministic tree topology. + //See https://github.com/bepu/bepuphysics2/issues/276 for more information (and how to improve this in the future if valuable). + //For now, if the user wants determinism, we just use the single threaded path for partitioning. 
+ if (typeof(TThreading) != typeof(SingleThreaded) && !context->Deterministic) { var mtContext = (MultithreadBinnedBuildContext*)Unsafe.AsPointer(ref context->Threading); - var taskData = new SharedTaskData(mtContext->Workers.Length, 0, subtrees.Length, MinimumSubtreesPerThreadForBinning, mtContext->GetTargetTaskCountForInnerLoop(subtreeCount)); + var taskData = new SharedTaskData(mtContext->Workers.Length, 0, subtrees.Length, MinimumSubtreesPerThreadForPartitioning, mtContext->GetTargetTaskCountForInnerLoop(subtreeCount)); if (taskData.TaskCount > 1) { - MultithreadedBinSubtrees( + (subtreeCountA, subtreeCountB) = MultithreadedPartition( (MultithreadBinnedBuildContext*)Unsafe.AsPointer(ref context->Threading), - centroidBounds.Min, useX, useY, permuteMask, axisIndex, offsetToBinIndex, maximumBinIndex, subtrees, subtreeBinIndices, binCount, taskData, workerIndex, dispatcher); - useSTForBinning = false; + subtrees, subtreesNext, subtreeBinIndices, splitIndex, taskData, workerIndex, dispatcher); + useSTForPartitioning = false; } } - if (useSTForBinning) - { - //If the subtree bin indices buffer isn't available, then the binning process can't write to them! That'll happen if: - //single threaded execution, - //no bufferpool provided, - //tree size too large for stack allocation. - if (subtreeBinIndices.Allocated) - BinSubtrees(centroidBounds.Min, useX, useY, permuteMask, axisIndex, offsetToBinIndex, maximumBinIndex, subtrees, binBoundingBoxes, binCentroidBoundingBoxes, binLeafCounts, subtreeBinIndices); - else - BinSubtrees(centroidBounds.Min, useX, useY, permuteMask, axisIndex, offsetToBinIndex, maximumBinIndex, subtrees, binBoundingBoxes, binCentroidBoundingBoxes, binLeafCounts, subtreeBinIndices); - } - //Identify the split index by examining the SAH of very split option. - //Premerge from left to right so we have a sorta-summed area table to cheaply look up all possible child A bounds as we scan. 
- binBoundingBoxesScan[0] = binBoundingBoxes[0]; - binCentroidBoundingBoxesScan[0] = binCentroidBoundingBoxes[0]; - int totalLeafCount = binLeafCounts[0]; - for (int i = 1; i < binCount; ++i) - { - var previousIndex = i - 1; - ref var bounds = ref binBoundingBoxes[i]; - ref var scanBounds = ref binBoundingBoxesScan[i]; - ref var previousScanBounds = ref binBoundingBoxesScan[previousIndex]; - scanBounds.Min = Vector4.Min(bounds.Min, previousScanBounds.Min); - scanBounds.Max = Vector4.Max(bounds.Max, previousScanBounds.Max); - ref var binCentroidBoundingBox = ref binCentroidBoundingBoxes[i]; - ref var binCentroidBoundingBoxScan = ref binCentroidBoundingBoxesScan[i]; - ref var previousCentroidBoundingBoxScan = ref binCentroidBoundingBoxesScan[previousIndex]; - binCentroidBoundingBoxScan.Min = Vector4.Min(binCentroidBoundingBox.Min, previousCentroidBoundingBoxScan.Min); - binCentroidBoundingBoxScan.Max = Vector4.Max(binCentroidBoundingBox.Max, previousCentroidBoundingBoxScan.Max); - totalLeafCount += binLeafCounts[i]; - } - var leftBoundsX = binBoundingBoxes[0]; - Debug.Assert( - leftBoundsX.Min.X > float.MinValue && leftBoundsX.Min.Y > float.MinValue && leftBoundsX.Min.Z > float.MinValue, - "Bin 0 should have been updated in all cases because it is aligned with the minimum bin, and the centroid span isn't degenerate."); - - float bestSAH = float.MaxValue; - int splitIndex = 1; - //The split index is going to end up in child B. 
- var lastBinIndex = binCount - 1; - var accumulatedBoundingBoxB = binBoundingBoxes[lastBinIndex]; - var accumulatedCentroidBoundingBoxB = binCentroidBoundingBoxes[lastBinIndex]; - BoundingBox4 bestBoundingBoxB, bestCentroidBoundingBoxB; - bestBoundingBoxB = accumulatedBoundingBoxB; - bestCentroidBoundingBoxB = accumulatedCentroidBoundingBoxB; - int accumulatedLeafCountB = binLeafCounts[lastBinIndex]; - int bestLeafCountB = 0; - for (int splitIndexCandidate = lastBinIndex; splitIndexCandidate >= 1; --splitIndexCandidate) + if (useSTForPartitioning) { - var previousIndex = splitIndexCandidate - 1; - var sahCandidate = ComputeBoundsMetric(binBoundingBoxesScan[previousIndex]) * (totalLeafCount - accumulatedLeafCountB) + ComputeBoundsMetric(accumulatedBoundingBoxB) * accumulatedLeafCountB; - if (sahCandidate < bestSAH) + for (int i = 0; i < subtreeCount; ++i) { - bestSAH = sahCandidate; - splitIndex = splitIndexCandidate; - bestBoundingBoxB = accumulatedBoundingBoxB; - bestLeafCountB = accumulatedLeafCountB; - bestCentroidBoundingBoxB = accumulatedCentroidBoundingBoxB; + var targetIndex = subtreeBinIndices[i] >= splitIndex ? 
subtreeCount - ++subtreeCountB : subtreeCountA++; + subtreesNext[targetIndex] = subtrees[i]; } - ref var bounds = ref binBoundingBoxes[previousIndex]; - accumulatedBoundingBoxB.Min = Vector4.Min(bounds.Min, accumulatedBoundingBoxB.Min); - accumulatedBoundingBoxB.Max = Vector4.Max(bounds.Max, accumulatedBoundingBoxB.Max); - ref var centroidBoundsForBin = ref binCentroidBoundingBoxes[previousIndex]; - accumulatedCentroidBoundingBoxB.Min = Vector4.Min(centroidBoundsForBin.Min, accumulatedCentroidBoundingBoxB.Min); - accumulatedCentroidBoundingBoxB.Max = Vector4.Max(centroidBoundsForBin.Max, accumulatedCentroidBoundingBoxB.Max); - accumulatedLeafCountB += binLeafCounts[previousIndex]; - } - if (bestLeafCountB == 0 || bestLeafCountB == totalLeafCount || bestSAH == float.MaxValue || float.IsNaN(bestSAH) || float.IsInfinity(bestSAH)) - { - //Some form of major problem detected! Fall back to a degenerate split. - HandleDegeneracy(subtrees, boundingBoxes, nodes, metanodes, usePongBuffer, subtreeRegionStartIndex, nodeIndex, subtreeCount, parentNodeIndex, childIndexInParent, centroidBounds, context, workerIndex, dispatcher); - return; } - - var subtreeCountB = 0; - var subtreeCountA = 0; - var bestBoundingBoxA = binBoundingBoxesScan[splitIndex - 1]; - var bestCentroidBoundingBoxA = binCentroidBoundingBoxesScan[splitIndex - 1]; - - //Split the indices/bounds into two halves for the children to operate on. - if (context->SubtreesPong.Allocated) + subtrees = subtreesNext; + usePongBuffer = !usePongBuffer; + } + else + { + //There is no pong buffer allocated. We allow this for lower memory allocation, but the implementation is strictly sequential and slower. + while (subtreeCountA + subtreeCountB < subtreeCount) { - Debug.Assert(subtreeBinIndices.Allocated); - //If the current buffer is pong, then write to ping, and vice versa. - var subtreesNext = (usePongBuffer ? 
context->SubtreesPing : context->SubtreesPong).Slice(subtreeRegionStartIndex, subtreeCount); - - var useSTForPartitioning = true; - //TODO: Note that the current multithreaded partitioning implementation is nondeterministic. - //Because of microsweeps/terminal node ordering, this can result in nondeterministic tree topology. - //See https://github.com/bepu/bepuphysics2/issues/276 for more information (and how to improve this in the future if valuable). - //For now, if the user wants determinism, we just use the single threaded path for partitioning. - if (typeof(TThreading) != typeof(SingleThreaded) && !context->Deterministic) + ref var box = ref boundingBoxes[subtreeCountA]; + var binIndex = ComputeBinIndex(centroidBounds.Min, useX, useY, permuteMask, axisIndex, offsetToBinIndex, maximumBinIndex, box); + if (binIndex >= splitIndex) { - var mtContext = (MultithreadBinnedBuildContext*)Unsafe.AsPointer(ref context->Threading); - var taskData = new SharedTaskData(mtContext->Workers.Length, 0, subtrees.Length, MinimumSubtreesPerThreadForPartitioning, mtContext->GetTargetTaskCountForInnerLoop(subtreeCount)); - if (taskData.TaskCount > 1) + //Belongs to B. Swap it. 
+ var targetIndex = subtreeCount - subtreeCountB - 1; + if (Vector256.IsHardwareAccelerated) { - (subtreeCountA, subtreeCountB) = MultithreadedPartition( - (MultithreadBinnedBuildContext*)Unsafe.AsPointer(ref context->Threading), - subtrees, subtreesNext, subtreeBinIndices, splitIndex, taskData, workerIndex, dispatcher); - useSTForPartitioning = false; + var targetMemory = (byte*)(subtrees.Memory + targetIndex); + var aCountMemory = (byte*)(subtrees.Memory + subtreeCountA); + var targetVector = Vector256.Load(targetMemory); + var aCountVector = Vector256.Load(aCountMemory); + Vector256.Store(aCountVector, targetMemory); + Vector256.Store(targetVector, aCountMemory); } - } - if (useSTForPartitioning) - { - for (int i = 0; i < subtreeCount; ++i) + else { - var targetIndex = subtreeBinIndices[i] >= splitIndex ? subtreeCount - ++subtreeCountB : subtreeCountA++; - subtreesNext[targetIndex] = subtrees[i]; + Helpers.Swap(ref subtrees[targetIndex], ref subtrees[subtreeCountA]); } + ++subtreeCountB; + //(Note that we still need to examine what we just swapped into the slot! It may belong to B too!) } - subtrees = subtreesNext; - usePongBuffer = !usePongBuffer; - } - else - { - //There is no pong buffer allocated. We allow this for lower memory allocation, but the implementation is strictly sequential and slower. - while (subtreeCountA + subtreeCountB < subtreeCount) + else { - ref var box = ref boundingBoxes[subtreeCountA]; - var binIndex = ComputeBinIndex(centroidBounds.Min, useX, useY, permuteMask, axisIndex, offsetToBinIndex, maximumBinIndex, box); - if (binIndex >= splitIndex) - { - //Belongs to B. Swap it. 
- var targetIndex = subtreeCount - subtreeCountB - 1; - if (Vector256.IsHardwareAccelerated) - { - var targetMemory = (byte*)(subtrees.Memory + targetIndex); - var aCountMemory = (byte*)(subtrees.Memory + subtreeCountA); - var targetVector = Vector256.Load(targetMemory); - var aCountVector = Vector256.Load(aCountMemory); - Vector256.Store(aCountVector, targetMemory); - Vector256.Store(targetVector, aCountMemory); - } - else - { - Helpers.Swap(ref subtrees[targetIndex], ref subtrees[subtreeCountA]); - } - ++subtreeCountB; - //(Note that we still need to examine what we just swapped into the slot! It may belong to B too!) - } - else - { - //Belongs to A, no movement necessary. - ++subtreeCountA; - } + //Belongs to A, no movement necessary. + ++subtreeCountA; } } - var leafCountB = bestLeafCountB; - var leafCountA = totalLeafCount - leafCountB; - Debug.Assert(subtreeCountA + subtreeCountB == subtreeCount); - Debug.Assert(parentNodeIndex < 0 || Unsafe.Add(ref context->Nodes[parentNodeIndex].A, childIndexInParent).LeafCount == leafCountA + leafCountB); - BuildNode(bestBoundingBoxA, bestBoundingBoxB, leafCountA, leafCountB, subtrees, nodes, metanodes, nodeIndex, parentNodeIndex, childIndexInParent, subtreeCountA, subtreeCountB, ref context->Leaves, out var nodeChildIndexA, out var nodeChildIndexB); - - var targetNodeTaskCount = typeof(TThreading) == typeof(SingleThreaded) ? 
1 : - ((MultithreadBinnedBuildContext*)Unsafe.AsPointer(ref context->Threading))->GetTargetTaskCountForNodes(subtreeCount); - var shouldPushBOntoMultithreadedQueue = targetNodeTaskCount > 1 && subtreeCountA >= MinimumSubtreesPerThreadForNodeJob && subtreeCountB >= MinimumSubtreesPerThreadForNodeJob; - ContinuationHandle nodeBContinuation = default; - if (shouldPushBOntoMultithreadedQueue) + } + var leafCountB = bestLeafCountB; + var leafCountA = totalLeafCount - leafCountB; + Debug.Assert(subtreeCountA + subtreeCountB == subtreeCount); + Debug.Assert(parentNodeIndex < 0 || Unsafe.Add(ref context->Nodes[parentNodeIndex].A, childIndexInParent).LeafCount == leafCountA + leafCountB); + BuildNode(bestBoundingBoxA, bestBoundingBoxB, leafCountA, leafCountB, subtrees, nodes, metanodes, nodeIndex, parentNodeIndex, childIndexInParent, subtreeCountA, subtreeCountB, ref context->Leaves, out var nodeChildIndexA, out var nodeChildIndexB); + + var targetNodeTaskCount = typeof(TThreading) == typeof(SingleThreaded) ? 1 : + ((MultithreadBinnedBuildContext*)Unsafe.AsPointer(ref context->Threading))->GetTargetTaskCountForNodes(subtreeCount); + var shouldPushBOntoMultithreadedQueue = targetNodeTaskCount > 1 && subtreeCountA >= MinimumSubtreesPerThreadForNodeJob && subtreeCountB >= MinimumSubtreesPerThreadForNodeJob; + ContinuationHandle nodeBContinuation = default; + if (shouldPushBOntoMultithreadedQueue) + { + //Both of the children are large. Push child B onto the multithreaded execution stack so it can run at the same time as child A (potentially). + Debug.Assert(MinimumSubtreesPerThreadForNodeJob > 1, "The job threshold for a new node should be large enough that there's no need for a subtreeCountB > 1 test."); + ref var threading = ref Unsafe.As(ref context->Threading); + //Allocate the parameters to send to the worker on the local stack. Note that we have to preserve the stack for this to work; see the later WaitForCompletion. 
+ NodePushTaskContext nodePushContext; + nodePushContext.Context = context; + nodePushContext.NodeIndex = nodeChildIndexB; + nodePushContext.ParentNodeIndex = nodeIndex; + nodePushContext.CentroidBounds = bestCentroidBoundingBoxB; + //Note that we use the task id to store subtree start, subtree count, and the pong buffer flag. Don't have to do that, but no reason not to use it. + Debug.Assert((uint)subtreeCountB < (1u << 31), "The task id encodes start, count, and a pong flag, so we don't have room for a full 32 bits of count."); + var task = new Task(&BinnedBuilderNodeWorker, &nodePushContext, (long)(subtreeRegionStartIndex + subtreeCountA) | ((long)subtreeCountB << 32) | (usePongBuffer ? 1L << 63 : 0)); + nodeBContinuation = threading.TaskStack->AllocateContinuationAndPush(new Span(&task, 1), workerIndex, dispatcher, 0); + } + if (subtreeCountA > 1) + BinnedBuildNode(usePongBuffer, subtreeRegionStartIndex, nodeChildIndexA, subtreeCountA, nodeIndex, 0, bestCentroidBoundingBoxA, context, workerIndex, dispatcher); + if (!shouldPushBOntoMultithreadedQueue && subtreeCountB > 1) + BinnedBuildNode(usePongBuffer, subtreeRegionStartIndex + subtreeCountA, nodeChildIndexB, subtreeCountB, nodeIndex, 1, bestCentroidBoundingBoxB, context, workerIndex, dispatcher); + if (shouldPushBOntoMultithreadedQueue) + { + //We want to keep the stack at this level alive until the memory we allocated for the node push completes. + //Note that WaitForCompletion will execute pending work; this isn't just busywaiting the current thread. + //In addition to letting us use the local stack to store some arguments for the other thread, this wait means that all children have completed when this function returns. + //That makes knowing when to stop the queue easier. + Debug.Assert(nodeBContinuation.Initialized); + Unsafe.As(ref context->Threading).TaskStack->WaitForCompletion(nodeBContinuation, workerIndex, dispatcher); + } + } + + /// + /// Runs a binned build across the input buffer. 
+ /// + /// Subtrees (either leaves or nodes) to run the builder over. The builder may make in-place modifications to the input buffer; the input buffer should not be assumed to be in a valid state after the builder runs. + /// A parallel buffer to subtrees which is used as a scratch buffer during execution. If a default initialized buffer is provided, a slower sequential in-place fallback will be used. + /// Buffer holding the nodes created by the build process. + /// Nodes are created in a depth first ordering with respect to the input buffer. + /// Buffer holding the metanodes created by the build process. + /// Metanodes, like nodes, are created in a depth first ordering with respect to the input buffer. + /// Metanodes are in the same order and in the same slots; they simply contain data about nodes that most traversals don't need to know about. + /// Buffer holding the leaf references created by the build process. + /// The indices written by the build process are those defined in the inputs; any that is negative is encoded according to and points into the leaf buffer. + /// If a default-valued (unallocated) buffer is passed in, the binned builder will ignore leaves. + /// Buffer to be used for caching bin indices during execution. If subtreesPong is defined, binIndices must also be defined, and vice versa. + /// Thread dispatcher used to accelerate the build process. + /// Task stack being used to run the build process, if any. + /// If provided, the builder assumes the refinement is running within an existing multithreaded dispatch and will not call IThreadDispatcher.DispatchWorkers. + /// If null, the builder will create its own task stack and call IThreadDispatcher.DispatchWorkers internally. + /// Index of the current worker. + /// Number of workers that may be used in the builder. This should span all worker indices that may contribute to the build process even only a subset are expected to be used at any one time. 
+ /// Number of tasks to try to use in the builder. + /// Buffer pool used to preallocate temporary resources for building. + /// Minimum number of bins the builder should use per node. + /// Maximum number of bins the builder should use per node. Must be no higher than 255. + /// Multiplier to apply to the subtree count within a node to decide the bin count. Resulting value will then be clamped by the minimum/maximum bin counts. + /// Threshold at or under which the binned builder resorts to local counting sort sweeps. + /// Whether to force determinism at a slightly higher cost when using internally multithreaded execution. + /// If the build is single threaded, it is already deterministic and this flag has no effect. + static unsafe void BinnedBuilderInternal(Buffer subtrees, Buffer subtreesPong, Buffer nodes, Buffer metanodes, Buffer leaves, Buffer binIndices, + IThreadDispatcher dispatcher, TaskStack* taskStackPointer, int workerIndex, int workerCount, int targetTaskCount, BufferPool pool, int minimumBinCount, int maximumBinCount, float leafToBinMultiplier, int microsweepThreshold, bool deterministic) + { + var subtreeCount = subtrees.Length; + if (nodes.Length < subtreeCount - 1) + throw new ArgumentException($"The nodes buffer is too small to hold all the nodes that will be necessary for the input subtrees."); + if (maximumBinCount > 255) + throw new ArgumentException($"Maximum bin count must fit in a byte (maximum of 255)."); + if (subtreesPong.Allocated != binIndices.Allocated) + throw new ArgumentException("The parameters subtreesPong and binIndices must both be allocated or unallocated."); + if (subtreeCount == 0) + return; + if (subtreeCount == 1) + { + //If there's only one leaf, the tree has a special format: the root node has only one child. + ref var root = ref nodes[0]; + root.A = subtrees[0]; + root.B = default; + return; + } + nodes = nodes.Slice(subtreeCount - 1); + + //Don't let the user pick values that will just cause an explosion. 
+ Debug.Assert(minimumBinCount >= 2 && maximumBinCount >= 2, "At least two bins are required. In release mode, this will be clamped up to 2, but where did lower values come from?"); + minimumBinCount = int.Max(2, minimumBinCount); + maximumBinCount = int.Max(2, maximumBinCount); + //The microsweep uses the same resources as the bin allocations, so expand to hold whichever is larger. + var allocatedBinCount = int.Max(maximumBinCount, microsweepThreshold); + + + if (dispatcher == null && taskStackPointer == null) + { + //Use the single threaded path. + var allocatedByteCount = allocatedBinCount * 4 * sizeof(BoundingBox4) + allocatedBinCount * sizeof(int); + var binBoundsMemoryAllocation = stackalloc byte[allocatedByteCount + 32]; + //Should be basically irrelevant, but just in case it's not on some platform, align the allocation. + binBoundsMemoryAllocation = (byte*)(((ulong)binBoundsMemoryAllocation + 31ul) & (~31ul)); + var binBoundsMemory = new Buffer(binBoundsMemoryAllocation, allocatedByteCount); + + var threading = new SingleThreaded(binBoundsMemory, allocatedBinCount); + if (leaves.Allocated) { - //Both of the children are large. Push child B onto the multithreaded execution stack so it can run at the same time as child A (potentially). - Debug.Assert(MinimumSubtreesPerThreadForNodeJob > 1, "The job threshold for a new node should be large enough that there's no need for a subtreeCountB > 1 test."); - ref var threading = ref Unsafe.As(ref context->Threading); - //Allocate the parameters to send to the worker on the local stack. Note that we have to preserve the stack for this to work; see the later WaitForCompletion. - NodePushTaskContext nodePushContext; - nodePushContext.Context = context; - nodePushContext.NodeIndex = nodeChildIndexB; - nodePushContext.ParentNodeIndex = nodeIndex; - nodePushContext.CentroidBounds = bestCentroidBoundingBoxB; - //Note that we use the task id to store subtree start, subtree count, and the pong buffer flag. 
Don't have to do that, but no reason not to use it. - Debug.Assert((uint)subtreeCountB < (1u << 31), "The task id encodes start, count, and a pong flag, so we don't have room for a full 32 bits of count."); - var task = new Task(&BinnedBuilderNodeWorker, &nodePushContext, (long)(subtreeRegionStartIndex + subtreeCountA) | ((long)subtreeCountB << 32) | (usePongBuffer ? 1L << 63 : 0)); - nodeBContinuation = threading.TaskStack->AllocateContinuationAndPush(new Span(&task, 1), workerIndex, dispatcher, 0); + var context = new Context, SingleThreaded>( + minimumBinCount, maximumBinCount, leafToBinMultiplier, microsweepThreshold, deterministic, + subtrees, default, leaves, nodes, metanodes, binIndices, threading); + BinnedBuildNode(false, 0, 0, subtreeCount, -1, -1, default, &context, workerIndex, null); } - if (subtreeCountA > 1) - BinnedBuildNode(usePongBuffer, subtreeRegionStartIndex, nodeChildIndexA, subtreeCountA, nodeIndex, 0, bestCentroidBoundingBoxA, context, workerIndex, dispatcher); - if (!shouldPushBOntoMultithreadedQueue && subtreeCountB > 1) - BinnedBuildNode(usePongBuffer, subtreeRegionStartIndex + subtreeCountA, nodeChildIndexB, subtreeCountB, nodeIndex, 1, bestCentroidBoundingBoxB, context, workerIndex, dispatcher); - if (shouldPushBOntoMultithreadedQueue) + else { - //We want to keep the stack at this level alive until the memory we allocated for the node push completes. - //Note that WaitForCompletion will execute pending work; this isn't just busywaiting the current thread. - //In addition to letting us use the local stack to store some arguments for the other thread, this wait means that all children have completed when this function returns. - //That makes knowing when to stop the queue easier. 
- Debug.Assert(nodeBContinuation.Initialized); - Unsafe.As(ref context->Threading).TaskStack->WaitForCompletion(nodeBContinuation, workerIndex, dispatcher); + var context = new Context( + minimumBinCount, maximumBinCount, leafToBinMultiplier, microsweepThreshold, deterministic, + subtrees, default, default, nodes, metanodes, binIndices, threading); + BinnedBuildNode(false, 0, 0, subtreeCount, -1, -1, default, &context, workerIndex, null); } } - - /// - /// Runs a binned build across the input buffer. - /// - /// Subtrees (either leaves or nodes) to run the builder over. The builder may make in-place modifications to the input buffer; the input buffer should not be assumed to be in a valid state after the builder runs. - /// A parallel buffer to subtrees which is used as a scratch buffer during execution. If a default initialized buffer is provided, a slower sequential in-place fallback will be used. - /// Buffer holding the nodes created by the build process. - /// Nodes are created in a depth first ordering with respect to the input buffer. - /// Buffer holding the metanodes created by the build process. - /// Metanodes, like nodes, are created in a depth first ordering with respect to the input buffer. - /// Metanodes are in the same order and in the same slots; they simply contain data about nodes that most traversals don't need to know about. - /// Buffer holding the leaf references created by the build process. - /// The indices written by the build process are those defined in the inputs; any that is negative is encoded according to and points into the leaf buffer. - /// If a default-valued (unallocated) buffer is passed in, the binned builder will ignore leaves. - /// Buffer to be used for caching bin indices during execution. If subtreesPong is defined, binIndices must also be defined, and vice versa. - /// Thread dispatcher used to accelerate the build process. - /// Task stack being used to run the build process, if any. 
- /// If provided, the builder assumes the refinement is running within an existing multithreaded dispatch and will not call IThreadDispatcher.DispatchWorkers. - /// If null, the builder will create its own task stack and call IThreadDispatcher.DispatchWorkers internally. - /// Index of the current worker. - /// Number of workers that may be used in the builder. This should span all worker indices that may contribute to the build process even only a subset are expected to be used at any one time. - /// Number of tasks to try to use in the builder. - /// Buffer pool used to preallocate temporary resources for building. - /// Minimum number of bins the builder should use per node. - /// Maximum number of bins the builder should use per node. Must be no higher than 255. - /// Multiplier to apply to the subtree count within a node to decide the bin count. Resulting value will then be clamped by the minimum/maximum bin counts. - /// Threshold at or under which the binned builder resorts to local counting sort sweeps. - /// Whether to force determinism at a slightly higher cost when using internally multithreaded execution. - /// If the build is single threaded, it is already deterministic and this flag has no effect. 
- static unsafe void BinnedBuilderInternal(Buffer subtrees, Buffer subtreesPong, Buffer nodes, Buffer metanodes, Buffer leaves, Buffer binIndices, - IThreadDispatcher dispatcher, TaskStack* taskStackPointer, int workerIndex, int workerCount, int targetTaskCount, BufferPool pool, int minimumBinCount, int maximumBinCount, float leafToBinMultiplier, int microsweepThreshold, bool deterministic) + else { - var subtreeCount = subtrees.Length; - if (nodes.Length < subtreeCount - 1) - throw new ArgumentException($"The nodes buffer is too small to hold all the nodes that will be necessary for the input subtrees."); - if (maximumBinCount > 255) - throw new ArgumentException($"Maximum bin count must fit in a byte (maximum of 255)."); - if (subtreesPong.Allocated != binIndices.Allocated) - throw new ArgumentException("The parameters subtreesPong and binIndices must both be allocated or unallocated."); - if (subtreeCount == 0) - return; - if (subtreeCount == 1) + //Multithreaded dispatch! + //At the moment, the TaskStack expects an IThreadDispatcher to exist, so there are two cases to handle here: + //1: There is an IThreadDispatcher, but no existing TaskStack. We'll create one and do an internal dispatch. + //2: There is an IThreadDispatcher *and* an existing TaskStack. We'll push tasks into the stack, but we won't dispatch them; the user will. + Debug.Assert(dispatcher != null); + //While we could allocate on the stack with reasonable safety in the single threaded path, that's not very reasonable for the multithreaded path. + //Each worker thread could be given a node job which executes asynchronously with respect to other node jobs. + //Those node jobs could spawn multithreaded work that other workers assist with. + //Each of those jobs needs its own context for those workers, and the number of jobs is not 1:1 with the workers. + //We'll handle such dispatch-required allocations from worker pools. Here, we just preallocate stuff for the first level across all workers. 
+ pool.Take(allocatedBinCount * workerCount * (sizeof(BoundingBox4) * 4 + sizeof(int)), out var workerBinsAllocation); + + BinnedBuildWorkerContext* workerContextsPointer = stackalloc BinnedBuildWorkerContext[workerCount]; + var workerContexts = new Buffer(workerContextsPointer, workerCount); + + int binAllocationStart = 0; + for (int i = 0; i < workerCount; ++i) { - //If there's only one leaf, the tree has a special format: the root node has only one child. - ref var root = ref nodes[0]; - root.A = subtrees[0]; - root.B = default; - return; + workerContexts[i] = new BinnedBuildWorkerContext(workerBinsAllocation, ref binAllocationStart, allocatedBinCount); } - nodes = nodes.Slice(subtreeCount - 1); - - //Don't let the user pick values that will just cause an explosion. - Debug.Assert(minimumBinCount >= 2 && maximumBinCount >= 2, "At least two bins are required. In release mode, this will be clamped up to 2, but where did lower values come from?"); - minimumBinCount = int.Max(2, minimumBinCount); - maximumBinCount = int.Max(2, maximumBinCount); - //The microsweep uses the same resources as the bin allocations, so expand to hold whichever is larger. - var allocatedBinCount = int.Max(maximumBinCount, microsweepThreshold); - - if (dispatcher == null && taskStackPointer == null) + TaskStack taskStack; + bool dispatchInternally = taskStackPointer == null; + if (dispatchInternally) + { + taskStack = new TaskStack(pool, dispatcher, workerCount); + taskStackPointer = &taskStack; + } + var threading = new MultithreadBinnedBuildContext { - //Use the single threaded path. - var allocatedByteCount = allocatedBinCount * 4 * sizeof(BoundingBox4) + allocatedBinCount * sizeof(int); - var binBoundsMemoryAllocation = stackalloc byte[allocatedByteCount + 32]; - //Should be basically irrelevant, but just in case it's not on some platform, align the allocation. 
- binBoundsMemoryAllocation = (byte*)(((ulong)binBoundsMemoryAllocation + 31ul) & (~31ul)); - var binBoundsMemory = new Buffer(binBoundsMemoryAllocation, allocatedByteCount); - - var threading = new SingleThreaded(binBoundsMemory, allocatedBinCount); - if (leaves.Allocated) + TopLevelTargetTaskCount = targetTaskCount, + OriginalSubtreeCount = subtrees.Length, + TaskStack = taskStackPointer, + Workers = workerContexts, + }; + if (leaves.Allocated) + { + var context = new Context, MultithreadBinnedBuildContext>( + minimumBinCount, maximumBinCount, leafToBinMultiplier, microsweepThreshold, deterministic, + subtrees, subtreesPong, leaves, nodes, metanodes, binIndices, threading); + + if (dispatchInternally) { - var context = new Context, SingleThreaded>( - minimumBinCount, maximumBinCount, leafToBinMultiplier, microsweepThreshold, deterministic, - subtrees, default, leaves, nodes, metanodes, binIndices, threading); - BinnedBuildNode(false, 0, 0, subtreeCount, -1, -1, default, &context, workerIndex, null); + Debug.Assert(workerIndex == 0, "If we're dispatching internally, there shouldn't be any other active workers."); + taskStackPointer->PushUnsafely(new Task(&BinnedBuilderWorkerEntry>, &context), 0, dispatcher); + TaskStack.DispatchWorkers(dispatcher, taskStackPointer, workerCount); } else { - var context = new Context( - minimumBinCount, maximumBinCount, leafToBinMultiplier, microsweepThreshold, deterministic, - subtrees, default, default, nodes, metanodes, binIndices, threading); - BinnedBuildNode(false, 0, 0, subtreeCount, -1, -1, default, &context, workerIndex, null); + BinnedBuildNode(false, 0, 0, context.SubtreesPing.Length, -1, -1, default, &context, workerIndex, dispatcher); } } else { - //Multithreaded dispatch! - //At the moment, the TaskStack expects an IThreadDispatcher to exist, so there are two cases to handle here: - //1: There is an IThreadDispatcher, but no existing TaskStack. We'll create one and do an internal dispatch. 
- //2: There is an IThreadDispatcher *and* an existing TaskStack. We'll push tasks into the stack, but we won't dispatch them; the user will. - Debug.Assert(dispatcher != null); - //While we could allocate on the stack with reasonable safety in the single threaded path, that's not very reasonable for the multithreaded path. - //Each worker thread could be given a node job which executes asynchronously with respect to other node jobs. - //Those node jobs could spawn multithreaded work that other workers assist with. - //Each of those jobs needs its own context for those workers, and the number of jobs is not 1:1 with the workers. - //We'll handle such dispatch-required allocations from worker pools. Here, we just preallocate stuff for the first level across all workers. - pool.Take(allocatedBinCount * workerCount * (sizeof(BoundingBox4) * 4 + sizeof(int)), out var workerBinsAllocation); - - BinnedBuildWorkerContext* workerContextsPointer = stackalloc BinnedBuildWorkerContext[workerCount]; - var workerContexts = new Buffer(workerContextsPointer, workerCount); - - int binAllocationStart = 0; - for (int i = 0; i < workerCount; ++i) - { - workerContexts[i] = new BinnedBuildWorkerContext(workerBinsAllocation, ref binAllocationStart, allocatedBinCount); - } + var context = new Context( + minimumBinCount, maximumBinCount, leafToBinMultiplier, microsweepThreshold, deterministic, + subtrees, subtreesPong, default, nodes, metanodes, binIndices, threading); - TaskStack taskStack; - bool dispatchInternally = taskStackPointer == null; if (dispatchInternally) { - taskStack = new TaskStack(pool, dispatcher, workerCount); - taskStackPointer = &taskStack; - } - var threading = new MultithreadBinnedBuildContext - { - TopLevelTargetTaskCount = targetTaskCount, - OriginalSubtreeCount = subtrees.Length, - TaskStack = taskStackPointer, - Workers = workerContexts, - }; - if (leaves.Allocated) - { - var context = new Context, MultithreadBinnedBuildContext>( - minimumBinCount, 
maximumBinCount, leafToBinMultiplier, microsweepThreshold, deterministic, - subtrees, subtreesPong, leaves, nodes, metanodes, binIndices, threading); - - if (dispatchInternally) - { - Debug.Assert(workerIndex == 0, "If we're dispatching internally, there shouldn't be any other active workers."); - taskStackPointer->PushUnsafely(new Task(&BinnedBuilderWorkerEntry>, &context), 0, dispatcher); - TaskStack.DispatchWorkers(dispatcher, taskStackPointer, workerCount); - } - else - { - BinnedBuildNode(false, 0, 0, context.SubtreesPing.Length, -1, -1, default, &context, workerIndex, dispatcher); - } + Debug.Assert(workerIndex == 0, "If we're dispatching internally, there shouldn't be any other active workers."); + taskStackPointer->PushUnsafely(new Task(&BinnedBuilderWorkerEntry, &context), 0, dispatcher); + TaskStack.DispatchWorkers(dispatcher, taskStackPointer, workerCount); } else { - var context = new Context( - minimumBinCount, maximumBinCount, leafToBinMultiplier, microsweepThreshold, deterministic, - subtrees, subtreesPong, default, nodes, metanodes, binIndices, threading); - - if (dispatchInternally) - { - Debug.Assert(workerIndex == 0, "If we're dispatching internally, there shouldn't be any other active workers."); - taskStackPointer->PushUnsafely(new Task(&BinnedBuilderWorkerEntry, &context), 0, dispatcher); - TaskStack.DispatchWorkers(dispatcher, taskStackPointer, workerCount); - } - else - { - BinnedBuildNode(false, 0, 0, context.SubtreesPing.Length, -1, -1, default, &context, workerIndex, dispatcher); - } + BinnedBuildNode(false, 0, 0, context.SubtreesPing.Length, -1, -1, default, &context, workerIndex, dispatcher); } + } - if (dispatchInternally) - taskStackPointer->Dispose(pool, dispatcher); - pool.Return(ref workerBinsAllocation); + if (dispatchInternally) + taskStackPointer->Dispose(pool, dispatcher); + pool.Return(ref workerBinsAllocation); - } } + } - unsafe static void BinnedBuilderWorkerEntry(long taskId, void* untypedContext, int workerIndex, 
IThreadDispatcher dispatcher) - where TLeaves : unmanaged - { - var context = (Context*)untypedContext; - BinnedBuildNode(false, 0, 0, context->SubtreesPing.Length, -1, -1, default, context, workerIndex, dispatcher); - //Once the entry point returns, all workers should stop because it won't return unless both nodes are done. - context->Threading.TaskStack->RequestStop(); - } + unsafe static void BinnedBuilderWorkerEntry(long taskId, void* untypedContext, int workerIndex, IThreadDispatcher dispatcher) + where TLeaves : unmanaged + { + var context = (Context*)untypedContext; + BinnedBuildNode(false, 0, 0, context->SubtreesPing.Length, -1, -1, default, context, workerIndex, dispatcher); + //Once the entry point returns, all workers should stop because it won't return unless both nodes are done. + context->Threading.TaskStack->RequestStop(); + } - /// - /// Runs a multithreaded binned build across the subtrees buffer. - /// - /// Subtrees (either leaves or nodes) to run the builder over. The builder may make in-place modifications to the input buffer; the input buffer should not be assumed to be in a valid state after the builder runs. - /// Buffer holding the nodes created by the build process. - /// Nodes are created in a depth first ordering with respect to the input buffer. - /// Buffer holding the metanodes created by the build process. - /// Metanodes, like nodes, are created in a depth first ordering with respect to the input buffer. - /// Metanodes are in the same order and in the same slots; they simply contain data about nodes that most traversals don't need to know about. - /// Buffer holding the leaf references created by the build process. - /// The indices written by the build process are those defined in the inputs; any that is negative is encoded according to and points into the leaf buffer. - /// If a default-valued (unallocated) buffer is passed in, the binned builder will ignore leaves. 
- /// Buffer pool used to preallocate a pingpong buffer if the number of subtrees exceeds maximumSubtreeStackAllocationCount. If null, stack allocation or a slower in-place partitioning will be used. - /// Dispatcher used to multithread the execution of the build. If the dispatcher is not null, pool must also not be null. - /// Task stack being used to run the build process, if any. - /// If provided, the builder assumes the refinement is running within an existing multithreaded dispatch and will not call IThreadDispatcher.DispatchWorkers. - /// If null, the builder will create its own task stack and call IThreadDispatcher.DispatchWorkers internally. - /// A pool must be provided if a thread dispatcher is given. - /// Index of the currently executing worker. If not running within a dispatch, 0 is valid. - /// Number of workers that may be used in the builder. This should span all worker indices that may contribute to the build process even only a subset are expected to be used at any one time. - /// If negative, the dispatcher's thread count will be used. - /// Number of tasks to try to use in the builder. If negative, the dispatcher's thread count will be used. - /// Maximum number of subtrees to try putting on the stack for the binned builder's pong buffers. - /// Subtree counts larger than this threshold will either resort to a buffer pool allocation (if available) or slower in-place partition operations. - /// Minimum number of bins the builder should use per node. - /// Maximum number of bins the builder should use per node. - /// Multiplier to apply to the subtree count within a node to decide the bin count. Resulting value will then be clamped by the minimum/maximum bin counts. - /// Threshold at or under which the binned builder resorts to local counting sort sweeps. - /// Whether to force determinism at a slightly higher cost when using internally multithreaded execution. 
- /// If the build is single threaded, it is already deterministic and this flag has no effect. - public static unsafe void BinnedBuild(Buffer subtrees, Buffer nodes, Buffer metanodes, Buffer leaves, - BufferPool pool = null, IThreadDispatcher dispatcher = null, TaskStack* taskStackPointer = null, int workerIndex = 0, int workerCount = -1, int targetTaskCount = -1, - int maximumSubtreeStackAllocationCount = 4096, int minimumBinCount = 16, int maximumBinCount = 64, float leafToBinMultiplier = 1 / 16f, int microsweepThreshold = 64, bool deterministic = false) + /// + /// Runs a multithreaded binned build across the subtrees buffer. + /// + /// Subtrees (either leaves or nodes) to run the builder over. The builder may make in-place modifications to the input buffer; the input buffer should not be assumed to be in a valid state after the builder runs. + /// Buffer holding the nodes created by the build process. + /// Nodes are created in a depth first ordering with respect to the input buffer. + /// Buffer holding the metanodes created by the build process. + /// Metanodes, like nodes, are created in a depth first ordering with respect to the input buffer. + /// Metanodes are in the same order and in the same slots; they simply contain data about nodes that most traversals don't need to know about. + /// Buffer holding the leaf references created by the build process. + /// The indices written by the build process are those defined in the inputs; any that is negative is encoded according to and points into the leaf buffer. + /// If a default-valued (unallocated) buffer is passed in, the binned builder will ignore leaves. + /// Buffer pool used to preallocate a pingpong buffer if the number of subtrees exceeds maximumSubtreeStackAllocationCount. If null, stack allocation or a slower in-place partitioning will be used. + /// Dispatcher used to multithread the execution of the build. If the dispatcher is not null, pool must also not be null. 
+ /// Task stack being used to run the build process, if any. + /// If provided, the builder assumes the refinement is running within an existing multithreaded dispatch and will not call IThreadDispatcher.DispatchWorkers. + /// If null, the builder will create its own task stack and call IThreadDispatcher.DispatchWorkers internally. + /// A pool must be provided if a thread dispatcher is given. + /// Index of the currently executing worker. If not running within a dispatch, 0 is valid. + /// Number of workers that may be used in the builder. This should span all worker indices that may contribute to the build process even only a subset are expected to be used at any one time. + /// If negative, the dispatcher's thread count will be used. + /// Number of tasks to try to use in the builder. If negative, the dispatcher's thread count will be used. + /// Maximum number of subtrees to try putting on the stack for the binned builder's pong buffers. + /// Subtree counts larger than this threshold will either resort to a buffer pool allocation (if available) or slower in-place partition operations. + /// Minimum number of bins the builder should use per node. + /// Maximum number of bins the builder should use per node. + /// Multiplier to apply to the subtree count within a node to decide the bin count. Resulting value will then be clamped by the minimum/maximum bin counts. + /// Threshold at or under which the binned builder resorts to local counting sort sweeps. + /// Whether to force determinism at a slightly higher cost when using internally multithreaded execution. + /// If the build is single threaded, it is already deterministic and this flag has no effect. 
+ public static unsafe void BinnedBuild(Buffer subtrees, Buffer nodes, Buffer metanodes, Buffer leaves, + BufferPool pool = null, IThreadDispatcher dispatcher = null, TaskStack* taskStackPointer = null, int workerIndex = 0, int workerCount = -1, int targetTaskCount = -1, + int maximumSubtreeStackAllocationCount = 4096, int minimumBinCount = 16, int maximumBinCount = 64, float leafToBinMultiplier = 1 / 16f, int microsweepThreshold = 64, bool deterministic = false) + { + if (subtrees.Length <= 2) { - if (subtrees.Length <= 2) - { - //No need to do anything fancy, all subtrees fit in the root. Requires a special case for the partial root. - nodes[0] = new Node { A = subtrees[0], B = subtrees.Length == 2 ? subtrees[1] : default }; - if (metanodes.Allocated) - metanodes[0] = new Metanode { Parent = -1, IndexInParent = -1 }; - return; - } - if (dispatcher != null && pool == null) - throw new ArgumentException("If a ThreadDispatcher has been given to BinnedBuild, a BufferPool must also be provided."); - Buffer subtreesPong; - Buffer binIndices; - bool requiresReturn = false; - if (subtrees.Length <= maximumSubtreeStackAllocationCount) - { - var subtreesPongMemory = stackalloc NodeChild[subtrees.Length]; - subtreesPong = new Buffer(subtreesPongMemory, subtrees.Length); - var binIndicesMemory = stackalloc byte[subtrees.Length]; - binIndices = new Buffer(binIndicesMemory, subtrees.Length); - } - else if (pool != null) - { - pool.Take(subtrees.Length, out subtreesPong); - pool.Take(subtrees.Length, out binIndices); - requiresReturn = true; - } - else - { - binIndices = default; - subtreesPong = default; - } - BinnedBuilderInternal(subtrees, subtreesPong, nodes, metanodes, leaves, binIndices, dispatcher, taskStackPointer, workerIndex, - dispatcher == null ? 0 : workerCount < 0 ? dispatcher.ThreadCount : workerCount, - dispatcher == null ? 0 : targetTaskCount < 0 ? 
dispatcher.ThreadCount : targetTaskCount, - pool, minimumBinCount, maximumBinCount, leafToBinMultiplier, microsweepThreshold, deterministic); - - if (requiresReturn) - { - pool.Return(ref binIndices); - pool.Return(ref subtreesPong); - } + //No need to do anything fancy, all subtrees fit in the root. Requires a special case for the partial root. + nodes[0] = new Node { A = subtrees[0], B = subtrees.Length == 2 ? subtrees[1] : default }; + if (metanodes.Allocated) + metanodes[0] = new Metanode { Parent = -1, IndexInParent = -1 }; + return; + } + if (dispatcher != null && pool == null) + throw new ArgumentException("If a ThreadDispatcher has been given to BinnedBuild, a BufferPool must also be provided."); + Buffer subtreesPong; + Buffer binIndices; + bool requiresReturn = false; + if (subtrees.Length <= maximumSubtreeStackAllocationCount) + { + var subtreesPongMemory = stackalloc NodeChild[subtrees.Length]; + subtreesPong = new Buffer(subtreesPongMemory, subtrees.Length); + var binIndicesMemory = stackalloc byte[subtrees.Length]; + binIndices = new Buffer(binIndicesMemory, subtrees.Length); } + else if (pool != null) + { + pool.Take(subtrees.Length, out subtreesPong); + pool.Take(subtrees.Length, out binIndices); + requiresReturn = true; + } + else + { + binIndices = default; + subtreesPong = default; + } + BinnedBuilderInternal(subtrees, subtreesPong, nodes, metanodes, leaves, binIndices, dispatcher, taskStackPointer, workerIndex, + dispatcher == null ? 0 : workerCount < 0 ? dispatcher.ThreadCount : workerCount, + dispatcher == null ? 0 : targetTaskCount < 0 ? dispatcher.ThreadCount : targetTaskCount, + pool, minimumBinCount, maximumBinCount, leafToBinMultiplier, microsweepThreshold, deterministic); - /// - /// Runs a binned build across the subtrees buffer. - /// - /// Subtrees (either leaves or nodes) to run the builder over. 
The builder may make in-place modifications to the input buffer; the input buffer should not be assumed to be in a valid state after the builder runs. - /// Buffer pool used to preallocate a pingpong buffer if the number of subtrees exceeds maximumSubtreeStackAllocationCount. If null, stack allocation or a slower in-place partitioning will be used. - /// A pool must be provided if a thread dispatcher is given. - /// Dispatcher used to multithread the execution of the build. If the dispatcher is not null, pool must also not be null. - /// Task stack being used to run the build process, if any. - /// If provided, the builder assumes the refinement is running within an existing multithreaded dispatch and will not call IThreadDispatcher.DispatchWorkers. - /// If null, the builder will create its own task stack and call IThreadDispatcher.DispatchWorkers internally. - /// Index of the currently executing worker. If not running within a dispatch, 0 is valid. - /// Number of workers that may be used in the builder. This should span all worker indices that may contribute to the build process even only a subset are expected to be used at any one time. - /// If negative, the dispatcher's thread count will be used. - /// Number of tasks to try to use in the builder. If negative, the dispatcher's thread count will be used. - /// Maximum number of subtrees to try putting on the stack for the binned builder's pong buffers. - /// Subtree counts larger than this threshold will either resort to a buffer pool allocation (if available) or slower in-place partition operations. - /// Minimum number of bins the builder should use per node. - /// Maximum number of bins the builder should use per node. - /// Multiplier to apply to the subtree count within a node to decide the bin count. Resulting value will then be clamped by the minimum/maximum bin counts. - /// Threshold at or under which the binned builder resorts to local counting sort sweeps. 
- /// Whether to force determinism at a slightly higher cost when using internally multithreaded execution. - /// If the build is single threaded, it is already deterministic and this flag has no effect. - public unsafe void BinnedBuild(Buffer subtrees, - BufferPool pool = null, IThreadDispatcher dispatcher = null, TaskStack* taskStackPointer = null, int workerIndex = 0, int workerCount = -1, int targetTaskCount = -1, - int maximumSubtreeStackAllocationCount = 4096, int minimumBinCount = 16, int maximumBinCount = 64, float leafToBinMultiplier = 1 / 16f, int microsweepThreshold = 64, bool deterministic = false) + if (requiresReturn) { - BinnedBuild(subtrees, Nodes.Slice(NodeCount), Metanodes.Slice(NodeCount), Leaves.Slice(LeafCount), pool, dispatcher, taskStackPointer, workerIndex, - workerCount, targetTaskCount, maximumSubtreeStackAllocationCount, minimumBinCount, maximumBinCount, leafToBinMultiplier, microsweepThreshold); + pool.Return(ref binIndices); + pool.Return(ref subtreesPong); } } + + /// + /// Runs a binned build across the subtrees buffer. + /// + /// Subtrees (either leaves or nodes) to run the builder over. The builder may make in-place modifications to the input buffer; the input buffer should not be assumed to be in a valid state after the builder runs. + /// Buffer pool used to preallocate a pingpong buffer if the number of subtrees exceeds maximumSubtreeStackAllocationCount. If null, stack allocation or a slower in-place partitioning will be used. + /// A pool must be provided if a thread dispatcher is given. + /// Dispatcher used to multithread the execution of the build. If the dispatcher is not null, pool must also not be null. + /// Task stack being used to run the build process, if any. + /// If provided, the builder assumes the refinement is running within an existing multithreaded dispatch and will not call IThreadDispatcher.DispatchWorkers. 
+ /// If null, the builder will create its own task stack and call IThreadDispatcher.DispatchWorkers internally.
+ /// Index of the currently executing worker. If not running within a dispatch, 0 is valid.
+ /// Number of workers that may be used in the builder. This should span all worker indices that may contribute to the build process even if only a subset is expected to be used at any one time.
+ /// If negative, the dispatcher's thread count will be used.
+ /// Number of tasks to try to use in the builder. If negative, the dispatcher's thread count will be used.
+ /// Maximum number of subtrees to try putting on the stack for the binned builder's pong buffers.
+ /// Subtree counts larger than this threshold will either resort to a buffer pool allocation (if available) or slower in-place partition operations.
+ /// Minimum number of bins the builder should use per node.
+ /// Maximum number of bins the builder should use per node.
+ /// Multiplier to apply to the subtree count within a node to decide the bin count. Resulting value will then be clamped by the minimum/maximum bin counts.
+ /// Threshold at or under which the binned builder resorts to local counting sort sweeps.
+ /// Whether to force determinism at a slightly higher cost when using internally multithreaded execution.
+ /// If the build is single threaded, it is already deterministic and this flag has no effect. 
+ public unsafe void BinnedBuild(Buffer subtrees,
+ BufferPool pool = null, IThreadDispatcher dispatcher = null, TaskStack* taskStackPointer = null, int workerIndex = 0, int workerCount = -1, int targetTaskCount = -1,
+ int maximumSubtreeStackAllocationCount = 4096, int minimumBinCount = 16, int maximumBinCount = 64, float leafToBinMultiplier = 1 / 16f, int microsweepThreshold = 64, bool deterministic = false)
+ {
+ BinnedBuild(subtrees, Nodes.Slice(NodeCount), Metanodes.Slice(NodeCount), Leaves.Slice(LeafCount), pool, dispatcher, taskStackPointer, workerIndex,
+ workerCount, targetTaskCount, maximumSubtreeStackAllocationCount, minimumBinCount, maximumBinCount, leafToBinMultiplier, microsweepThreshold, deterministic);
+ } }