Skip to content

Commit da79dc6

Browse files
authored
NonBacktracking inner matching loop optimizations (#70217)
* Inner matching loop optimizations
* Cleanup and comments
* Fix and generalize FixedLength handling in matcher
* Fix FixedLength support again
* Suggestions from code review
* Change _stateInfo type to byte
1 parent 0712aeb commit da79dc6

File tree

5 files changed

+396
-185
lines changed

5 files changed

+396
-185
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/DfaMatchingState.cs

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -23,8 +23,6 @@ internal DfaMatchingState(SymbolicRegexNode<TSet> node, uint prevCharKind)
2323

2424
internal int Id { get; set; }
2525

26-
internal bool IsInitialState { get; set; }
27-
2826
/// <summary>This is a deadend state</summary>
2927
internal bool IsDeadend => Node.IsNothing;
3028

@@ -130,7 +128,7 @@ internal DfaMatchingState<TSet> Next(TSet minterm)
130128
}
131129

132130
[MethodImpl(MethodImplOptions.AggressiveInlining)]
133-
internal bool IsNullable(uint nextCharKind)
131+
internal bool IsNullableFor(uint nextCharKind)
134132
{
135133
Debug.Assert(nextCharKind is 0 or CharKind.BeginningEnd or CharKind.Newline or CharKind.WordLetter or CharKind.NewLineS);
136134
uint context = CharKind.Context(PrevCharKind, nextCharKind);

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexBuilder.cs

Lines changed: 71 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -114,13 +114,54 @@ internal sealed class SymbolicRegexBuilder<TSet> where TSet : IComparable<TSet>,
114114
/// </summary>
115115
internal DfaMatchingState<TSet>[]? _stateArray;
116116
internal DfaMatchingState<TSet>[]? _capturingStateArray;
117+
118+
/// <summary>
119+
/// Maps state IDs to context-independent information for all states in <see cref="_stateArray"/>.
120+
/// </summary>
121+
internal byte[]? _stateInfo;
122+
123+
// Bit masks for decoding elements of _stateInfo
124+
private const int isInitialMask = 0b0001;
125+
private const int isDeadendMask = 0b0010;
126+
private const int isNullableMask = 0b0100;
127+
private const int canBeNullableMask = 0b1000;
128+
129+
/// <summary>Assign the context-independent information for the given state.</summary>
130+
internal void SetStateInfo(int stateId, bool isInitial, bool isDeadend, bool isNullable, bool canBeNullable)
131+
{
132+
Debug.Assert(_stateInfo is not null);
133+
byte info = 0;
134+
if (isInitial)
135+
info |= isInitialMask;
136+
if (isDeadend)
137+
info |= isDeadendMask;
138+
if (isNullable)
139+
info |= isNullableMask;
140+
if (canBeNullable)
141+
info |= canBeNullableMask;
142+
_stateInfo[stateId] = info;
143+
}
144+
145+
/// <summary>Get context-independent information for the given state.</summary>
146+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
147+
internal (bool IsInitial, bool IsDeadend, bool IsNullable, bool CanBeNullable) GetStateInfo(int stateId)
148+
{
149+
Debug.Assert(_stateInfo is not null);
150+
byte info = _stateInfo[stateId];
151+
return (
152+
(info & isInitialMask) != 0,
153+
(info & isDeadendMask) != 0,
154+
(info & isNullableMask) != 0,
155+
(info & canBeNullableMask) != 0);
156+
}
157+
117158
/// <remarks>
118159
/// For these "delta" arrays, technically Volatile.Read should be used to read out an element,
119160
/// but in practice that's not needed on the runtimes in use (though that needs to be documented
120161
/// via https://github.com/dotnet/runtime/issues/63474), and use of Volatile.Read is
121162
/// contributing non-trivial overhead (https://github.com/dotnet/runtime/issues/65789).
122163
/// </remarks>
123-
internal DfaMatchingState<TSet>?[]? _delta;
164+
internal int[]? _delta;
124165
internal List<(DfaMatchingState<TSet>, DerivativeEffect[])>?[]? _capturingDelta;
125166
private const int InitialStateLimit = 1024;
126167

@@ -170,10 +211,11 @@ internal SymbolicRegexBuilder(ISolver<TSet> solver, CharSetSolver charSetSolver)
170211
{
171212
_stateArray = new DfaMatchingState<TSet>[InitialStateLimit];
172213
_capturingStateArray = new DfaMatchingState<TSet>[InitialStateLimit];
214+
_stateInfo = new byte[InitialStateLimit];
173215

174216
// the extra +1 slot with id minterms.Length is reserved for \Z (last occurrence of \n)
175217
_mintermsLog = BitOperations.Log2((uint)_minterms.Length) + 1;
176-
_delta = new DfaMatchingState<TSet>[InitialStateLimit << _mintermsLog];
218+
_delta = new int[InitialStateLimit << _mintermsLog];
177219
_capturingDelta = new List<(DfaMatchingState<TSet>, DerivativeEffect[])>[InitialStateLimit << _mintermsLog];
178220
}
179221

@@ -208,10 +250,10 @@ internal TSet GetMinterm(int mintermId)
208250
}
209251

210252
/// <summary>Returns the span from <see cref="_delta"/> that may contain transitions for the given state</summary>
211-
internal Span<DfaMatchingState<TSet>?> GetDeltasFor(DfaMatchingState<TSet> state)
253+
internal Span<int> GetDeltasFor(DfaMatchingState<TSet> state)
212254
{
213255
if (_delta is null || _minterms is null)
214-
return Span<DfaMatchingState<TSet>?>.Empty;
256+
return Span<int>.Empty;
215257
int numMinterms = state.StartsWithLineAnchor ? _minterms.Length + 1 : _minterms.Length;
216258
return _delta.AsSpan(state.Id << _mintermsLog, numMinterms);
217259
}
@@ -453,8 +495,9 @@ internal SymbolicRegexNode<TNewSet> Transform<TNewSet>(SymbolicRegexNode<TSet> n
453495
/// <param name="node">the pattern that this state will represent</param>
454496
/// <param name="prevCharKind">the kind of the character that led to this state</param>
455497
/// <param name="capturing">whether to use the separate space of states with capturing transitions or not</param>
498+
/// <param name="isInitialState">whether to mark the state as an initial state or not</param>
456499
/// <returns></returns>
457-
public DfaMatchingState<TSet> CreateState(SymbolicRegexNode<TSet> node, uint prevCharKind, bool capturing = false)
500+
public DfaMatchingState<TSet> CreateState(SymbolicRegexNode<TSet> node, uint prevCharKind, bool capturing = false, bool isInitialState = false)
458501
{
459502
//first prune the anchors in the node
460503
TSet wlbSet = _wordLetterForBoundariesSet;
@@ -469,21 +512,21 @@ public DfaMatchingState<TSet> CreateState(SymbolicRegexNode<TSet> node, uint pre
469512
var s = new DfaMatchingState<TSet>(pruned_node, prevCharKind);
470513
if (!(capturing ? _capturingStateCache : _stateCache).TryGetValue(s, out DfaMatchingState<TSet>? state))
471514
{
472-
state = MakeNewState(s, capturing);
515+
state = MakeNewState(s, capturing, isInitialState);
473516
}
474517

475518
return state;
476519
}
477520

478-
private DfaMatchingState<TSet> MakeNewState(DfaMatchingState<TSet> state, bool capturing)
521+
private DfaMatchingState<TSet> MakeNewState(DfaMatchingState<TSet> state, bool capturing, bool isInitialState)
479522
{
480523
lock (this)
481524
{
482525
HashSet<DfaMatchingState<TSet>> cache = capturing ? _capturingStateCache : _stateCache;
526+
cache.Add(state); // Add to cache first to make 1 the first state ID
483527
state.Id = cache.Count;
484-
cache.Add(state);
485528

486-
Debug.Assert(_stateArray is not null && _capturingStateArray is not null);
529+
Debug.Assert(_stateArray is not null && _capturingStateArray is not null && _stateInfo is not null);
487530

488531
const int GrowthSize = 1024;
489532
if (capturing)
@@ -503,8 +546,10 @@ private DfaMatchingState<TSet> MakeNewState(DfaMatchingState<TSet> state, bool c
503546
int newsize = _stateArray.Length + GrowthSize;
504547
Array.Resize(ref _stateArray, newsize);
505548
Array.Resize(ref _delta, newsize << _mintermsLog);
549+
Array.Resize(ref _stateInfo, newsize);
506550
}
507551
_stateArray[state.Id] = state;
552+
SetStateInfo(state.Id, isInitialState, state.IsDeadend, state.Node.IsNullable, state.Node.CanBeNullable);
508553
}
509554
return state;
510555
}
@@ -549,13 +594,20 @@ private int MakeNewNfaState(int coreStateId)
549594
}
550595
}
551596

552-
/// <summary>Gets the core state corresponding to the NFA state</summary>
553-
public DfaMatchingState<TSet> GetCoreState(int nfaStateId)
597+
/// <summary>Gets the core state Id corresponding to the NFA state</summary>
598+
public int GetCoreStateId(int nfaStateId)
554599
{
555600
Debug.Assert(_stateArray is not null);
556601
Debug.Assert(nfaStateId < _nfaStateArray.Length);
557602
Debug.Assert(_nfaStateArray[nfaStateId] < _stateArray.Length);
558-
return _stateArray[_nfaStateArray[nfaStateId]];
603+
return _nfaStateArray[nfaStateId];
604+
}
605+
606+
/// <summary>Gets the core state corresponding to the NFA state</summary>
607+
public DfaMatchingState<TSet> GetCoreState(int nfaStateId)
608+
{
609+
Debug.Assert(_stateArray is not null);
610+
return _stateArray[GetCoreStateId(nfaStateId)];
559611
}
560612

561613
/// <summary>Critical region for defining a new core transition</summary>
@@ -570,13 +622,13 @@ public DfaMatchingState<TSet> CreateNewTransition(DfaMatchingState<TSet> sourceS
570622
public bool TryCreateNewTransition(
571623
DfaMatchingState<TSet> sourceState, int mintermId, int offset, bool checkThreshold, [NotNullWhen(true)] out DfaMatchingState<TSet>? nextState)
572624
{
573-
Debug.Assert(_delta is not null);
625+
Debug.Assert(_delta is not null && _stateArray is not null);
574626
lock (this)
575627
{
576628
Debug.Assert(offset < _delta.Length);
577629

578630
// check if meanwhile delta[offset] has become defined possibly by another thread
579-
DfaMatchingState<TSet>? targetState = _delta[offset];
631+
DfaMatchingState<TSet>? targetState = _stateArray[_delta[offset]];
580632
if (targetState is null)
581633
{
582634
if (checkThreshold && _stateCache.Count >= SymbolicRegexThresholds.NfaThreshold)
@@ -586,7 +638,7 @@ public bool TryCreateNewTransition(
586638
}
587639

588640
targetState = sourceState.Next(GetMinterm(mintermId));
589-
Volatile.Write(ref _delta[offset], targetState);
641+
Volatile.Write(ref _delta[offset], targetState.Id);
590642
}
591643

592644
nextState = targetState;
@@ -597,7 +649,7 @@ public bool TryCreateNewTransition(
597649
/// <summary>Gets or creates a new NFA transition.</summary>
598650
public int[] CreateNewNfaTransition(int nfaStateId, int mintermId, int nfaOffset)
599651
{
600-
Debug.Assert(_delta is not null);
652+
Debug.Assert(_delta is not null && _stateArray is not null);
601653
lock (this)
602654
{
603655
Debug.Assert(nfaOffset < _nfaDelta.Length);
@@ -609,7 +661,9 @@ public int[] CreateNewNfaTransition(int nfaStateId, int mintermId, int nfaOffset
609661
// Create the underlying transition from the core state corresponding to the nfa state
610662
DfaMatchingState<TSet> coreState = GetCoreState(nfaStateId);
611663
int coreOffset = (coreState.Id << _mintermsLog) | mintermId;
612-
DfaMatchingState<TSet>? coreTarget = _delta[coreOffset] ?? CreateNewTransition(coreState, mintermId, coreOffset);
664+
int coreTargetId = _delta[coreOffset];
665+
DfaMatchingState<TSet>? coreTarget = coreTargetId > 0 ?
666+
_stateArray[coreTargetId] : CreateNewTransition(coreState, mintermId, coreOffset);
613667

614668
SymbolicRegexNode<TSet> node = coreTarget.Node.Kind == SymbolicRegexNodeKind.DisableBacktrackingSimulation ?
615669
coreTarget.Node._left! : coreTarget.Node;

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Dgml.cs

Lines changed: 6 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,7 @@ public override void SaveDGML(TextWriter writer, int maxLabelLength)
2929
foreach (DfaMatchingState<TSet> state in _builder._stateCache)
3030
{
3131
writer.WriteLine(" <Node Id=\"{0}\" Label=\"{0}\" Category=\"State\" Group=\"Collapsed\" StateInfo=\"{1}\">", state.Id, state.DgmlView);
32-
if (state.IsInitialState)
32+
if (_builder.GetStateInfo(state.Id).IsInitial)
3333
{
3434
writer.WriteLine(" <Category Ref=\"InitialState\" />");
3535
}
@@ -143,16 +143,17 @@ public override void SaveDGML(TextWriter writer, int maxLabelLength)
143143
foreach (DfaMatchingState<TSet> source in builder._stateCache)
144144
{
145145
// Get the span of entries in delta that gives the transitions for the different minterms
146-
Span<DfaMatchingState<TSet>?> deltas = builder.GetDeltasFor(source);
146+
Span<int> deltas = builder.GetDeltasFor(source);
147147
Span<int[]?> nfaDeltas = builder.GetNfaDeltasFor(source);
148148
Debug.Assert(deltas.Length == builder._minterms.Length);
149149
for (int i = 0; i < deltas.Length; ++i)
150150
{
151-
// null entries are transitions not explored yet, so skip them
152-
if (deltas[i] is DfaMatchingState<TSet> target)
151+
// negative entries are transitions not explored yet, so skip them
152+
int targetId = deltas[i];
153+
if (targetId >= 0)
153154
{
154155
// Get or create the data for this (source,destination) state ID pair
155-
(int Source, int Target) key = (source.Id, target.Id);
156+
(int Source, int Target) key = (source.Id, targetId);
156157
if (!result.TryGetValue(key, out (TSet Rule, List<int> NfaTargets) entry))
157158
{
158159
entry = (builder._solver.Empty, new List<int>());

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/SymbolicRegexMatcher.Sample.cs

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -83,25 +83,25 @@ public override IEnumerable<string> SampleMatches(int k, int randomseed)
8383
{
8484
// Unconditionally final state or end of the input due to \Z anchor for example
8585
if (NfaStateHandler.IsNullable(ref statesWrapper) ||
86-
NfaStateHandler.IsNullable(ref statesWrapper, CharKind.BeginningEnd))
86+
NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.BeginningEnd))
8787
{
8888
possibleEndings.Add("");
8989
}
9090

9191
// End of line due to end-of-line anchor
92-
if (NfaStateHandler.IsNullable(ref statesWrapper, CharKind.Newline))
92+
if (NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.Newline))
9393
{
9494
possibleEndings.Add("\n");
9595
}
9696

9797
// Related to wordborder due to \b or \B
98-
if (NfaStateHandler.IsNullable(ref statesWrapper, CharKind.WordLetter))
98+
if (NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.WordLetter))
9999
{
100100
possibleEndings.Add(ChooseChar(random, asciiWordCharacters, ascii, charSetSolver).ToString());
101101
}
102102

103103
// Related to wordborder due to \b or \B
104-
if (NfaStateHandler.IsNullable(ref statesWrapper, CharKind.General))
104+
if (NfaStateHandler.IsNullableFor(_builder, ref statesWrapper, CharKind.General))
105105
{
106106
possibleEndings.Add(ChooseChar(random, asciiNonWordCharacters, ascii, charSetSolver).ToString());
107107
}
@@ -125,7 +125,7 @@ public override IEnumerable<string> SampleMatches(int k, int randomseed)
125125
}
126126

127127
// Shuffle the minterms, including the last end-of-line marker if appropriate
128-
int[] mintermIds = NfaStateHandler.StartsWithLineAnchor(ref statesWrapper) ?
128+
int[] mintermIds = NfaStateHandler.StartsWithLineAnchor(_builder, ref statesWrapper) ?
129129
Shuffle(random, mintermIdsWithZ) :
130130
Shuffle(random, mintermIdsWithoutZ);
131131
foreach (int mintermId in mintermIds)

0 commit comments

Comments
 (0)