@@ -18,7 +18,7 @@ internal static class IndexOfAnyAsciiSearcher
1818 public struct AsciiState ( Vector128 < byte > bitmap , BitVector256 lookup )
1919 {
2020 public Vector512 < byte > Bitmap512 = Vector512 . Create ( bitmap ) ;
21- public BitVector256 Lookup = lookup ;
21+ public readonly BitVector256 Lookup = lookup ;
2222
2323 [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
2424 public readonly Vector128 < byte > Bitmap128 ( ) => Bitmap512 . _lower . _lower ;
@@ -30,19 +30,31 @@ public readonly AsciiState CreateInverse() =>
3030 new AsciiState ( ~ Bitmap128 ( ) , Lookup . CreateInverse ( ) ) ;
3131 }
3232
33- public struct AsciiWithSecondSetState ( Vector128 < byte > asciiBitmap , ushort offset , Vector128 < byte > secondBitmap , ProbabilisticMapState lookup )
33+ public readonly struct AsciiWithSecondSetState ( Vector128 < byte > asciiBitmap , ushort offset , Vector128 < byte > secondBitmap , ProbabilisticMapState lookup )
3434 {
35- public ushort Offset = offset ;
36- public Vector256 < byte > AsciiBitmap = Vector256 . Create ( asciiBitmap , asciiBitmap ) ;
37- public Vector256 < byte > SecondBitmap = Vector256 . Create ( secondBitmap , secondBitmap ) ;
38- public ProbabilisticMapState Lookup = lookup ; // Only used for single-character checks.
35+ public readonly ushort Offset = offset ;
36+ public readonly Vector512 < byte > AsciiBitmap512 = Vector512 . Create ( asciiBitmap ) ;
37+ public readonly Vector512 < byte > SecondBitmap512 = Vector512 . Create ( secondBitmap ) ;
38+ public readonly ProbabilisticMapState Lookup = lookup ; // Only used for single-character checks.
39+
40+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
41+ public readonly Vector128 < byte > AsciiBitmap128 ( ) => AsciiBitmap512 . _lower . _lower ;
42+
43+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
44+ public readonly Vector128 < byte > SecondBitmap128 ( ) => SecondBitmap512 . _lower . _lower ;
45+
46+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
47+ public readonly Vector256 < byte > AsciiBitmap256 ( ) => AsciiBitmap512 . _lower ;
48+
49+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
50+ public readonly Vector256 < byte > SecondBitmap256 ( ) => SecondBitmap512 . _lower ;
3951 }
4052
41- public struct AnyByteState ( Vector128 < byte > bitmap0 , Vector128 < byte > bitmap1 , BitVector256 lookup )
53+ public readonly struct AnyByteState ( Vector128 < byte > bitmap0 , Vector128 < byte > bitmap1 , BitVector256 lookup )
4254 {
43- public Vector512 < byte > Bitmap0_512 = Vector512 . Create ( bitmap0 ) ;
44- public Vector512 < byte > Bitmap1_512 = Vector512 . Create ( bitmap1 ) ;
45- public BitVector256 Lookup = lookup ;
55+ public readonly Vector512 < byte > Bitmap0_512 = Vector512 . Create ( bitmap0 ) ;
56+ public readonly Vector512 < byte > Bitmap1_512 = Vector512 . Create ( bitmap1 ) ;
57+ public readonly BitVector256 Lookup = lookup ;
4658
4759 [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
4860 public readonly Vector128 < byte > Bitmap0_128 ( ) => Bitmap0_512 . _lower . _lower ;
@@ -715,11 +727,72 @@ private static TResult IndexOfAnyCore<TResult, TNegator, TOptimizations, TResult
715727 if ( Avx2 . IsSupported && searchSpaceLength > 2 * Vector128 < short > . Count )
716728#pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
717729 {
718- Vector256 < byte > asciiBitmap256 = state . AsciiBitmap ;
719- Vector256 < byte > secondBitmap256 = state . SecondBitmap ;
730+ #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx512BW.IsSupported is false
731+ if ( Vector512 . IsHardwareAccelerated && Avx512BW . IsSupported && searchSpaceLength > 2 * Vector256 < short > . Count )
732+ #pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
733+ {
734+ Vector512 < byte > asciiBitmap512 = state . AsciiBitmap512 ;
735+ Vector512 < byte > secondBitmap512 = state . SecondBitmap512 ;
736+ Vector512 < ushort > offset512 = Vector512 . Create ( state . Offset ) ;
737+
738+ if ( searchSpaceLength > 2 * Vector512 < short > . Count )
739+ {
740+ // Process the input in chunks of 64 characters (2 * Vector512<short>).
741+ // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector512<byte>.
742+ // As packing two Vector512<short>s into a Vector512<byte> is cheap compared to the lookup, we can effectively double the throughput.
743+ // If the input length is a multiple of 64, don't consume the last 64 characters in this loop.
744+ // Let the fallback below handle it instead. This is why the condition is
745+ // ">" instead of ">=" above, and why "IsAddressLessThan" is used instead of "!IsAddressGreaterThan".
746+ ref short twoVectorsAwayFromEnd = ref Unsafe . Add ( ref searchSpace , searchSpaceLength - ( 2 * Vector512 < short > . Count ) ) ;
747+
748+ do
749+ {
750+ Vector512 < short > source0 = Vector512 . LoadUnsafe ( ref currentSearchSpace ) ;
751+ Vector512 < short > source1 = Vector512 . LoadUnsafe ( ref currentSearchSpace , ( nuint ) Vector512 < short > . Count ) ;
752+
753+ Vector512 < byte > result = IndexOfAnyLookup < TNegator , TOptimizations > ( source0 , source1 , asciiBitmap512 , secondBitmap512 , offset512 ) ;
754+ if ( result != Vector512 < byte > . Zero )
755+ {
756+ return TResultMapper . FirstIndex < TNegator > ( ref searchSpace , ref currentSearchSpace , result ) ;
757+ }
758+
759+ currentSearchSpace = ref Unsafe . Add ( ref currentSearchSpace , 2 * Vector512 < short > . Count ) ;
760+ }
761+ while ( Unsafe . IsAddressLessThan ( ref currentSearchSpace , ref twoVectorsAwayFromEnd ) ) ;
762+ }
763+
764+ // We have 1-64 characters remaining. Process the first and last vector in the search space.
765+ // They may overlap, but we'll handle that in the index calculation if we do get a match.
766+ Debug . Assert ( searchSpaceLength >= Vector512 < short > . Count , "We expect that the input is long enough for us to load a whole vector." ) ;
767+ {
768+ ref short oneVectorAwayFromEnd = ref Unsafe . Add ( ref searchSpace , searchSpaceLength - Vector512 < short > . Count ) ;
769+
770+ ref short firstVector = ref Unsafe . IsAddressGreaterThan ( ref currentSearchSpace , ref oneVectorAwayFromEnd )
771+ ? ref oneVectorAwayFromEnd
772+ : ref currentSearchSpace ;
773+
774+ Vector512 < short > source0 = Vector512 . LoadUnsafe ( ref firstVector ) ;
775+ Vector512 < short > source1 = Vector512 . LoadUnsafe ( ref oneVectorAwayFromEnd ) ;
776+
777+ Vector512 < byte > result = IndexOfAnyLookup < TNegator , TOptimizations > ( source0 , source1 , asciiBitmap512 , secondBitmap512 , offset512 ) ;
778+ if ( result != Vector512 < byte > . Zero )
779+ {
780+ return TResultMapper . FirstIndexOverlapped < TNegator > ( ref searchSpace , ref firstVector , ref oneVectorAwayFromEnd , result ) ;
781+ }
782+ }
783+
784+ return TResultMapper . NotFound ;
785+ }
786+
787+ Vector256 < byte > asciiBitmap256 = state . AsciiBitmap256 ( ) ;
788+ Vector256 < byte > secondBitmap256 = state . SecondBitmap256 ( ) ;
720789 Vector256 < ushort > offset256 = Vector256 . Create ( state . Offset ) ;
721790
722- if ( searchSpaceLength > 2 * Vector256 < short > . Count )
791+ #pragma warning disable IntrinsicsInSystemPrivateCoreLibConditionParsing // A negated IsSupported condition isn't parseable by the intrinsics analyzer
792+ #pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx512BW.IsSupported is false
793+ if ( ! ( Vector512 . IsHardwareAccelerated && Avx512BW . IsSupported ) && searchSpaceLength > 2 * Vector256 < short > . Count )
794+ #pragma warning restore IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
795+ #pragma warning restore IntrinsicsInSystemPrivateCoreLibConditionParsing
723796 {
724797 // Process the input in chunks of 32 characters (2 * Vector256<short>).
725798 // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector256<byte>.
@@ -768,8 +841,8 @@ private static TResult IndexOfAnyCore<TResult, TNegator, TOptimizations, TResult
768841 return TResultMapper . NotFound ;
769842 }
770843
771- Vector128 < byte > asciiBitmap = state . AsciiBitmap . _lower ;
772- Vector128 < byte > secondBitmap = state . SecondBitmap . _lower ;
844+ Vector128 < byte > asciiBitmap = state . AsciiBitmap128 ( ) ;
845+ Vector128 < byte > secondBitmap = state . SecondBitmap128 ( ) ;
773846 Vector128 < ushort > offset = Vector128 . Create ( state . Offset ) ;
774847
775848#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough // The behavior of the rest of the function remains the same if Avx2.IsSupported is false
@@ -850,11 +923,68 @@ public static int LastIndexOfAny<TNegator, TOptimizations>(ref short searchSpace
850923 if ( Avx2 . IsSupported && searchSpaceLength > 2 * Vector128 < short > . Count )
851924#pragma warning disable IntrinsicsInSystemPrivateCoreLibAttributeNotSpecificEnough
852925 {
853- Vector256 < byte > asciiBitmap256 = state . AsciiBitmap ;
854- Vector256 < byte > secondBitmap256 = state . SecondBitmap ;
926+ if ( Vector512 . IsHardwareAccelerated && Avx512BW . IsSupported && searchSpaceLength > 2 * Vector256 < short > . Count )
927+ {
928+ Vector512 < byte > asciiBitmap512 = state . AsciiBitmap512 ;
929+ Vector512 < byte > secondBitmap512 = state . SecondBitmap512 ;
930+ Vector512 < ushort > offset512 = Vector512 . Create ( state . Offset ) ;
931+
932+ if ( searchSpaceLength > 2 * Vector512 < short > . Count )
933+ {
934+ // Process the input in chunks of 64 characters (2 * Vector512<short>).
935+ // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector512<byte>.
936+ // As packing two Vector512<short>s into a Vector512<byte> is cheap compared to the lookup, we can effectively double the throughput.
937+ // If the input length is a multiple of 64, don't consume the last 64 characters in this loop.
938+ // Let the fallback below handle it instead. This is why the condition is
939+ // ">" instead of ">=" above, and why "IsAddressGreaterThan" is used instead of "!IsAddressLessThan".
940+ ref short twoVectorsAfterStart = ref Unsafe . Add ( ref searchSpace , 2 * Vector512 < short > . Count ) ;
941+
942+ do
943+ {
944+ currentSearchSpace = ref Unsafe . Subtract ( ref currentSearchSpace , 2 * Vector512 < short > . Count ) ;
945+
946+ Vector512 < short > source0 = Vector512 . LoadUnsafe ( ref currentSearchSpace ) ;
947+ Vector512 < short > source1 = Vector512 . LoadUnsafe ( ref currentSearchSpace , ( nuint ) Vector512 < short > . Count ) ;
948+
949+ Vector512 < byte > result = IndexOfAnyLookup < TNegator , TOptimizations > ( source0 , source1 , asciiBitmap512 , secondBitmap512 , offset512 ) ;
950+ if ( result != Vector512 < byte > . Zero )
951+ {
952+ return ComputeLastIndex < short , TNegator > ( ref searchSpace , ref currentSearchSpace , result ) ;
953+ }
954+ }
955+ while ( Unsafe . IsAddressGreaterThan ( ref currentSearchSpace , ref twoVectorsAfterStart ) ) ;
956+ }
957+
958+ // We have 1-64 characters remaining. Process the first and last vector in the search space.
959+ // They may overlap, but we'll handle that in the index calculation if we do get a match.
960+ Debug . Assert ( searchSpaceLength >= Vector512 < short > . Count , "We expect that the input is long enough for us to load a whole vector." ) ;
961+ {
962+ ref short oneVectorAfterStart = ref Unsafe . Add ( ref searchSpace , Vector512 < short > . Count ) ;
963+
964+ ref short secondVector = ref Unsafe . IsAddressGreaterThan ( ref currentSearchSpace , ref oneVectorAfterStart )
965+ ? ref Unsafe . Subtract ( ref currentSearchSpace , Vector512 < short > . Count )
966+ : ref searchSpace ;
967+
968+ Vector512 < short > source0 = Vector512 . LoadUnsafe ( ref searchSpace ) ;
969+ Vector512 < short > source1 = Vector512 . LoadUnsafe ( ref secondVector ) ;
970+
971+ Vector512 < byte > result = IndexOfAnyLookup < TNegator , TOptimizations > ( source0 , source1 , asciiBitmap512 , secondBitmap512 , offset512 ) ;
972+ if ( result != Vector512 < byte > . Zero )
973+ {
974+ return ComputeLastIndexOverlapped < short , TNegator > ( ref searchSpace , ref secondVector , result ) ;
975+ }
976+ }
977+
978+ return - 1 ;
979+ }
980+
981+ Vector256 < byte > asciiBitmap256 = state . AsciiBitmap256 ( ) ;
982+ Vector256 < byte > secondBitmap256 = state . SecondBitmap256 ( ) ;
855983 Vector256 < ushort > offset256 = Vector256 . Create ( state . Offset ) ;
856984
857- if ( searchSpaceLength > 2 * Vector256 < short > . Count )
985+ #pragma warning disable IntrinsicsInSystemPrivateCoreLibConditionParsing // A negated IsSupported condition isn't parseable by the intrinsics analyzer
986+ if ( ! ( Vector512 . IsHardwareAccelerated && Avx512BW . IsSupported ) && searchSpaceLength > 2 * Vector256 < short > . Count )
987+ #pragma warning restore IntrinsicsInSystemPrivateCoreLibConditionParsing
858988 {
859989 // Process the input in chunks of 32 characters (2 * Vector256<short>).
860990 // We're mainly interested in a single byte of each character, and the core lookup operates on a Vector256<byte>.
@@ -903,8 +1033,8 @@ public static int LastIndexOfAny<TNegator, TOptimizations>(ref short searchSpace
9031033 return - 1 ;
9041034 }
9051035
906- Vector128 < byte > asciiBitmap = state . AsciiBitmap . _lower ;
907- Vector128 < byte > secondBitmap = state . SecondBitmap . _lower ;
1036+ Vector128 < byte > asciiBitmap = state . AsciiBitmap128 ( ) ;
1037+ Vector128 < byte > secondBitmap = state . SecondBitmap128 ( ) ;
9081038 Vector128 < ushort > offset = Vector128 . Create ( state . Offset ) ;
9091039
9101040 if ( ! Avx2 . IsSupported && searchSpaceLength > 2 * Vector128 < short > . Count )
@@ -1845,6 +1975,23 @@ private static Vector512<byte> IndexOfAnyLookup<TNegator, TOptimizations>(Vector
18451975 return TNegator . NegateIfNeeded ( result ) ;
18461976 }
18471977
1978+ [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
1979+ [ CompExactlyDependsOn ( typeof ( Avx512BW ) ) ]
1980+ private static Vector512 < byte > IndexOfAnyLookup < TNegator , TOptimizations > ( Vector512 < short > source0 , Vector512 < short > source1 , Vector512 < byte > bitmapLookup0 , Vector512 < byte > bitmapLookup1 , Vector512 < ushort > offset )
1981+ where TNegator : struct , INegator
1982+ where TOptimizations : struct , IOptimizations
1983+ {
1984+ Debug . Assert ( ( bitmapLookup1 [ 0 ] & 1 ) == 0 , "The 0th bit in second bitmap shouldn't be set." ) ;
1985+
1986+ Vector512 < byte > packed0 = TOptimizations . PackSources ( source0 . AsUInt16 ( ) , source1 . AsUInt16 ( ) ) ;
1987+ Vector512 < byte > packed1 = Default . PackSources ( source0 . AsUInt16 ( ) - offset , source1 . AsUInt16 ( ) - offset ) ;
1988+
1989+ Vector512 < byte > result0 = IndexOfAnyLookupCore ( packed0 , bitmapLookup0 ) ;
1990+ Vector512 < byte > result1 = IndexOfAnyLookupCore ( packed1 , bitmapLookup1 ) ;
1991+
1992+ return TNegator . NegateIfNeeded ( result0 | result1 ) ;
1993+ }
1994+
18481995 [ MethodImpl ( MethodImplOptions . AggressiveInlining ) ]
18491996 [ CompExactlyDependsOn ( typeof ( Avx512BW ) ) ]
18501997 private static Vector512 < byte > IndexOfAnyLookupCore ( Vector512 < byte > source , Vector512 < byte > bitmapLookup )