Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),

### Changed
- Add lower limit for primary and replica batch allocators timeout ([#14979](https://github.com/opensearch-project/OpenSearch/pull/14979))
- Optimize regexp-based include/exclude on aggregations when pattern matches prefixes ([#14371](https://github.com/opensearch-project/OpenSearch/pull/14371))
- Replace and block usages of org.apache.logging.log4j.util.Strings ([#15238](https://github.com/opensearch-project/OpenSearch/pull/15238))

### Deprecated
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,11 @@
import org.opensearch.search.DocValueFormat;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.SortedSet;
Expand Down Expand Up @@ -86,6 +90,10 @@ public class IncludeExclude implements Writeable, ToXContentFragment {
* https://github.com/opensearch-project/OpenSearch/issues/2858
*/
private static final int DEFAULT_MAX_REGEX_LENGTH = 1000;
/**
* The maximum number of prefixes to extract from a regex in tryCreatePrefixOrdinalsFilter
*/
private static final int MAX_PREFIXES = 1000;

// for parsing purposes only
// TODO: move all aggs to the same package so that this stuff could be pkg-private
Expand Down Expand Up @@ -393,6 +401,92 @@ public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) thro

}

/**
* An ordinals filter that includes/excludes all ordinals corresponding to terms starting with the given prefixes
*/
static class PrefixBackedOrdinalsFilter extends OrdinalsFilter {

private final SortedSet<BytesRef> includePrefixes, excludePrefixes;

private PrefixBackedOrdinalsFilter(SortedSet<BytesRef> includePrefixes, SortedSet<BytesRef> excludePrefixes) {
this.includePrefixes = includePrefixes;
this.excludePrefixes = excludePrefixes;
}

private static BytesRef nextBytesRef(BytesRef bytesRef) {
BytesRef next = BytesRef.deepCopyOf(bytesRef);
int pos = next.offset + next.length - 1;
while (pos >= next.offset && next.bytes[pos] == -1) {
next.bytes[pos] = 0;
pos--;
}
if (pos >= next.offset) {
next.bytes[pos]++;
} else {
// Every byte in our prefix had value 0xFF. We must match all subsequent ordinals.
return null;
}
return next;
}

private interface LongBiConsumer {
void accept(long a, long b);
}

private static void process(SortedSetDocValues globalOrdinals, long length, SortedSet<BytesRef> prefixes, LongBiConsumer consumer)
throws IOException {
for (BytesRef prefix : prefixes) {
long startOrd = globalOrdinals.lookupTerm(prefix);
if (startOrd < 0) {
// The prefix is not an exact match in the ordinals (can skip equal length below)
startOrd = -1 - startOrd;
// Make sure that the term at startOrd starts with prefix
BytesRef startTerm = globalOrdinals.lookupOrd(startOrd);
if (startTerm.length <= prefix.length
|| !Arrays.equals(
startTerm.bytes,
startTerm.offset,
startTerm.offset + prefix.length,
prefix.bytes,
prefix.offset,
prefix.offset + prefix.length
)) {
continue;
}
}
if (startOrd >= length) {
continue;
}
BytesRef next = nextBytesRef(prefix);
if (next == null) {
consumer.accept(startOrd, length);
} else {
long endOrd = globalOrdinals.lookupTerm(next);
if (endOrd < 0) {
endOrd = -1 - endOrd;
}
if (startOrd < endOrd) {
consumer.accept(startOrd, endOrd);
}
}
}

}

@Override
public LongBitSet acceptedGlobalOrdinals(SortedSetDocValues globalOrdinals) throws IOException {
LongBitSet accept = new LongBitSet(globalOrdinals.getValueCount());
if (!includePrefixes.isEmpty()) {
process(globalOrdinals, accept.length(), includePrefixes, accept::set);
} else if (accept.length() > 0) {
// Exclude-only
accept.set(0, accept.length());
}
process(globalOrdinals, accept.length(), excludePrefixes, accept::clear);
return accept;
}
}

private final String include, exclude;
private final SortedSet<BytesRef> includeValues, excludeValues;
private final int incZeroBasedPartition;
Expand Down Expand Up @@ -709,8 +803,13 @@ public OrdinalsFilter convertToOrdinalsFilter(DocValueFormat format) {
}

public OrdinalsFilter convertToOrdinalsFilter(DocValueFormat format, int maxRegexLength) {

if (isRegexBased()) {
if ((include == null || include.endsWith(".*")) && (exclude == null || exclude.endsWith(".*"))) {
PrefixBackedOrdinalsFilter prefixBackedOrdinalsFilter = tryCreatePrefixOrdinalsFilter(maxRegexLength);
if (prefixBackedOrdinalsFilter != null) {
return prefixBackedOrdinalsFilter;
}
}
return new AutomatonBackedOrdinalsFilter(toAutomaton(maxRegexLength));
}
if (isPartitionBased()) {
Expand All @@ -720,6 +819,94 @@ public OrdinalsFilter convertToOrdinalsFilter(DocValueFormat format, int maxRege
return new TermListBackedOrdinalsFilter(parseForDocValues(includeValues, format), parseForDocValues(excludeValues, format));
}

private static List<String> expandRegexp(RegExp regExp, int maxPrefixes) {
switch (regExp.kind) {
case REGEXP_UNION:
List<RegExp> alternatives = new ArrayList<>();
while (regExp.exp1.kind == RegExp.Kind.REGEXP_UNION) {
alternatives.add(regExp.exp2);
regExp = regExp.exp1;
}
alternatives.add(regExp.exp2);
alternatives.add(regExp.exp1);
List<String> output = new ArrayList<>();
for (RegExp leaf : alternatives) {
List<String> leafExpansions = expandRegexp(leaf, maxPrefixes);
if (leafExpansions == null) {
return null;
} else {
if (output.size() + leafExpansions.size() > maxPrefixes) {
return null;
}
output.addAll(leafExpansions);
}
}
return output;
case REGEXP_CONCATENATION:
List<String> prefixes = expandRegexp(regExp.exp1, maxPrefixes);
if (prefixes == null) {
return null;
}
List<String> suffixes = expandRegexp(regExp.exp2, maxPrefixes);
if (suffixes == null) {
return null;
}
if (prefixes.size() * suffixes.size() > maxPrefixes) {
return null;
}
List<String> out = new ArrayList<>();
StringBuilder stringBuilder = new StringBuilder();
for (String prefix : prefixes) {
for (String suffix : suffixes) {
stringBuilder.setLength(0);
stringBuilder.append(prefix).append(suffix);
out.add(stringBuilder.toString());
}
}
return out;
case REGEXP_CHAR:
return List.of(Character.toString(regExp.c));
case REGEXP_STRING:
return List.of(regExp.s);
default:
return null;
}
}

static SortedSet<BytesRef> extractPrefixes(String pattern, int maxRegexLength) {
if (pattern == null) {
return Collections.emptySortedSet();
}
SortedSet<BytesRef> prefixSet = null;
validateRegExpStringLength(pattern, maxRegexLength);
RegExp regExp = new RegExp(pattern);
if (regExp.kind == RegExp.Kind.REGEXP_CONCATENATION && regExp.exp2.kind == RegExp.Kind.REGEXP_REPEAT) {
RegExp tail = regExp.exp2.exp1;
if (tail.kind == RegExp.Kind.REGEXP_ANYCHAR || tail.kind == RegExp.Kind.REGEXP_ANYSTRING) {
List<String> prefixes = expandRegexp(regExp.exp1, MAX_PREFIXES);
if (prefixes != null) {
prefixSet = new TreeSet<>();
for (String prefix : prefixes) {
prefixSet.add(new BytesRef(prefix));
}
}
}
}
return prefixSet;
}

private PrefixBackedOrdinalsFilter tryCreatePrefixOrdinalsFilter(int maxRegexLength) {
SortedSet<BytesRef> includeSet = extractPrefixes(include, maxRegexLength);
if (includeSet == null) {
return null;
}
SortedSet<BytesRef> excludeSet = extractPrefixes(exclude, maxRegexLength);
if (excludeSet == null) {
return null;
}
return new PrefixBackedOrdinalsFilter(includeSet, excludeSet);
}

public LongFilter convertToLongFilter(DocValueFormat format) {

if (isPartitionBased()) {
Expand Down
Loading