Skip to content

Commit f092a04

Browse files
committed
Add restore level safeguards to prevent file cache oversubscription
Signed-off-by: Kunal Kotwani <kkotwani@amazon.com>
1 parent 78537a2 commit f092a04

12 files changed

Lines changed: 282 additions & 19 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
7676
### Added
7777
- Add server version as REST response header [#6583](https://github.com/opensearch-project/OpenSearch/issues/6583)
7878
- Start replication checkpointTimers on primary before segments upload to remote store. ([#8221]()https://github.com/opensearch-project/OpenSearch/pull/8221)
79+
- Add configuration for file cache size to max remote data ratio to prevent oversubscription of file cache ([#8606]()https://github.com/opensearch-project/OpenSearch/pull/8606)
7980

8081
### Dependencies
8182
- Bump `org.apache.logging.log4j:log4j-core` from 2.17.1 to 2.20.0 ([#8307](https://github.com/opensearch-project/OpenSearch/pull/8307))

server/src/main/java/org/opensearch/action/admin/cluster/snapshots/restore/RestoreSnapshotRequest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -517,7 +517,7 @@ public String snapshotUuid() {
517517
/**
518518
* Sets the storage type for this request.
519519
*/
520-
RestoreSnapshotRequest storageType(StorageType storageType) {
520+
public RestoreSnapshotRequest storageType(StorageType storageType) {
521521
this.storageType = storageType;
522522
return this;
523523
}

server/src/main/java/org/opensearch/cluster/routing/RoutingTable.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,16 @@ public ShardsIterator allShardsIncludingRelocationTargets(String[] indices) {
295295
return allShardsSatisfyingPredicate(indices, shardRouting -> true, true);
296296
}
297297

298+
/**
299+
* All the shards on the node which match the predicate
300+
* @param predicate condition to match
301+
* @return iterator over shards matching the predicate
302+
*/
303+
public ShardsIterator allShardsSatisfyingPredicate(Predicate<ShardRouting> predicate) {
304+
String[] indices = indicesRouting.keySet().toArray(new String[0]);
305+
return allShardsSatisfyingPredicate(indices, predicate, false);
306+
}
307+
298308
private ShardsIterator allShardsSatisfyingPredicate(
299309
String[] indices,
300310
Predicate<ShardRouting> predicate,

server/src/main/java/org/opensearch/cluster/routing/allocation/decider/DiskThresholdDecider.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@
6868
import static org.opensearch.cluster.routing.RoutingPool.getShardPool;
6969
import static org.opensearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING;
7070
import static org.opensearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING;
71-
import static org.opensearch.index.store.remote.filecache.FileCache.DATA_TO_FILE_CACHE_SIZE_RATIO;
71+
import static org.opensearch.index.store.remote.filecache.FileCache.DATA_TO_FILE_CACHE_SIZE_RATIO_SETTING;
7272

7373
/**
7474
* The {@link DiskThresholdDecider} checks that the node a shard is potentially
@@ -199,8 +199,8 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing
199199
final FileCacheStats fileCacheStats = clusterInfo.getNodeFileCacheStats().getOrDefault(node.nodeId(), null);
200200
final long nodeCacheSize = fileCacheStats != null ? fileCacheStats.getTotal().getBytes() : 0;
201201
final long totalNodeRemoteShardSize = currentNodeRemoteShardSize + shardSize;
202-
203-
if (totalNodeRemoteShardSize > DATA_TO_FILE_CACHE_SIZE_RATIO * nodeCacheSize) {
202+
final int dataToFileCacheSizeRatio = DATA_TO_FILE_CACHE_SIZE_RATIO_SETTING.get(allocation.metadata().settings());
203+
if (dataToFileCacheSizeRatio > 0 && totalNodeRemoteShardSize > dataToFileCacheSizeRatio * nodeCacheSize) {
204204
return allocation.decision(
205205
Decision.NO,
206206
NAME,

server/src/main/java/org/opensearch/index/store/remote/filecache/FileCache.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import org.apache.lucene.store.IndexInput;
1212
import org.opensearch.common.breaker.CircuitBreaker;
1313
import org.opensearch.common.breaker.CircuitBreakingException;
14+
import org.opensearch.common.settings.Setting;
1415
import org.opensearch.index.store.remote.utils.cache.CacheUsage;
1516
import org.opensearch.index.store.remote.utils.cache.RefCountedCache;
1617
import org.opensearch.index.store.remote.utils.cache.SegmentedCache;
@@ -49,8 +50,12 @@ public class FileCache implements RefCountedCache<Path, CachedIndexInput> {
4950

5051
private final CircuitBreaker circuitBreaker;
5152

52-
// TODO: Convert the constant into an integer setting
53-
public static final int DATA_TO_FILE_CACHE_SIZE_RATIO = 5;
53+
public static final Setting<Integer> DATA_TO_FILE_CACHE_SIZE_RATIO_SETTING = Setting.intSetting(
54+
"cluster.filecache.remote_data_ratio",
55+
0,
56+
0,
57+
Setting.Property.NodeScope
58+
);
5459

5560
public FileCache(SegmentedCache<Path, CachedIndexInput> cache, CircuitBreaker circuitBreaker) {
5661
this.theCache = cache;

server/src/main/java/org/opensearch/node/Node.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -941,8 +941,9 @@ protected Node(
941941
clusterModule.getAllocationService(),
942942
metadataCreateIndexService,
943943
metadataIndexUpgradeService,
944-
clusterService.getClusterSettings(),
945-
shardLimitValidator
944+
shardLimitValidator,
945+
indicesService,
946+
clusterInfoService
946947
);
947948

948949
final DiskThresholdMonitor diskThresholdMonitor = new DiskThresholdMonitor(

server/src/main/java/org/opensearch/snapshots/RestoreService.java

Lines changed: 68 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@
4141
import org.opensearch.action.admin.cluster.snapshots.restore.RestoreSnapshotRequest;
4242
import org.opensearch.action.support.IndicesOptions;
4343
import org.opensearch.cluster.ClusterChangedEvent;
44+
import org.opensearch.cluster.ClusterInfo;
45+
import org.opensearch.cluster.ClusterInfoService;
4446
import org.opensearch.cluster.ClusterState;
4547
import org.opensearch.cluster.ClusterStateApplier;
4648
import org.opensearch.cluster.ClusterStateTaskConfig;
@@ -68,6 +70,7 @@
6870
import org.opensearch.cluster.routing.RoutingChangesObserver;
6971
import org.opensearch.cluster.routing.RoutingTable;
7072
import org.opensearch.cluster.routing.ShardRouting;
73+
import org.opensearch.cluster.routing.ShardsIterator;
7174
import org.opensearch.cluster.routing.UnassignedInfo;
7275
import org.opensearch.cluster.routing.allocation.AllocationService;
7376
import org.opensearch.cluster.service.ClusterManagerTaskKeys;
@@ -86,6 +89,9 @@
8689
import org.opensearch.index.IndexSettings;
8790
import org.opensearch.index.shard.IndexShard;
8891
import org.opensearch.core.index.shard.ShardId;
92+
import org.opensearch.index.snapshots.IndexShardSnapshotStatus;
93+
import org.opensearch.index.store.remote.filecache.FileCacheStats;
94+
import org.opensearch.indices.IndicesService;
8995
import org.opensearch.indices.ShardLimitValidator;
9096
import org.opensearch.repositories.IndexId;
9197
import org.opensearch.repositories.RepositoriesService;
@@ -118,6 +124,7 @@
118124
import static org.opensearch.common.util.FeatureFlags.SEARCHABLE_SNAPSHOT_EXTENDED_COMPATIBILITY;
119125
import static org.opensearch.common.util.set.Sets.newHashSet;
120126
import static org.opensearch.index.store.remote.directory.RemoteSnapshotDirectory.SEARCHABLE_SNAPSHOT_EXTENDED_COMPATIBILITY_MINIMUM_VERSION;
127+
import static org.opensearch.index.store.remote.filecache.FileCache.DATA_TO_FILE_CACHE_SIZE_RATIO_SETTING;
121128
import static org.opensearch.snapshots.SnapshotUtils.filterIndices;
122129

123130
/**
@@ -176,6 +183,10 @@ public class RestoreService implements ClusterStateApplier {
176183

177184
private final ClusterSettings clusterSettings;
178185

186+
private final IndicesService indicesService;
187+
188+
private final ClusterInfoService clusterInfoService;
189+
179190
private final ClusterManagerTaskThrottler.ThrottlingKey restoreSnapshotTaskKey;
180191

181192
private static final CleanRestoreStateTaskExecutor cleanRestoreStateTaskExecutor = new CleanRestoreStateTaskExecutor();
@@ -186,8 +197,9 @@ public RestoreService(
186197
AllocationService allocationService,
187198
MetadataCreateIndexService createIndexService,
188199
MetadataIndexUpgradeService metadataIndexUpgradeService,
189-
ClusterSettings clusterSettings,
190-
ShardLimitValidator shardLimitValidator
200+
ShardLimitValidator shardLimitValidator,
201+
IndicesService indicesService,
202+
ClusterInfoService clusterInfoService
191203
) {
192204
this.clusterService = clusterService;
193205
this.repositoriesService = repositoriesService;
@@ -199,6 +211,8 @@ public RestoreService(
199211
}
200212
this.clusterSettings = clusterService.getClusterSettings();
201213
this.shardLimitValidator = shardLimitValidator;
214+
this.indicesService = indicesService;
215+
this.clusterInfoService = clusterInfoService;
202216

203217
// Task is onboarded for throttling, it will get retried from associated TransportClusterManagerNodeAction.
204218
restoreSnapshotTaskKey = clusterService.registerClusterManagerTask(ClusterManagerTaskKeys.RESTORE_SNAPSHOT_KEY, true);
@@ -401,7 +415,6 @@ public ClusterManagerTaskThrottler.ThrottlingKey getClusterManagerThrottlingKey(
401415

402416
@Override
403417
public ClusterState execute(ClusterState currentState) {
404-
RestoreInProgress restoreInProgress = currentState.custom(RestoreInProgress.TYPE, RestoreInProgress.EMPTY);
405418
// Check if the snapshot to restore is currently being deleted
406419
SnapshotDeletionsInProgress deletionsInProgress = currentState.custom(
407420
SnapshotDeletionsInProgress.TYPE,
@@ -423,6 +436,7 @@ public ClusterState execute(ClusterState currentState) {
423436
RoutingTable.Builder rtBuilder = RoutingTable.builder(currentState.routingTable());
424437
final Map<ShardId, RestoreInProgress.ShardRestoreStatus> shards;
425438
Set<String> aliases = new HashSet<>();
439+
long totalRestorableRemoteIndexesSize = 0;
426440

427441
if (indices.isEmpty() == false) {
428442
// We have some indices to restore
@@ -433,17 +447,14 @@ public ClusterState execute(ClusterState currentState) {
433447
String index = indexEntry.getValue();
434448
boolean partial = checkPartial(index);
435449

450+
IndexId snapshotIndexId = repositoryData.resolveIndexId(index);
436451
IndexMetadata snapshotIndexMetadata = updateIndexSettings(
437452
metadata.index(index),
438453
request.indexSettings(),
439454
request.ignoreIndexSettings()
440455
);
441456
if (IndexModule.Type.REMOTE_SNAPSHOT.match(request.storageType().toString())) {
442-
snapshotIndexMetadata = addSnapshotToIndexSettings(
443-
snapshotIndexMetadata,
444-
snapshot,
445-
repositoryData.resolveIndexId(index)
446-
);
457+
snapshotIndexMetadata = addSnapshotToIndexSettings(snapshotIndexMetadata, snapshot, snapshotIndexId);
447458
}
448459
final boolean isSearchableSnapshot = IndexModule.Type.REMOTE_SNAPSHOT.match(
449460
snapshotIndexMetadata.getSettings().get(IndexModule.INDEX_STORE_TYPE_SETTING.getKey())
@@ -469,7 +480,7 @@ public ClusterState execute(ClusterState currentState) {
469480
restoreUUID,
470481
snapshot,
471482
snapshotInfo.version(),
472-
repositoryData.resolveIndexId(index),
483+
snapshotIndexId,
473484
isSearchableSnapshot,
474485
isRemoteStoreShallowCopy,
475486
request.getSourceRemoteStoreRepository()
@@ -588,6 +599,14 @@ public ClusterState execute(ClusterState currentState) {
588599
}
589600

590601
for (int shard = 0; shard < snapshotIndexMetadata.getNumberOfShards(); shard++) {
602+
if (IndexModule.Type.REMOTE_SNAPSHOT.match(request.storageType().toString())) {
603+
IndexShardSnapshotStatus.Copy shardStatus = repository.getShardSnapshotStatus(
604+
snapshotInfo.snapshotId(),
605+
snapshotIndexId,
606+
new ShardId(metadata.index(index).getIndex(), shard)
607+
).asCopy();
608+
totalRestorableRemoteIndexesSize += shardStatus.getTotalSize();
609+
}
591610
if (!ignoreShards.contains(shard)) {
592611
shardsBuilder.put(
593612
new ShardId(renamedIndex, shard),
@@ -624,6 +643,9 @@ public ClusterState execute(ClusterState currentState) {
624643
}
625644

626645
checkAliasNameConflicts(indices, aliases);
646+
if (IndexModule.Type.REMOTE_SNAPSHOT.match(request.storageType().toString())) {
647+
validateSearchableSnapshotRestorable(totalRestorableRemoteIndexesSize);
648+
}
627649

628650
Map<String, DataStream> updatedDataStreams = new HashMap<>(currentState.metadata().dataStreams());
629651
updatedDataStreams.putAll(
@@ -823,6 +845,43 @@ private IndexMetadata updateIndexSettings(
823845
return builder.settings(settingsBuilder).build();
824846
}
825847

848+
private void validateSearchableSnapshotRestorable(long totalRestorableRemoteIndexesSize) {
849+
ClusterInfo clusterInfo = clusterInfoService.getClusterInfo();
850+
int remoteDataToFileCacheRatio = DATA_TO_FILE_CACHE_SIZE_RATIO_SETTING.get(clusterService.getSettings());
851+
Map<String, FileCacheStats> nodeFileCacheStats = clusterInfo.getNodeFileCacheStats();
852+
if (nodeFileCacheStats.isEmpty() || remoteDataToFileCacheRatio <= 0) {
853+
return;
854+
}
855+
856+
long totalNodeFileCacheSize = clusterInfo.getNodeFileCacheStats()
857+
.values()
858+
.stream()
859+
.map(fileCacheStats -> fileCacheStats.getTotal().getBytes())
860+
.mapToLong(Long::longValue)
861+
.sum();
862+
863+
Predicate<ShardRouting> shardRoutingPredicate = shardRouting -> shardRouting.primary()
864+
&& indicesService.indexService(shardRouting.index()).getIndexSettings().isRemoteSnapshot();
865+
866+
ShardsIterator shardsIterator = clusterService.state()
867+
.routingTable()
868+
.allShardsSatisfyingPredicate(shardRoutingPredicate);
869+
870+
long totalRestoredRemoteIndexesSize = shardsIterator.getShardRoutings()
871+
.stream()
872+
.map(clusterInfo::getShardSize)
873+
.mapToLong(Long::longValue)
874+
.sum();
875+
876+
if (totalRestoredRemoteIndexesSize + totalRestorableRemoteIndexesSize > remoteDataToFileCacheRatio
877+
* totalNodeFileCacheSize) {
878+
throw new SnapshotRestoreException(
879+
snapshot,
880+
"Size of the indexes to be restored exceed the file cache bounds. Increase the file cache capacity on the cluster."
881+
);
882+
}
883+
}
884+
826885
@Override
827886
public void onFailure(String source, Exception e) {
828887
logger.warn(() -> new ParameterizedMessage("[{}] failed to restore snapshot", snapshotId), e);

server/src/test/java/org/opensearch/action/admin/cluster/snapshots/restore/RestoreSnapshotRequestTests.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,10 @@ private RestoreSnapshotRequest randomState(RestoreSnapshotRequest instance) {
112112
instance.snapshotUuid(randomBoolean() ? null : randomAlphaOfLength(10));
113113
}
114114

115+
instance.storageType(
116+
randomBoolean() ? RestoreSnapshotRequest.StorageType.LOCAL : RestoreSnapshotRequest.StorageType.REMOTE_SNAPSHOT
117+
);
118+
115119
if (randomBoolean()) {
116120
instance.setSourceRemoteStoreRepository(randomAlphaOfLengthBetween(5, 10));
117121
}

server/src/test/java/org/opensearch/cluster/routing/RoutingTableTests.java

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -228,6 +228,49 @@ public void testShardsMatchingPredicateCount() {
228228
assertThat(clusterState.routingTable().shardsMatchingPredicateCount(predicate), is(2));
229229
}
230230

231+
public void testAllShardsMatchingPredicate() {
232+
MockAllocationService allocation = createAllocationService(Settings.EMPTY, new DelayedShardsMockGatewayAllocator());
233+
Metadata metadata = Metadata.builder()
234+
.put(IndexMetadata.builder("test1").settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(1))
235+
.put(IndexMetadata.builder("test2").settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(1))
236+
.build();
237+
ClusterState clusterState = ClusterState.builder(org.opensearch.cluster.ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
238+
.metadata(metadata)
239+
.routingTable(RoutingTable.builder().addAsNew(metadata.index("test1")).addAsNew(metadata.index("test2")).build())
240+
.build();
241+
clusterState = ClusterState.builder(clusterState)
242+
.nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2")))
243+
.build();
244+
clusterState = allocation.reroute(clusterState, "reroute");
245+
246+
Predicate<ShardRouting> predicate = s -> s.state() == ShardRoutingState.UNASSIGNED && s.unassignedInfo().isDelayed();
247+
assertThat(clusterState.routingTable().allShardsSatisfyingPredicate(predicate).size(), is(0));
248+
249+
// starting primaries
250+
clusterState = startInitializingShardsAndReroute(allocation, clusterState);
251+
// starting replicas
252+
clusterState = startInitializingShardsAndReroute(allocation, clusterState);
253+
// remove node2 and reroute
254+
clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder(clusterState.nodes()).remove("node2")).build();
255+
// make sure both replicas are marked as delayed (i.e. not reallocated)
256+
clusterState = allocation.disassociateDeadNodes(clusterState, true, "reroute");
257+
assertThat(clusterState.routingTable().allShardsSatisfyingPredicate(predicate).size(), is(2));
258+
259+
// Verifies true against all shards on the node (active/inactive)
260+
assertThat(clusterState.routingTable().allShardsSatisfyingPredicate(shard -> true).size(), is(4));
261+
// Verifies false against all shards on the node (active/inactive)
262+
assertThat(clusterState.routingTable().allShardsSatisfyingPredicate(shard -> false).size(), is(0));
263+
// Verifies against all primary shards on the node
264+
assertThat(clusterState.routingTable().allShardsSatisfyingPredicate(ShardRouting::primary).size(), is(2));
265+
// Verifies a predicate which tests for inactive replicas
266+
assertThat(
267+
clusterState.routingTable()
268+
.allShardsSatisfyingPredicate(shardRouting -> !shardRouting.primary() && !shardRouting.active())
269+
.size(),
270+
is(2)
271+
);
272+
}
273+
231274
public void testActivePrimaryShardsGrouped() {
232275
assertThat(this.emptyRoutingTable.activePrimaryShardsGrouped(new String[0], true).size(), is(0));
233276
assertThat(this.emptyRoutingTable.activePrimaryShardsGrouped(new String[0], false).size(), is(0));

0 commit comments

Comments
 (0)