Skip to content

Commit 2debaed

Browse files
committed
Address review feedback: guard negative values, prevent resource leak
- Clamp CPU probe values to 0.0 when platform returns -1 (unavailable) - Clamp memory pool max to 0.0 when pool has no defined upper bound - Wrap gauge registration in try-catch to close already-created handles if construction fails partway through - Add CHANGELOG entry for PR #20844 - Add tests for negative CPU guard and constructor cleanup on failure Signed-off-by: Sam Akrah <sakrah@uber.com> Made-with: Cursor Signed-off-by: Sam Akrah <sakrah@uber.com> Made-with: Cursor
1 parent d0a1714 commit 2debaed

4 files changed

Lines changed: 433 additions & 134 deletions

File tree

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
3939
- Implement FieldMappingIngestionMessageMapper for pull-based ingestion ([#20729](https://github.com/opensearch-project/OpenSearch/pull/20729))
4040
- Added support of WarmerRefreshListener in NRTReplicationEngine to trigger warmer after replication on replica shards ([#20650](https://github.com/opensearch-project/OpenSearch/pull/20650))
4141
- WLM group custom search settings - groundwork and timeout ([#20536](https://github.com/opensearch-project/OpenSearch/issues/20536))
42+
- Expose JVM runtime metrics via telemetry framework ([#20844](https://github.com/opensearch-project/OpenSearch/pull/20844))
4243

4344
### Changed
4445
- Make telemetry `Tags` immutable ([#20788](https://github.com/opensearch-project/OpenSearch/pull/20788))

server/src/main/java/org/opensearch/monitor/NodeRuntimeMetrics.java

Lines changed: 195 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -100,18 +100,28 @@ public NodeRuntimeMetrics(MetricsRegistry registry, JvmService jvmService, Proce
100100
this(registry, jvmService, processProbe, osProbe, ManagementFactory.getThreadMXBean());
101101
}
102102

103-
NodeRuntimeMetrics(MetricsRegistry registry, JvmService jvmService, ProcessProbe processProbe, OsProbe osProbe,
104-
ThreadMXBean threadMXBean) {
103+
NodeRuntimeMetrics(
104+
MetricsRegistry registry,
105+
JvmService jvmService,
106+
ProcessProbe processProbe,
107+
OsProbe osProbe,
108+
ThreadMXBean threadMXBean
109+
) {
105110
this.jvmService = jvmService;
106111
this.threadMXBean = threadMXBean;
107112

108-
registerMemoryGauges(registry);
109-
registerGcGauges(registry);
110-
registerBufferPoolGauges(registry);
111-
registerThreadGauges(registry);
112-
registerClassGauges(registry);
113-
registerUptimeGauge(registry);
114-
registerCpuGauges(registry, processProbe, osProbe);
113+
try {
114+
registerMemoryGauges(registry);
115+
registerGcGauges(registry);
116+
registerBufferPoolGauges(registry);
117+
registerThreadGauges(registry);
118+
registerClassGauges(registry);
119+
registerUptimeGauge(registry);
120+
registerCpuGauges(registry, processProbe, osProbe);
121+
} catch (Exception e) {
122+
closeQuietly();
123+
throw e;
124+
}
115125

116126
logger.debug("Registered {} node runtime metric gauges", gaugeHandles.size());
117127
}
@@ -122,43 +132,62 @@ private void registerMemoryGauges(MetricsRegistry registry) {
122132
Tags heapTags = Tags.of(TAG_TYPE, "heap");
123133
Tags nonHeapTags = Tags.of(TAG_TYPE, "non_heap");
124134

125-
gaugeHandles.add(registry.createGauge(
126-
JVM_MEMORY_USED, "JVM heap memory used", UNIT_BYTES,
127-
() -> (double) jvmService.stats().getMem().getHeapUsed().getBytes(), heapTags
128-
));
129-
gaugeHandles.add(registry.createGauge(
130-
JVM_MEMORY_COMMITTED, "JVM heap memory committed", UNIT_BYTES,
131-
() -> (double) jvmService.stats().getMem().getHeapCommitted().getBytes(), heapTags
132-
));
133-
gaugeHandles.add(registry.createGauge(
134-
JVM_MEMORY_LIMIT, "JVM heap memory max", UNIT_BYTES,
135-
() -> (double) jvmService.stats().getMem().getHeapMax().getBytes(), heapTags
136-
));
137-
gaugeHandles.add(registry.createGauge(
138-
JVM_MEMORY_USED, "JVM non-heap memory used", UNIT_BYTES,
139-
() -> (double) jvmService.stats().getMem().getNonHeapUsed().getBytes(), nonHeapTags
140-
));
135+
gaugeHandles.add(
136+
registry.createGauge(
137+
JVM_MEMORY_USED,
138+
"JVM heap memory used",
139+
UNIT_BYTES,
140+
() -> (double) jvmService.stats().getMem().getHeapUsed().getBytes(),
141+
heapTags
142+
)
143+
);
144+
gaugeHandles.add(
145+
registry.createGauge(
146+
JVM_MEMORY_COMMITTED,
147+
"JVM heap memory committed",
148+
UNIT_BYTES,
149+
() -> (double) jvmService.stats().getMem().getHeapCommitted().getBytes(),
150+
heapTags
151+
)
152+
);
153+
gaugeHandles.add(
154+
registry.createGauge(
155+
JVM_MEMORY_LIMIT,
156+
"JVM heap memory max",
157+
UNIT_BYTES,
158+
() -> (double) jvmService.stats().getMem().getHeapMax().getBytes(),
159+
heapTags
160+
)
161+
);
162+
gaugeHandles.add(
163+
registry.createGauge(
164+
JVM_MEMORY_USED,
165+
"JVM non-heap memory used",
166+
UNIT_BYTES,
167+
() -> (double) jvmService.stats().getMem().getNonHeapUsed().getBytes(),
168+
nonHeapTags
169+
)
170+
);
141171

142172
// Per-pool gauges
143173
JvmStats stats = jvmService.stats();
144174
for (JvmStats.MemoryPool pool : stats.getMem()) {
145175
String poolName = pool.getName();
146176
Tags poolTags = Tags.of(TAG_POOL, poolName);
147-
gaugeHandles.add(registry.createGauge(
148-
JVM_MEMORY_USED, "JVM memory pool used", UNIT_BYTES,
149-
() -> poolBytes(poolName, true), poolTags
150-
));
151-
gaugeHandles.add(registry.createGauge(
152-
JVM_MEMORY_LIMIT, "JVM memory pool max", UNIT_BYTES,
153-
() -> poolBytes(poolName, false), poolTags
154-
));
177+
gaugeHandles.add(
178+
registry.createGauge(JVM_MEMORY_USED, "JVM memory pool used", UNIT_BYTES, () -> poolBytes(poolName, true), poolTags)
179+
);
180+
gaugeHandles.add(
181+
registry.createGauge(JVM_MEMORY_LIMIT, "JVM memory pool max", UNIT_BYTES, () -> poolBytes(poolName, false), poolTags)
182+
);
155183
}
156184
}
157185

158186
private double poolBytes(String poolName, boolean used) {
159187
for (JvmStats.MemoryPool pool : jvmService.stats().getMem()) {
160188
if (pool.getName().equals(poolName)) {
161-
return (double) (used ? pool.getUsed() : pool.getMax()).getBytes();
189+
long bytes = (used ? pool.getUsed() : pool.getMax()).getBytes();
190+
return bytes < 0 ? 0.0 : (double) bytes;
162191
}
163192
}
164193
return 0;
@@ -174,14 +203,18 @@ private void registerGcGauges(MetricsRegistry registry) {
174203
for (JvmStats.GarbageCollector gc : stats.getGc()) {
175204
String collectorName = gc.getName();
176205
Tags gcTags = Tags.of(TAG_GC, collectorName);
177-
gaugeHandles.add(registry.createGauge(
178-
JVM_GC_DURATION, "GC cumulative collection time", UNIT_SECONDS,
179-
() -> gcMetric(collectorName, false), gcTags
180-
));
181-
gaugeHandles.add(registry.createGauge(
182-
JVM_GC_COUNT, "GC collection count", UNIT_1,
183-
() -> gcMetric(collectorName, true), gcTags
184-
));
206+
gaugeHandles.add(
207+
registry.createGauge(
208+
JVM_GC_DURATION,
209+
"GC cumulative collection time",
210+
UNIT_SECONDS,
211+
() -> gcMetric(collectorName, false),
212+
gcTags
213+
)
214+
);
215+
gaugeHandles.add(
216+
registry.createGauge(JVM_GC_COUNT, "GC collection count", UNIT_1, () -> gcMetric(collectorName, true), gcTags)
217+
);
185218
}
186219
}
187220

@@ -201,31 +234,54 @@ private void registerBufferPoolGauges(MetricsRegistry registry) {
201234
for (JvmStats.BufferPool bp : stats.getBufferPools()) {
202235
String bpName = bp.getName();
203236
Tags bpTags = Tags.of(TAG_POOL, bpName);
204-
gaugeHandles.add(registry.createGauge(
205-
JVM_BUFFER_MEMORY_USED, "Buffer pool memory used", UNIT_BYTES,
206-
() -> bufferPoolMetric(bpName, BufferPoolField.USED), bpTags
207-
));
208-
gaugeHandles.add(registry.createGauge(
209-
JVM_BUFFER_MEMORY_LIMIT, "Buffer pool total capacity", UNIT_BYTES,
210-
() -> bufferPoolMetric(bpName, BufferPoolField.LIMIT), bpTags
211-
));
212-
gaugeHandles.add(registry.createGauge(
213-
JVM_BUFFER_COUNT, "Buffer pool buffer count", UNIT_1,
214-
() -> bufferPoolMetric(bpName, BufferPoolField.COUNT), bpTags
215-
));
237+
gaugeHandles.add(
238+
registry.createGauge(
239+
JVM_BUFFER_MEMORY_USED,
240+
"Buffer pool memory used",
241+
UNIT_BYTES,
242+
() -> bufferPoolMetric(bpName, BufferPoolField.USED),
243+
bpTags
244+
)
245+
);
246+
gaugeHandles.add(
247+
registry.createGauge(
248+
JVM_BUFFER_MEMORY_LIMIT,
249+
"Buffer pool total capacity",
250+
UNIT_BYTES,
251+
() -> bufferPoolMetric(bpName, BufferPoolField.LIMIT),
252+
bpTags
253+
)
254+
);
255+
gaugeHandles.add(
256+
registry.createGauge(
257+
JVM_BUFFER_COUNT,
258+
"Buffer pool buffer count",
259+
UNIT_1,
260+
() -> bufferPoolMetric(bpName, BufferPoolField.COUNT),
261+
bpTags
262+
)
263+
);
216264
}
217265
}
218266

219-
private enum BufferPoolField { USED, LIMIT, COUNT }
267+
private enum BufferPoolField {
268+
USED,
269+
LIMIT,
270+
COUNT
271+
}
220272

221273
private double bufferPoolMetric(String bpName, BufferPoolField field) {
222274
for (JvmStats.BufferPool bp : jvmService.stats().getBufferPools()) {
223275
if (bp.getName().equals(bpName)) {
224276
switch (field) {
225-
case USED: return (double) bp.getUsed().getBytes();
226-
case LIMIT: return (double) bp.getTotalCapacity().getBytes();
227-
case COUNT: return (double) bp.getCount();
228-
default: return 0;
277+
case USED:
278+
return (double) bp.getUsed().getBytes();
279+
case LIMIT:
280+
return (double) bp.getTotalCapacity().getBytes();
281+
case COUNT:
282+
return (double) bp.getCount();
283+
default:
284+
return 0;
229285
}
230286
}
231287
}
@@ -235,18 +291,28 @@ private double bufferPoolMetric(String bpName, BufferPoolField field) {
235291
// ---- Threads ----
236292

237293
private void registerThreadGauges(MetricsRegistry registry) {
238-
gaugeHandles.add(registry.createGauge(
239-
JVM_THREAD_COUNT, "JVM thread count", UNIT_1,
240-
() -> (double) jvmService.stats().getThreads().getCount(), Tags.EMPTY
241-
));
294+
gaugeHandles.add(
295+
registry.createGauge(
296+
JVM_THREAD_COUNT,
297+
"JVM thread count",
298+
UNIT_1,
299+
() -> (double) jvmService.stats().getThreads().getCount(),
300+
Tags.EMPTY
301+
)
302+
);
242303

243304
for (Thread.State state : Thread.State.values()) {
244305
String stateName = state.name().toLowerCase(Locale.ROOT);
245306
Tags stateTags = Tags.of(TAG_STATE, stateName);
246-
gaugeHandles.add(registry.createGauge(
247-
JVM_THREAD_COUNT, "JVM threads in this state", UNIT_1,
248-
() -> (double) getThreadStateCount(state), stateTags
249-
));
307+
gaugeHandles.add(
308+
registry.createGauge(
309+
JVM_THREAD_COUNT,
310+
"JVM threads in this state",
311+
UNIT_1,
312+
() -> (double) getThreadStateCount(state),
313+
stateTags
314+
)
315+
);
250316
}
251317
}
252318

@@ -283,47 +349,85 @@ private void refreshThreadStateCounts() {
283349
// ---- Classes ----
284350

285351
private void registerClassGauges(MetricsRegistry registry) {
286-
gaugeHandles.add(registry.createGauge(
287-
JVM_CLASS_COUNT, "Currently loaded class count", UNIT_1,
288-
() -> (double) jvmService.stats().getClasses().getLoadedClassCount(), Tags.EMPTY
289-
));
290-
gaugeHandles.add(registry.createGauge(
291-
JVM_CLASS_LOADED, "Total loaded class count since JVM start", UNIT_1,
292-
() -> (double) jvmService.stats().getClasses().getTotalLoadedClassCount(), Tags.EMPTY
293-
));
294-
gaugeHandles.add(registry.createGauge(
295-
JVM_CLASS_UNLOADED, "Total unloaded class count since JVM start", UNIT_1,
296-
() -> (double) jvmService.stats().getClasses().getUnloadedClassCount(), Tags.EMPTY
297-
));
352+
gaugeHandles.add(
353+
registry.createGauge(
354+
JVM_CLASS_COUNT,
355+
"Currently loaded class count",
356+
UNIT_1,
357+
() -> (double) jvmService.stats().getClasses().getLoadedClassCount(),
358+
Tags.EMPTY
359+
)
360+
);
361+
gaugeHandles.add(
362+
registry.createGauge(
363+
JVM_CLASS_LOADED,
364+
"Total loaded class count since JVM start",
365+
UNIT_1,
366+
() -> (double) jvmService.stats().getClasses().getTotalLoadedClassCount(),
367+
Tags.EMPTY
368+
)
369+
);
370+
gaugeHandles.add(
371+
registry.createGauge(
372+
JVM_CLASS_UNLOADED,
373+
"Total unloaded class count since JVM start",
374+
UNIT_1,
375+
() -> (double) jvmService.stats().getClasses().getUnloadedClassCount(),
376+
Tags.EMPTY
377+
)
378+
);
298379
}
299380

300381
// ---- Uptime ----
301382

302383
private void registerUptimeGauge(MetricsRegistry registry) {
303-
gaugeHandles.add(registry.createGauge(
304-
JVM_UPTIME, "JVM uptime", UNIT_SECONDS,
305-
() -> jvmService.stats().getUptime().getMillis() / 1000.0, Tags.EMPTY
306-
));
384+
gaugeHandles.add(
385+
registry.createGauge(
386+
JVM_UPTIME,
387+
"JVM uptime",
388+
UNIT_SECONDS,
389+
() -> jvmService.stats().getUptime().getMillis() / 1000.0,
390+
Tags.EMPTY
391+
)
392+
);
307393
}
308394

309395
// ---- CPU ----
310396

311397
private void registerCpuGauges(MetricsRegistry registry, ProcessProbe processProbe, OsProbe osProbe) {
312-
gaugeHandles.add(registry.createGauge(
313-
JVM_CPU_RECENT_UTILIZATION, "Recent JVM CPU utilization", UNIT_1,
314-
() -> processProbe.getProcessCpuPercent() / 100.0, Tags.EMPTY
315-
));
316-
gaugeHandles.add(registry.createGauge(
317-
JVM_SYSTEM_CPU_UTILIZATION, "System CPU utilization", UNIT_1,
318-
() -> osProbe.getSystemCpuPercent() / 100.0, Tags.EMPTY
319-
));
398+
gaugeHandles.add(
399+
registry.createGauge(
400+
JVM_CPU_RECENT_UTILIZATION,
401+
"Recent JVM CPU utilization",
402+
UNIT_1,
403+
() -> clampCpuPercent(processProbe.getProcessCpuPercent()),
404+
Tags.EMPTY
405+
)
406+
);
407+
gaugeHandles.add(
408+
registry.createGauge(
409+
JVM_SYSTEM_CPU_UTILIZATION,
410+
"System CPU utilization",
411+
UNIT_1,
412+
() -> clampCpuPercent(osProbe.getSystemCpuPercent()),
413+
Tags.EMPTY
414+
)
415+
);
416+
}
417+
418+
private static double clampCpuPercent(short pct) {
419+
return pct < 0 ? 0.0 : pct / 100.0;
320420
}
321421

322422
@Override
323423
public void close() throws IOException {
324424
if (closed.compareAndSet(false, true) == false) {
325425
return;
326426
}
427+
closeQuietly();
428+
}
429+
430+
private void closeQuietly() {
327431
for (Closeable handle : gaugeHandles) {
328432
try {
329433
handle.close();

server/src/main/java/org/opensearch/node/Node.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -200,12 +200,12 @@
200200
import org.opensearch.ingest.SystemIngestPipelineCache;
201201
import org.opensearch.monitor.MonitorService;
202202
import org.opensearch.monitor.NodeRuntimeMetrics;
203-
import org.opensearch.monitor.os.OsProbe;
204-
import org.opensearch.monitor.process.ProcessProbe;
205203
import org.opensearch.monitor.fs.FsHealthService;
206204
import org.opensearch.monitor.fs.FsProbe;
207205
import org.opensearch.monitor.fs.FsServiceProvider;
208206
import org.opensearch.monitor.jvm.JvmInfo;
207+
import org.opensearch.monitor.os.OsProbe;
208+
import org.opensearch.monitor.process.ProcessProbe;
209209
import org.opensearch.node.remotestore.RemoteStoreNodeService;
210210
import org.opensearch.node.remotestore.RemoteStorePinnedTimestampService;
211211
import org.opensearch.node.resource.tracker.NodeResourceUsageTracker;

0 commit comments

Comments
 (0)