1313)
1414from ray ._private .grpc_utils import init_grpc_channel
1515from ray ._private .state_api_test_utils import verify_failed_task
16- from ray ._private .test_utils import raw_metrics
16+ from ray ._private .test_utils import PrometheusTimeseries , raw_metric_timeseries
1717from ray ._private .utils import get_used_memory
1818from ray .util .state .state_manager import StateDataSourceClient
1919
@@ -118,8 +118,10 @@ def get_additional_bytes_to_reach_memory_usage_pct(pct: float) -> int:
118118 return bytes_needed
119119
120120
121- def has_metric_tagged_with_value (addr , tag , value ) -> bool :
122- metrics = raw_metrics (addr )
121+ def has_metric_tagged_with_value (
122+ addr , tag , value , timeseries : PrometheusTimeseries
123+ ) -> bool :
124+ metrics = raw_metric_timeseries (addr , timeseries )
123125 for name , samples in metrics .items ():
124126 for sample in samples :
125127 if tag in set (sample .labels .values ()) and sample .value == value :
@@ -145,13 +147,15 @@ def test_restartable_actor_throws_oom_error(ray_with_memory_monitor, restartable
145147 with pytest .raises (ray .exceptions .OutOfMemoryError ):
146148 ray .get (leaker .allocate .remote (bytes_to_alloc , memory_monitor_refresh_ms * 3 ))
147149
150+ timeseries = PrometheusTimeseries ()
148151 wait_for_condition (
149152 has_metric_tagged_with_value ,
150153 timeout = 10 ,
151154 retry_interval_ms = 100 ,
152155 addr = addr ,
153156 tag = "MemoryManager.ActorEviction.Total" ,
154157 value = 2.0 if restartable else 1.0 ,
158+ timeseries = timeseries ,
155159 )
156160
157161 wait_for_condition (
@@ -161,6 +165,7 @@ def test_restartable_actor_throws_oom_error(ray_with_memory_monitor, restartable
161165 addr = addr ,
162166 tag = "Leaker.__init__" ,
163167 value = 2.0 if restartable else 1.0 ,
168+ timeseries = timeseries ,
164169 )
165170
166171
@@ -180,13 +185,15 @@ def test_restartable_actor_oom_retry_off_throws_oom_error(
180185 with pytest .raises (ray .exceptions .OutOfMemoryError ) as _ :
181186 ray .get (leaker .allocate .remote (bytes_to_alloc , memory_monitor_refresh_ms * 3 ))
182187
188+ timeseries = PrometheusTimeseries ()
183189 wait_for_condition (
184190 has_metric_tagged_with_value ,
185191 timeout = 10 ,
186192 retry_interval_ms = 100 ,
187193 addr = addr ,
188194 tag = "MemoryManager.ActorEviction.Total" ,
189195 value = 2.0 ,
196+ timeseries = timeseries ,
190197 )
191198 wait_for_condition (
192199 has_metric_tagged_with_value ,
@@ -195,6 +202,7 @@ def test_restartable_actor_oom_retry_off_throws_oom_error(
195202 addr = addr ,
196203 tag = "Leaker.__init__" ,
197204 value = 2.0 ,
205+ timeseries = timeseries ,
198206 )
199207
200208
@@ -210,13 +218,15 @@ def test_non_retryable_task_killed_by_memory_monitor_with_oom_error(
210218 with pytest .raises (ray .exceptions .OutOfMemoryError ) as _ :
211219 ray .get (allocate_memory .options (max_retries = 0 ).remote (bytes_to_alloc ))
212220
221+ timeseries = PrometheusTimeseries ()
213222 wait_for_condition (
214223 has_metric_tagged_with_value ,
215224 timeout = 10 ,
216225 retry_interval_ms = 100 ,
217226 addr = addr ,
218227 tag = "MemoryManager.TaskEviction.Total" ,
219228 value = 1.0 ,
229+ timeseries = timeseries ,
220230 )
221231 wait_for_condition (
222232 has_metric_tagged_with_value ,
@@ -225,6 +235,7 @@ def test_non_retryable_task_killed_by_memory_monitor_with_oom_error(
225235 addr = addr ,
226236 tag = "allocate_memory" ,
227237 value = 1.0 ,
238+ timeseries = timeseries ,
228239 )
229240
230241
@@ -372,13 +383,15 @@ def test_task_oom_no_oom_retry_fails_immediately(
372383 )
373384 )
374385
386+ timeseries = PrometheusTimeseries ()
375387 wait_for_condition (
376388 has_metric_tagged_with_value ,
377389 timeout = 10 ,
378390 retry_interval_ms = 100 ,
379391 addr = addr ,
380392 tag = "MemoryManager.TaskEviction.Total" ,
381393 value = 1.0 ,
394+ timeseries = timeseries ,
382395 )
383396 wait_for_condition (
384397 has_metric_tagged_with_value ,
@@ -387,6 +400,7 @@ def test_task_oom_no_oom_retry_fails_immediately(
387400 addr = addr ,
388401 tag = "allocate_memory" ,
389402 value = 1.0 ,
403+ timeseries = timeseries ,
390404 )
391405
392406
@@ -411,13 +425,15 @@ def test_task_oom_only_uses_oom_retry(
411425 )
412426 )
413427
428+ timeseries = PrometheusTimeseries ()
414429 wait_for_condition (
415430 has_metric_tagged_with_value ,
416431 timeout = 10 ,
417432 retry_interval_ms = 100 ,
418433 addr = addr ,
419434 tag = "MemoryManager.TaskEviction.Total" ,
420435 value = task_oom_retries + 1 ,
436+ timeseries = timeseries ,
421437 )
422438 wait_for_condition (
423439 has_metric_tagged_with_value ,
@@ -426,6 +442,7 @@ def test_task_oom_only_uses_oom_retry(
426442 addr = addr ,
427443 tag = "allocate_memory" ,
428444 value = task_oom_retries + 1 ,
445+ timeseries = timeseries ,
429446 )
430447
431448
@@ -502,6 +519,7 @@ def infinite_retry_task():
502519 time .sleep (5 )
503520
504521 with ray .init () as addr :
522+ timeseries = PrometheusTimeseries ()
505523 with pytest .raises (ray .exceptions .OutOfMemoryError ) as _ :
506524 ray .get (infinite_retry_task .remote ())
507525
@@ -512,6 +530,7 @@ def infinite_retry_task():
512530 addr = addr ,
513531 tag = "MemoryManager.TaskEviction.Total" ,
514532 value = 1.0 ,
533+ timeseries = timeseries ,
515534 )
516535
517536
0 commit comments