From 61c1d0843593be15d2044fb83efd1f6f0258b9f8 Mon Sep 17 00:00:00 2001 From: Michael Noukhovitch Date: Mon, 13 Apr 2026 16:18:04 -0400 Subject: [PATCH 1/3] Reduce repeated low disk space alerts --- open_instruct/test_utils.py | 28 ++++++++++++++++++++++++++++ open_instruct/utils.py | 10 +++++++++- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/open_instruct/test_utils.py b/open_instruct/test_utils.py index 8aa7249284..d88d4963e8 100644 --- a/open_instruct/test_utils.py +++ b/open_instruct/test_utils.py @@ -311,6 +311,9 @@ def test_send_slack_message_with_beaker_url(self, mock_environ_get, mock_get_bea class TestWarnIfLowDiskSpace(unittest.TestCase): + def setUp(self): + utils._WARNED_LOW_DISK_SPACE_PATHS.clear() + @parameterized.expand( [ ("gcs", "gs://bucket/path"), @@ -340,6 +343,15 @@ def test_warning_above_threshold(self, mock_disk_usage): mock_warning.assert_called_once() self.assertIn("90.0%", mock_warning.call_args[0][0]) + @mock.patch("shutil.disk_usage") + def test_warning_only_logged_once_per_path(self, mock_disk_usage): + mock_disk_usage.return_value = mock.Mock(total=100 * 1024**3, used=90 * 1024**3, free=10 * 1024**3) + with mock.patch.object(utils.logger, "warning") as mock_warning: + utils.warn_if_low_disk_space("/tmp/test", threshold=0.85) + utils.warn_if_low_disk_space("/tmp/test", threshold=0.85) + utils.warn_if_low_disk_space("/tmp/../tmp/test", threshold=0.85) + mock_warning.assert_called_once() + @responses.activate @mock.patch("shutil.disk_usage") @mock.patch("open_instruct.utils.get_beaker_experiment_url") @@ -357,6 +369,22 @@ def test_slack_alert_sent_when_enabled(self, mock_environ_get, mock_get_beaker_u request_body = json.loads(responses.calls[0].request.body) self.assertIn("Disk usage near capacity", request_body["text"]) + @responses.activate + @mock.patch("shutil.disk_usage") + @mock.patch("open_instruct.utils.get_beaker_experiment_url") + @mock.patch("os.environ.get") + def test_slack_alert_sent_once_per_path(self, mock_environ_get, mock_get_beaker_url, mock_disk_usage): + webhook_url = "https://hooks.slack.com/services/test" + mock_environ_get.return_value = webhook_url + mock_get_beaker_url.return_value = None + mock_disk_usage.return_value = mock.Mock(total=100 * 1024**3, used=90 * 1024**3, free=10 * 1024**3) + responses.add(responses.POST, webhook_url, json={"ok": True}, status=200) + + utils.warn_if_low_disk_space("/tmp/test", send_slack_alerts=True) + utils.warn_if_low_disk_space("/tmp/test", send_slack_alerts=True) + + self.assertEqual(len(responses.calls), 1) + @mock.patch("shutil.disk_usage") def test_zero_total_disk_space_returns_early(self, mock_disk_usage): mock_disk_usage.return_value = mock.Mock(total=0, used=0, free=0) diff --git a/open_instruct/utils.py b/open_instruct/utils.py index 98e7ebab26..9dc263fd77 100644 --- a/open_instruct/utils.py +++ b/open_instruct/utils.py @@ -81,6 +81,8 @@ DISK_USAGE_WARNING_THRESHOLD = 0.85 CLOUD_PATH_PREFIXES = ("gs://", "s3://", "az://", "hdfs://", "/filestore") INVALID_LOGPROB = 1.0 # Sentinel value for masked/invalid log probabilities +_WARNED_LOW_DISK_SPACE_PATHS: set[str] = set() +_WARNED_LOW_DISK_SPACE_PATHS_LOCK = threading.Lock() logger = logger_utils.setup_logger(__name__) @@ -125,13 +127,19 @@ def warn_if_low_disk_space( if usage.total == 0: return + normalized_path = os.path.abspath(path) used_ratio = usage.used / usage.total if used_ratio >= threshold: + with _WARNED_LOW_DISK_SPACE_PATHS_LOCK: + if normalized_path in _WARNED_LOW_DISK_SPACE_PATHS: + return + _WARNED_LOW_DISK_SPACE_PATHS.add(normalized_path) + used_percent = used_ratio * 100 free_gib = usage.free / (1024**3) total_gib = usage.total / (1024**3) warning_message = ( - f"Disk usage near capacity for {path}: {used_percent:.1f}% used " + f"Disk usage near capacity for {normalized_path}: {used_percent:.1f}% used " f"({free_gib:.1f} GiB free of {total_gib:.1f} GiB). Checkpointing may fail." ) logger.warning(warning_message) From 96c9990a42bcdab5bc4176f7154470eb6b8176e2 Mon Sep 17 00:00:00 2001 From: Michael Noukhovitch Date: Mon, 13 Apr 2026 16:18:51 -0400 Subject: [PATCH 2/3] Add changelog entry for low disk space alert fix --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e01e2e1d8..2be8222498 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,7 @@ All notable changes to this project will be documented in this file. - Add deprecation warning to `finetune.py` pointing users to the OLMo-core SFT implementation (https://github.com/allenai/open-instruct/pull/1574). ### Fixed +- Reduce repeated low disk space send-alert warnings so each run only alerts once per checkpoint path (https://github.com/allenai/open-instruct/pull/1608). - Fix `Batch.__getitem__` handling of `active_tools` for int and list indexing (https://github.com/allenai/open-instruct/pull/1592). - Fix `RepeatPhraseChecker.check_following` to validate all matched phrases differ by exactly one word and return a proper boolean instead of `None` (https://github.com/allenai/open-instruct/pull/1044). - Fix incorrect hardcoded checkpoint state path for multi-GPU DeepSpeed resumption (https://github.com/allenai/open-instruct/pull/1589). From 994235802bd726c2e052290e7c46d5953dbbe8ff Mon Sep 17 00:00:00 2001 From: Michael Noukhovitch Date: Mon, 13 Apr 2026 16:23:04 -0400 Subject: [PATCH 3/3] Warn about checkpoint disk space only once per run --- open_instruct/grpo_fast.py | 5 ++++- open_instruct/test_utils.py | 28 ---------------------------- open_instruct/utils.py | 10 +--------- 3 files changed, 5 insertions(+), 38 deletions(-) diff --git a/open_instruct/grpo_fast.py b/open_instruct/grpo_fast.py index 012e1f2507..39f0f5ad09 100644 --- a/open_instruct/grpo_fast.py +++ b/open_instruct/grpo_fast.py @@ -1994,6 +1994,7 @@ def health_check_fn(): wandb_url=wandb_url, ) last_eval_collected = True + has_warned_about_checkpoint_disk_space = False for training_step in range(resume_training_step, args.num_training_steps + 1): start_time = time.perf_counter() @@ -2062,7 +2063,9 @@ def health_check_fn(): and training_step % args.checkpoint_state_freq == 0 and args.checkpoint_state_dir is not None ): - utils.warn_if_low_disk_space(args.checkpoint_state_dir, send_slack_alerts=args.send_slack_alerts) + if not has_warned_about_checkpoint_disk_space: + utils.warn_if_low_disk_space(args.checkpoint_state_dir, send_slack_alerts=args.send_slack_alerts) + has_warned_about_checkpoint_disk_space = True with Timer("[Main Thread] 🗡️ Saving checkpoint state"): # Save comprehensive client state including dataloader state client_state = { diff --git a/open_instruct/test_utils.py b/open_instruct/test_utils.py index d88d4963e8..8aa7249284 100644 --- a/open_instruct/test_utils.py +++ b/open_instruct/test_utils.py @@ -311,9 +311,6 @@ def test_send_slack_message_with_beaker_url(self, mock_environ_get, mock_get_bea class TestWarnIfLowDiskSpace(unittest.TestCase): - def setUp(self): - utils._WARNED_LOW_DISK_SPACE_PATHS.clear() - @parameterized.expand( [ ("gcs", "gs://bucket/path"), @@ -343,15 +340,6 @@ def test_warning_above_threshold(self, mock_disk_usage): mock_warning.assert_called_once() self.assertIn("90.0%", mock_warning.call_args[0][0]) - @mock.patch("shutil.disk_usage") - def test_warning_only_logged_once_per_path(self, mock_disk_usage): - mock_disk_usage.return_value = mock.Mock(total=100 * 1024**3, used=90 * 1024**3, free=10 * 1024**3) - with mock.patch.object(utils.logger, "warning") as mock_warning: - utils.warn_if_low_disk_space("/tmp/test", threshold=0.85) - utils.warn_if_low_disk_space("/tmp/test", threshold=0.85) - utils.warn_if_low_disk_space("/tmp/../tmp/test", threshold=0.85) - mock_warning.assert_called_once() - @responses.activate @mock.patch("shutil.disk_usage") @mock.patch("open_instruct.utils.get_beaker_experiment_url") @@ -369,22 +357,6 @@ def test_slack_alert_sent_when_enabled(self, mock_environ_get, mock_get_beaker_u request_body = json.loads(responses.calls[0].request.body) self.assertIn("Disk usage near capacity", request_body["text"]) - @responses.activate - @mock.patch("shutil.disk_usage") - @mock.patch("open_instruct.utils.get_beaker_experiment_url") - @mock.patch("os.environ.get") - def test_slack_alert_sent_once_per_path(self, mock_environ_get, mock_get_beaker_url, mock_disk_usage): - webhook_url = "https://hooks.slack.com/services/test" - mock_environ_get.return_value = webhook_url - mock_get_beaker_url.return_value = None - mock_disk_usage.return_value = mock.Mock(total=100 * 1024**3, used=90 * 1024**3, free=10 * 1024**3) - responses.add(responses.POST, webhook_url, json={"ok": True}, status=200) - - utils.warn_if_low_disk_space("/tmp/test", send_slack_alerts=True) - utils.warn_if_low_disk_space("/tmp/test", send_slack_alerts=True) - - self.assertEqual(len(responses.calls), 1) - @mock.patch("shutil.disk_usage") def test_zero_total_disk_space_returns_early(self, mock_disk_usage): mock_disk_usage.return_value = mock.Mock(total=0, used=0, free=0) diff --git a/open_instruct/utils.py b/open_instruct/utils.py index 9dc263fd77..98e7ebab26 100644 --- a/open_instruct/utils.py +++ b/open_instruct/utils.py @@ -81,8 +81,6 @@ DISK_USAGE_WARNING_THRESHOLD = 0.85 CLOUD_PATH_PREFIXES = ("gs://", "s3://", "az://", "hdfs://", "/filestore") INVALID_LOGPROB = 1.0 # Sentinel value for masked/invalid log probabilities -_WARNED_LOW_DISK_SPACE_PATHS: set[str] = set() -_WARNED_LOW_DISK_SPACE_PATHS_LOCK = threading.Lock() logger = logger_utils.setup_logger(__name__) @@ -127,19 +125,13 @@ def warn_if_low_disk_space( if usage.total == 0: return - normalized_path = os.path.abspath(path) used_ratio = usage.used / usage.total if used_ratio >= threshold: - with _WARNED_LOW_DISK_SPACE_PATHS_LOCK: - if normalized_path in _WARNED_LOW_DISK_SPACE_PATHS: - return - _WARNED_LOW_DISK_SPACE_PATHS.add(normalized_path) - used_percent = used_ratio * 100 free_gib = usage.free / (1024**3) total_gib = usage.total / (1024**3) warning_message = ( - f"Disk usage near capacity for {normalized_path}: {used_percent:.1f}% used " + f"Disk usage near capacity for {path}: {used_percent:.1f}% used " f"({free_gib:.1f} GiB free of {total_gib:.1f} GiB). Checkpointing may fail." ) logger.warning(warning_message)