Skip to content
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions .github/workflows/nightly-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ jobs:
timeout-minutes: 240
run: |
cd test/srt
python3 test_nightly_vlms_mmmu_eval.py
python3 test_nightly_vlms_mmmu_eval.py || python3 test_nightly_vlms_mmmu_eval.py || python3 test_nightly_vlms_mmmu_eval.py

nightly-test-perf-vlms:
if: github.repository == 'sgl-project/sglang'
Expand Down Expand Up @@ -125,7 +125,7 @@ jobs:
timeout-minutes: 60
run: |
cd test/srt
python3 run_suite.py --suite nightly-1-gpu
python3 run_suite.py --suite nightly-1-gpu --continue-on-error

nightly-test-4-gpu:
if: github.repository == 'sgl-project/sglang'
Expand All @@ -143,7 +143,7 @@ jobs:
timeout-minutes: 30
run: |
cd test/srt
python3 run_suite.py --suite nightly-4-gpu
python3 run_suite.py --suite nightly-4-gpu --continue-on-error

nightly-test-8-gpu-h200:
if: github.repository == 'sgl-project/sglang'
Expand All @@ -161,7 +161,7 @@ jobs:
timeout-minutes: 30
run: |
cd test/srt
python3 run_suite.py --suite nightly-8-gpu-h200
python3 run_suite.py --suite nightly-8-gpu-h200 --continue-on-error

nightly-test-8-gpu-h20:
if: github.repository == 'sgl-project/sglang'
Expand All @@ -181,7 +181,7 @@ jobs:
timeout-minutes: 30
run: |
cd test/srt
python3 run_suite.py --suite nightly-8-gpu-h20
python3 run_suite.py --suite nightly-8-gpu-h20 --continue-on-error

nightly-test-4-gpu-b200:
if: github.repository == 'sgl-project/sglang'
Expand All @@ -199,4 +199,4 @@ jobs:
timeout-minutes: 60
run: |
cd test/srt
python3 run_suite.py --suite nightly-4-gpu-b200
python3 run_suite.py --suite nightly-4-gpu-b200 --continue-on-error
53 changes: 47 additions & 6 deletions python/sglang/test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -735,9 +735,22 @@ class TestFile:
estimated_time: float = 60


def run_unittest_files(files: List[TestFile], timeout_per_file: float):
def run_unittest_files(
files: List[TestFile], timeout_per_file: float, continue_on_error: bool = False
):
"""
Run a list of test files.

Args:
files: List of TestFile objects to run
timeout_per_file: Timeout in seconds for each test file
continue_on_error: If True, continue running remaining tests even if one fails.
If False, stop at first failure (default behavior for PR tests).
"""
tic = time.perf_counter()
success = True
passed_tests = []
failed_tests = []

for i, file in enumerate(files):
filename, estimated_time = file.name, file.estimated_time
Expand Down Expand Up @@ -769,24 +782,52 @@ def run_one_file(filename):
ret_code = run_with_timeout(
run_one_file, args=(filename,), timeout=timeout_per_file
)
assert (
ret_code == 0
), f"expected return code 0, but {filename} returned {ret_code}"
if ret_code != 0:
print(
f"\n✗ FAILED: {filename} returned exit code {ret_code}\n",
flush=True,
)
success = False
failed_tests.append((filename, f"exit code {ret_code}"))
if not continue_on_error:
# Stop at first failure for PR tests
break
# Otherwise continue to next test for nightly tests
else:
passed_tests.append(filename)
except TimeoutError:
kill_process_tree(process.pid)
time.sleep(5)
print(
f"\nTimeout after {timeout_per_file} seconds when running {filename}\n",
f"\n✗ TIMEOUT: {filename} after {timeout_per_file} seconds\n",
flush=True,
)
success = False
break
failed_tests.append((filename, f"timeout after {timeout_per_file}s"))
if not continue_on_error:
# Stop at first timeout for PR tests
break
# Otherwise continue to next test for nightly tests
Comment on lines 769 to +810
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

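The failure handling for non-zero exit codes and for timeouts is nearly identical: both branches set `success = False`, append an entry to `failed_tests`, and break out of the loop unless `continue_on_error` is set. You can remove this duplication and improve readability by recording only a failure reason inside the `try`/`except` block and handling the failure once, after it: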

            failure_reason = None
            try:
                ret_code = run_with_timeout(
                    run_one_file, args=(filename,), timeout=timeout_per_file
                )
                if ret_code == 0:
                    passed_tests.append(filename)
                else:
                    print(
                        f"\n✗ FAILED: {filename} returned exit code {ret_code}\n",
                        flush=True,
                    )
                    failure_reason = f"exit code {ret_code}"
            except TimeoutError:
                kill_process_tree(process.pid)
                time.sleep(5)
                print(
                    f"\n✗ TIMEOUT: {filename} after {timeout_per_file} seconds\n",
                    flush=True,
                )
                failure_reason = f"timeout after {timeout_per_file}s"

            if failure_reason:
                success = False
                failed_tests.append((filename, failure_reason))
                if not continue_on_error:
                    break


if success:
print(f"Success. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)
else:
print(f"Fail. Time elapsed: {time.perf_counter() - tic:.2f}s", flush=True)

# Print summary
print(f"\n{'='*60}", flush=True)
print(f"Test Summary: {len(passed_tests)}/{len(files)} passed", flush=True)
print(f"{'='*60}", flush=True)
if passed_tests:
print("✓ PASSED:", flush=True)
for test in passed_tests:
print(f" {test}", flush=True)
if failed_tests:
print("\n✗ FAILED:", flush=True)
for test, reason in failed_tests:
print(f" {test} ({reason})", flush=True)
print(f"{'='*60}\n", flush=True)

return 0 if success else -1


Expand Down
8 changes: 7 additions & 1 deletion test/srt/run_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -645,6 +645,12 @@ def _sanity_check_suites(suites):
type=int,
help="Use auto load balancing. The number of parts.",
)
arg_parser.add_argument(
"--continue-on-error",
action="store_true",
default=False,
help="Continue running remaining tests even if one fails (useful for nightly tests)",
)
args = arg_parser.parse_args()
print(f"{args=}")

Expand All @@ -662,5 +668,5 @@ def _sanity_check_suites(suites):

print("The running tests are ", [f.name for f in files])

exit_code = run_unittest_files(files, args.timeout_per_file)
exit_code = run_unittest_files(files, args.timeout_per_file, args.continue_on_error)
exit(exit_code)
Loading