Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion .github/workflows/model_jobs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -138,10 +138,16 @@ jobs:
- name: Run all tests on GPU
working-directory: /transformers
run: |
PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports tests/${{ matrix.folders }}
script -q -c "PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports tests/${{ matrix.folders }}" test_outputs.txt
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know much about this command (Claude AI suggested it)

The 'script' command in Linux is a versatile tool that allows you to record all terminal activities, including inputs and outputs, making it a valuable resource for developers, system administrators, educators, and anyone who needs to document terminal sessions. This command captures everything displayed on your screen during the session, saving it in a file called a typescript.

reference

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This command will always succeed, so we need to check the exit code below and fail the step accordingly.

ls -la
# Extract the exit code from the output file
PYTEST_EXIT_CODE=$(tail -1 test_outputs.txt | grep "PYTEST_EXIT_CODE:" | cut -d: -f2)
exit ${PYTEST_EXIT_CODE:-1}

- name: Failure short reports
if: ${{ failure() }}
# This step is only to show information on Github Actions log.
# Always mark this step as successful, even if the report directory or the file `failures_short.txt` in it doesn't exist
continue-on-error: true
run: cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/failures_short.txt

Expand All @@ -151,6 +157,12 @@ jobs:
run: |
cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/captured_info.txt

- name: Copy test_outputs.txt
if: ${{ always() }}
continue-on-error: true
run: |
cp /transformers/test_outputs.txt /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports

- name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
Expand Down
12 changes: 11 additions & 1 deletion utils/notification_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,9 +158,11 @@ def __init__(
self.n_model_failures = (
self.n_model_single_gpu_failures + self.n_model_multi_gpu_failures + self.n_model_unknown_failures
)
self.n_model_jobs_errored_out = sum(r["error"] for r in model_results.values())

# Failures and success of the additional tests
self.n_additional_success = sum(r["success"] for r in additional_results.values())
self.n_additional_jobs_errored_out = sum(r["error"] for r in additional_results.values())

if len(additional_results) > 0:
# `dicts_to_sum` uses `dicts_to_sum` which requires a non empty dictionary. Let's just add an empty entry.
Expand All @@ -183,6 +185,7 @@ def __init__(
self.n_failures = self.n_model_failures + self.n_additional_failures
self.n_success = self.n_model_success + self.n_additional_success
self.n_tests = self.n_failures + self.n_success
self.n_jobs_errored_out = self.n_model_jobs_errored_out + self.n_additional_jobs_errored_out

self.model_results = model_results
self.additional_results = additional_results
Expand Down Expand Up @@ -241,6 +244,7 @@ def failures(self) -> dict:
"type": "plain_text",
"text": (
f"There were {self.n_failures} failures, out of {self.n_tests} tests.\n"
f"🚨 There were {self.n_jobs_errored_out} jobs errored out (not producing test output files).\n"
f"The suite ran in {self.time}."
),
"emoji": True,
Expand Down Expand Up @@ -561,7 +565,7 @@ def payload(self) -> str:
if self.ci_title:
blocks.append(self.ci_title_section)

if self.n_model_failures > 0 or self.n_additional_failures > 0:
if self.n_model_failures > 0 or self.n_additional_failures > 0 or self.n_jobs_errored_out > 0:
blocks.append(self.failures)

if self.n_model_failures > 0:
Expand Down Expand Up @@ -1194,6 +1198,7 @@ def pop_default(l: list[Any], i: int, default: Any) -> Any:
"success": 0,
"skipped": 0,
"time_spent": [],
"error": False,
"failures": {},
"job_link": {},
"captured_info": {},
Expand Down Expand Up @@ -1222,6 +1227,11 @@ def pop_default(l: list[Any], i: int, default: Any) -> Any:
continue

artifact = retrieve_artifact(path, artifact_gpu)

if "summary_short" not in artifact:
# The process might be killed (for example, by CPU OOM), or the job might be canceled for some reason, etc.
matrix_job_results[matrix_name]["error"] = True

if "stats" in artifact:
# Link to the GitHub Action job
job = artifact_name_to_job_map[path]
Expand Down