5858 strategy :
5959 fail-fast : false
6060 matrix :
61- runner : [linux-mi300 -gpu-1]
61+ runner : [linux-mi325 -gpu-1]
6262 runs-on : ${{matrix.runner}}
6363 steps :
6464 - name : Checkout code
9595 strategy :
9696 fail-fast : false
9797 matrix :
98- runner : [linux-mi300 -gpu-1]
98+ runner : [linux-mi325 -gpu-1]
9999 runs-on : ${{matrix.runner}}
100100 steps :
101101 - name : Checkout code
@@ -125,7 +125,7 @@ jobs:
125125 strategy :
126126 fail-fast : false
127127 matrix :
128- runner : [linux-mi300 -gpu-1]
128+ runner : [linux-mi325 -gpu-1]
129129 part : [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
130130 runs-on : ${{matrix.runner}}
131131 steps :
@@ -155,7 +155,7 @@ jobs:
155155 strategy :
156156 fail-fast : false
157157 matrix :
158- runner : [linux-mi300 -gpu-2]
158+ runner : [linux-mi325 -gpu-2]
159159 part : [0, 1]
160160 runs-on : ${{matrix.runner}}
161161 steps :
@@ -180,15 +180,16 @@ jobs:
180180
181181 unit-test-backend-8-gpu-amd :
182182 needs : [check-changes, unit-test-backend-2-gpu-amd]
183- if : always() && !failure() && !cancelled() &&
183+ # Temporarily disabled via the leading `false &&` in the condition below - remove it to re-enable
184+ if : false && always() && !failure() && !cancelled() &&
184185 ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
185186 env :
186187 RUNNER_LABELS : linux-mi300-gpu-8
187188 strategy :
188189 fail-fast : false
189190 matrix :
190191 runner : [linux-mi300-gpu-8]
191- part : [0, 1, 2 ]
192+ part : [0, 1]
192193 runs-on : ${{matrix.runner}}
193194 steps :
194195 - name : Checkout code
@@ -205,10 +206,16 @@ jobs:
205206 - name : Install dependencies
206207 run : bash scripts/ci/amd_ci_install_dependency.sh
207208
209+ - name : Test RCCL multi-GPU communication
210+ timeout-minutes : 5
211+ run : |
212+ echo "Testing RCCL multi-GPU communication with debug info..."
213+ docker exec ci_sglang bash -c "cd /sglang-checkout && NCCL_DEBUG=INFO RCCL_DEBUG=INFO torchrun --nproc_per_node=8 scripts/ci/test_rccl_multi_gpu.py"
214+
208215 - name : Run test
209216 timeout-minutes : 60
210217 run : |
211- bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 3600
218+ bash scripts/ci/amd_ci_exec.sh python3 run_suite.py --suite per-commit-8-gpu-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 3600
212219
213220 performance-test-1-gpu-part-1-amd :
214221 needs : [check-changes, stage-a-test-1-amd]
@@ -217,7 +224,7 @@ jobs:
217224 strategy :
218225 fail-fast : false
219226 matrix :
220- runner : [linux-mi300 -gpu-1]
227+ runner : [linux-mi325 -gpu-1]
221228 runs-on : ${{matrix.runner}}
222229 steps :
223230 - name : Checkout code
@@ -262,7 +269,7 @@ jobs:
262269 strategy :
263270 fail-fast : false
264271 matrix :
265- runner : [linux-mi300 -gpu-1]
272+ runner : [linux-mi325 -gpu-1]
266273 runs-on : ${{matrix.runner}}
267274 steps :
268275 - name : Checkout code
@@ -301,7 +308,7 @@ jobs:
301308 strategy :
302309 fail-fast : false
303310 matrix :
304- runner : [linux-mi300 -gpu-2]
311+ runner : [linux-mi325 -gpu-2]
305312 runs-on : ${{matrix.runner}}
306313 steps :
307314 - name : Checkout code
@@ -350,7 +357,7 @@ jobs:
350357 strategy :
351358 fail-fast : false
352359 matrix :
353- runner : [linux-mi300 -gpu-1]
360+ runner : [linux-mi325 -gpu-1]
354361 runs-on : ${{matrix.runner}}
355362 steps :
356363 - name : Checkout code
@@ -381,7 +388,7 @@ jobs:
381388 strategy :
382389 fail-fast : false
383390 matrix :
384- runner : [linux-mi300 -gpu-2]
391+ runner : [linux-mi325 -gpu-2]
385392 runs-on : ${{matrix.runner}}
386393 steps :
387394 - name : Checkout code
@@ -401,7 +408,7 @@ jobs:
401408 - name : Evaluate accuracy (TP=2)
402409 timeout-minutes : 30
403410 run : |
404- bash scripts/ci/amd_ci_exec.sh -e SGLANG_USE_AITER=0 python3 test_moe_eval_accuracy_large.py
411+ bash scripts/ci/amd_ci_exec.sh -e SGLANG_USE_AITER=0 -e HF_HUB_ENABLE_HF_TRANSFER=0 python3 test_moe_eval_accuracy_large.py
405412
406413 pr-test-amd-finish :
407414 needs :
@@ -414,7 +421,7 @@ jobs:
414421 stage-a-test-1-amd,
415422 unit-test-backend-1-gpu-amd,
416423 unit-test-backend-2-gpu-amd,
417- unit-test-backend-8-gpu-amd,
424+ # unit-test-backend-8-gpu-amd, # Temporarily disabled
418425 performance-test-1-gpu-part-1-amd,
419426 performance-test-1-gpu-part-2-amd,
420427 performance-test-2-gpu-amd,
0 commit comments