# Provenance: DeepSpeed PR #310 — "Skip sequence parallel operations during eval (#7821)"
# (GitHub web-UI paste chrome removed so this file parses as YAML.)
################################################################################
# DeepSpeed CI - AWS L40S GPU Tests (PyTorch Latest)
#
# Runs the same tests as modal-torch-latest.yml but on AWS self-hosted runners.
# Uses 4x NVIDIA L40S GPUs on g6e.12xlarge instances.
################################################################################
name: aws-torch-latest

# NOTE: `on` is a YAML 1.1 boolean-looking key; GitHub's loader handles it,
# but generic linters may need `truthy` suppressed for this line.
on:
  workflow_dispatch:
  push:
    branches:
      - master
  pull_request:
    branches:
      - master

concurrency:
  # One active run per workflow+ref; newer pushes cancel in-flight runs
  # so scarce L40S GPU runners are not tied up by stale commits.
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
jobs:
  # Gate: decide whether the GPU job needs to run at all, based on which
  # paths the push/PR touched (docs/blogs/inference-v2 changes are skipped).
  check-paths:
    name: Check Paths
    runs-on: ubuntu-latest
    outputs:
      should_run: ${{ steps.filter.outputs.run_tests }}
    steps:
      - uses: actions/checkout@v4
      - uses: dorny/paths-filter@v3
        id: filter
        with:
          filters: |
            run_tests:
              - '**'
              - '!docs/**'
              - '!blogs/**'
              - '!deepspeed/inference/v2/**'
              - '!tests/unit/inference/v2/**'

  # Main job: build DeepSpeed against a pinned torch/CUDA pair and run the
  # V1 unit-test suite on a 4x L40S self-hosted runner.
  unit-tests:
    name: Unit Tests (V1)
    needs: check-paths
    if: needs.check-paths.outputs.should_run == 'true'
    runs-on: [self-hosted, gpu-ci, gpu-l40s, l40s-4gpu, aws]
    timeout-minutes: 60
    container:
      image: nvidia/cuda:12.6.3-devel-ubuntu22.04
      # /dev/shm must be large enough for NCCL shared-memory transport
      # across the 4 GPUs; the Docker default (64M) is far too small.
      options: --gpus all --shm-size "32G"
    env:
      # Quoted on purpose: unquoted 2.7 / 12.6 would parse as floats.
      TORCH_VER: "2.7"
      CUDA_VER: "12.6"
    steps:
      - name: Install system dependencies
        run: |
          apt-get update && apt-get install -y git git-lfs libaio-dev python3 python3-pip
          git lfs install
          ln -sf /usr/bin/python3 /usr/bin/python
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          lfs: true
      - name: Install PyTorch
        run: |
          pip install torch==2.7.1 torchvision==0.22.1 torchaudio==2.7.1 --index-url https://download.pytorch.org/whl/cu126
      - name: Install Python dependencies
        run: |
          pip install --upgrade pip
          pip install -r requirements/requirements.txt
          pip install -r requirements/requirements-dev.txt
          pip install -r requirements/requirements-deepcompile.txt
      - name: Check environment
        run: |
          echo "=== GPU Information ==="
          nvidia-smi
          echo ""
          echo "=== CUDA Version ==="
          nvcc --version
          echo ""
          echo "=== Python/PyTorch Info ==="
          python --version
          python -c "import torch; print(f'PyTorch: {torch.__version__}')"
          python -c "import torch; print(f'CUDA available: {torch.cuda.is_available()}')"
          python -c "import torch; print(f'CUDA devices: {torch.cuda.device_count()}')"
          python -c "import torch; print(f'BF16 support: {torch.cuda.is_bf16_supported()}')"
      - name: Install DeepSpeed
        run: |
          # Initialize CUDA before install so setup.py can detect NCCL version
          python -c "import torch; torch.cuda.init(); print(f'NCCL version: {torch.cuda.nccl.version()}')"
          # Use --no-build-isolation so setup.py can access pre-installed PyTorch
          pip install --no-build-isolation .
          ds_report
          # Debug: Check captured torch_info values
          python -c "from deepspeed.git_version_info import torch_info; print(f'torch_info: {torch_info}')"
      - name: Run unit tests
        run: |
          pytest -n 4 --forked --verbose tests/unit/v1/ --torch_ver=${{ env.TORCH_VER }} --cuda_ver=${{ env.CUDA_VER }}