Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
82 commits
Select commit Hold shift + click to select a range
767b447
Create nightly-test-npu.yml
cherryblo Nov 29, 2025
311f567
Update run_suite.py
cherryblo Nov 29, 2025
7da83ad
Update ci_register.py
cherryblo Nov 29, 2025
b7ca6b8
Update ci_register.py
cherryblo Nov 29, 2025
019bf54
Create test_ascend_llm_model_mistral_7b.py
cherryblo Nov 29, 2025
5dee58b
Create test_vlm_models_gemma_3_4b_it.py
cherryblo Nov 29, 2025
37f6857
Create test_vlm_utils.py
cherryblo Nov 29, 2025
f347705
Create test_vlm_models_gemma_3_4b_it.py
cherryblo Nov 29, 2025
e5e5ca5
Update test_vlm_models_gemma_3_4b_it.py
cherryblo Nov 29, 2025
15356a5
Delete test/nightly/ascend/llm_models/vlm_models/test_vlm_models_gemm…
cherryblo Nov 29, 2025
a17477a
Update test_ascend_llm_model_mistral_7b.py
cherryblo Nov 29, 2025
c469f8b
Update and rename test_ascend_llm_model_mistral_7b.py to gsm8k_ascend…
cherryblo Nov 29, 2025
a579a46
Create test_ascend_mistral_7b.py
cherryblo Nov 29, 2025
9e9516c
Update test_ascend_mistral_7b.py
cherryblo Nov 29, 2025
6d3e5c3
Create test_ascend_glm4_9b_chat.py
cherryblo Nov 29, 2025
c3a850a
Update test_ascend_glm4_9b_chat.py
cherryblo Nov 29, 2025
8bc6f41
Update test_ascend_mistral_7b.py
cherryblo Nov 29, 2025
38b6509
Update test_vlm_utils.py
cherryblo Nov 29, 2025
af01f1e
Update test_vlm_utils.py
cherryblo Nov 29, 2025
e4ad511
Update test_vlm_utils.py
cherryblo Nov 29, 2025
1b2a8bc
Update and rename test_vlm_models_gemma_3_4b_it.py to test_ascend_gem…
cherryblo Nov 29, 2025
c4c45ac
Create test_ascend_janus_pro_7b.py
cherryblo Nov 29, 2025
506fe40
Create test_ascend_reward_models.py
cherryblo Nov 29, 2025
5db34d5
Create test_ascend_embedding_models.py
cherryblo Nov 29, 2025
063dc75
Create test_ascend_cross_encoder_models.py
cherryblo Nov 29, 2025
4503da3
Update test_ascend_embedding_models.py
cherryblo Nov 29, 2025
3217e8e
Update test_ascend_cross_encoder_models.py
cherryblo Nov 29, 2025
26e8c67
Update nightly-test-npu.yml
cherryblo Nov 29, 2025
4fb7ed0
Update nightly-test-npu.yml
cherryblo Nov 30, 2025
4ef0247
Update test_ascend_embedding_models.py
cherryblo Nov 30, 2025
610a88e
Update test_ascend_glm4_9b_chat.py
cherryblo Nov 30, 2025
7860dc3
Update test_ascend_mistral_7b.py
cherryblo Nov 30, 2025
d2e2007
Update test_ascend_cross_encoder_models.py
cherryblo Nov 30, 2025
7a10535
Update test_ascend_reward_models.py
cherryblo Nov 30, 2025
6c3b5bd
Update test_ascend_gemma_3_4b_it.py
cherryblo Nov 30, 2025
6191926
Update test_ascend_janus_pro_7b.py
cherryblo Nov 30, 2025
499f2c5
Create mmmu-val.yaml
cherryblo Nov 30, 2025
3a0a23b
Update test_vlm_utils.py
cherryblo Nov 30, 2025
6ecf748
Update nightly-test-npu.yml
cherryblo Nov 30, 2025
3ca13ee
Update nightly-test-npu.yml
cherryblo Nov 30, 2025
8224654
Update gsm8k_ascend_mixin.py
cherryblo Nov 30, 2025
93b40d0
Update test_ascend_cross_encoder_models.py
cherryblo Nov 30, 2025
9a5f061
Update nightly-test-npu.yml
cherryblo Nov 30, 2025
745d108
Create npu_log_print.sh
cherryblo Nov 30, 2025
a8d545a
Update nightly-test-npu.yml
cherryblo Nov 30, 2025
e97da90
Update test_vlm_utils.py
cherryblo Nov 30, 2025
c8582c5
Update runners.py
cherryblo Nov 30, 2025
e1d15a1
Update runners.py
cherryblo Nov 30, 2025
2f8bf49
Update nightly-test-npu.yml
cherryblo Nov 30, 2025
87a6f22
Update ci_register.py
cherryblo Nov 30, 2025
dec562b
Update test_ascend_embedding_models.py
cherryblo Nov 30, 2025
9786923
Update gsm8k_ascend_mixin.py
cherryblo Nov 30, 2025
1cdc46a
Update test_ascend_glm4_9b_chat.py
cherryblo Nov 30, 2025
4cfcdd6
Update test_ascend_mistral_7b.py
cherryblo Nov 30, 2025
cabd233
Update test_ascend_cross_encoder_models.py
cherryblo Nov 30, 2025
f9322d5
Update test_ascend_reward_models.py
cherryblo Nov 30, 2025
30a3bd6
Update test_ascend_gemma_3_4b_it.py
cherryblo Nov 30, 2025
074db60
Update test_ascend_janus_pro_7b.py
cherryblo Nov 30, 2025
b6b0244
Update test_vlm_utils.py
cherryblo Nov 30, 2025
e7599a6
Update nightly-test-npu.yml
cherryblo Nov 30, 2025
0dd4b0a
Update nightly-test-npu.yml
cherryblo Dec 1, 2025
ea39095
Update test_ascend_embedding_models.py
cherryblo Dec 1, 2025
9400c13
Merge branch 'sgl-project:main' into main
cherryblo Dec 1, 2025
2187914
Update npu_ci_install_dependency.sh
cherryblo Dec 1, 2025
28e0916
Changes
cherryblo Dec 1, 2025
40868ac
Changes
cherryblo Dec 1, 2025
61bb6d2
Changes
cherryblo Dec 2, 2025
260392f
Changes
cherryblo Dec 2, 2025
4da39b5
Changes
cherryblo Dec 2, 2025
fd47c65
Changes
cherryblo Dec 2, 2025
c7fe336
Changes
cherryblo Dec 2, 2025
7413625
Merge branch 'sgl-project:main' into main
cherryblo Dec 2, 2025
380e519
Update npu_ci_install_dependency.sh
cherryblo Dec 3, 2025
7c81f38
Update npu_log_print.sh
cherryblo Dec 3, 2025
66b54bc
Changes
cherryblo Dec 3, 2025
49f28ff
Merge remote-tracking branch 'origin/main' into main
cherryblo Dec 3, 2025
5be47e8
Changes
cherryblo Dec 3, 2025
871e933
Merge branch 'sgl-project:main' into main
cherryblo Dec 3, 2025
931211a
Merge branch 'main' into main
iforgetmyname Dec 4, 2025
d97b8aa
Merge branch 'main' into main
iforgetmyname Dec 4, 2025
da466c0
revert todevice
iforgetmyname Dec 4, 2025
4c6ef1c
second revert back
iforgetmyname Dec 4, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
186 changes: 186 additions & 0 deletions .github/workflows/nightly-test-npu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
name: Nightly Test (NPU)

on:
  schedule:
    - cron: '0 17 * * *' # Execute at 1:00 a.m. Beijing Time every day
  pull_request:
    branches:
      - main
    paths:
      - ".github/workflows/nightly-test-npu.yml"
  workflow_dispatch:

# One nightly run per ref at a time; a newer trigger cancels the in-flight run.
concurrency:
  group: nightly-test-npu-${{ github.ref }}
  cancel-in-progress: true

jobs:
  nightly-1-npu-a3:
    if: ${{ (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') }}
    runs-on: linux-aarch64-a3-2
    strategy:
      fail-fast: false
      matrix:
        part: [0, 1]
    container:
      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-a3-ubuntu22.04-py3.11
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install dependencies
        run: |
          # speed up by using infra cache services
          CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
          sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
          pip config set global.index-url http://${CACHING_URL}/pypi/simple
          pip config set global.extra-index-url "https://pypi.tuna.tsinghua.edu.cn/simple"
          pip config set global.trusted-host "${CACHING_URL} pypi.tuna.tsinghua.edu.cn"

          bash scripts/ci/npu_ci_install_dependency.sh a3
          # copy required file from our daily cache
          cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
          # download through proxy
          curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl

      - name: Print Log Information
        run: |
          bash scripts/ci/npu_log_print.sh

      - name: Run test
        timeout-minutes: 240
        env:
          SGLANG_USE_MODELSCOPE: true
          SGLANG_IS_IN_CI: true
          HF_ENDPOINT: https://hf-mirror.com
          TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
          PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
          STREAMS_PER_DEVICE: 32
        run: |
          export PATH="/usr/local/Ascend/8.3.RC1/compiler/bishengir/bin:${PATH}"
          pip install sentence_transformers accelerate
          cd test
          python3 run_suite.py --hw npu --suite nightly-1-npu-a3 --nightly --continue-on-error --timeout-per-file 3600 --auto-partition-id ${{ matrix.part }} --auto-partition-size 2

  nightly-2-npu-a3:
    if: ${{ (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') }}
    runs-on: linux-aarch64-a3-2
    strategy:
      fail-fast: false
      matrix:
        part: [0]
    container:
      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-a3-ubuntu22.04-py3.11
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install dependencies
        run: |
          # speed up by using infra cache services
          CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
          sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
          pip config set global.index-url http://${CACHING_URL}/pypi/simple
          pip config set global.extra-index-url "https://pypi.tuna.tsinghua.edu.cn/simple"
          pip config set global.trusted-host "${CACHING_URL} pypi.tuna.tsinghua.edu.cn"

          bash scripts/ci/npu_ci_install_dependency.sh a3
          # copy required file from our daily cache
          cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
          # download through proxy
          curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl

      - name: Print Log Information
        run: |
          bash scripts/ci/npu_log_print.sh

      - name: Run test
        timeout-minutes: 240
        env:
          SGLANG_USE_MODELSCOPE: true
          SGLANG_IS_IN_CI: true
          HF_ENDPOINT: https://hf-mirror.com
          TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
          PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
          STREAMS_PER_DEVICE: 32
        run: |
          export PATH="/usr/local/Ascend/8.3.RC1/compiler/bishengir/bin:${PATH}"
          pip install sentence_transformers accelerate
          cd test
          python3 run_suite.py --hw npu --suite nightly-2-npu-a3 --nightly --continue-on-error --timeout-per-file 3600 --auto-partition-id ${{ matrix.part }} --auto-partition-size 1

  nightly-4-npu-a3:
    if: ${{ (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') }}
    runs-on: linux-aarch64-a3-4
    strategy:
      fail-fast: false
      matrix:
        part: [0]
    container:
      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-a3-ubuntu22.04-py3.11
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install dependencies
        run: |
          # speed up by using infra cache services
          CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
          sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
          pip config set global.index-url http://${CACHING_URL}/pypi/simple
          pip config set global.extra-index-url "https://pypi.tuna.tsinghua.edu.cn/simple"
          pip config set global.trusted-host "${CACHING_URL} pypi.tuna.tsinghua.edu.cn"

          bash scripts/ci/npu_ci_install_dependency.sh a3
          # copy required file from our daily cache
          cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
          # download through proxy
          curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl

      - name: Print Log Information
        run: |
          bash scripts/ci/npu_log_print.sh

      - name: Run test
        timeout-minutes: 240
        env:
          SGLANG_USE_MODELSCOPE: true
          SGLANG_IS_IN_CI: true
          HF_ENDPOINT: https://hf-mirror.com
          TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
          PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
          STREAMS_PER_DEVICE: 32
        run: |
          export PATH="/usr/local/Ascend/8.3.RC1/compiler/bishengir/bin:${PATH}"
          hf download lmms-lab/MMMU --repo-type dataset
          pip install sentence_transformers torchaudio==2.8.0 torch_npu==2.8.0
          # NOTE: version ranges must be quoted — an unquoted `pkg>=x.y` is parsed
          # by the shell as an output redirection to a file named "=x.y" and the
          # constraint silently never reaches pip.
          pip install protobuf==6.31.1 zss pre-commit "wandb>=0.16.0" tenacity==8.3.0 loguru openpyxl latex2sympy2 zstandard transformers-stream-generator tqdm-multiprocess pycocoevalcap
          # (duplicate `peft>=0.2.0` spec removed: pip rejects two specifiers for
          # the same package in one command; the pin `peft==0.2.0` is kept.)
          pip install yt-dlp sentencepiece==0.1.99 nltk av ftfy sqlitedict==2.1.0 "sacrebleu>=1.5.0" pytablewriter peft==0.2.0 black==24.1.0 isort==5.13.2 "accelerate>=0.29.1"
          pip install jsonlines httpx==0.25.0 "evaluate>=0.4.0" datasets==2.16.1 numexpr xgrammar==0.1.25 numpy==1.26.4 dotenv
          git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
          cd ./lmms-eval
          # Install synchronously. The previous `nohup pip install . & sleep 120`
          # raced the install against the test run and swallowed install failures.
          pip install . > lmmslog.txt 2>&1
          export PYTHONPATH=$PYTHONPATH:$(pwd)
          cd ../
          cd test
          python3 run_suite.py --hw npu --suite nightly-4-npu-a3 --nightly --continue-on-error --timeout-per-file 3600 --auto-partition-id ${{ matrix.part }} --auto-partition-size 1

  # Aggregate gate so branch protection can depend on a single job name.
  check-all-jobs:
    if: github.repository == 'sgl-project/sglang' && always()
    needs:
      - nightly-1-npu-a3
      # BUG FIX: nightly-2-npu-a3 was missing here, so its failures never
      # failed the aggregate check.
      - nightly-2-npu-a3
      - nightly-4-npu-a3
    runs-on: ubuntu-latest
    container:
      image: docker.m.daocloud.io/ubuntu:22.04
    steps:
      - name: Check if any job failed
        run: |
          if [[ "${{ contains(needs.*.result, 'failure') }}" == "true" ]]; then
            echo "One or more nightly test jobs failed"
            exit 1
          fi
          if [[ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]]; then
            echo "One or more nightly test jobs were cancelled"
            exit 1
          fi
          echo "All nightly test jobs passed"
13 changes: 13 additions & 0 deletions python/sglang/test/ci/ci_register.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
"register_cpu_ci",
"register_cuda_ci",
"register_amd_ci",
"register_npu_ci",
"ut_parse_one_file",
]

Expand All @@ -22,6 +23,7 @@ class HWBackend(Enum):
CPU = auto()
CUDA = auto()
AMD = auto()
NPU = auto()


@dataclass
Expand Down Expand Up @@ -58,10 +60,21 @@ def register_amd_ci(
return None


def register_npu_ci(
    est_time: float,
    suite: str,
    nightly: bool = False,
    disabled: Optional[str] = None,
):
    """Marker for NPU CI registration (parsed via AST; runtime no-op).

    Mirrors ``register_cpu_ci`` / ``register_cuda_ci`` / ``register_amd_ci``.
    The suite runner discovers these calls statically via ``REGISTER_MAPPING``;
    calling the function at import time has no effect.

    Args:
        est_time: Estimated runtime of the test file in seconds (presumably
            used for scheduling/partitioning — mirrors the other backends).
        suite: CI suite name the test belongs to (e.g. ``"nightly-1-npu-a3"``).
        nightly: Whether the test belongs to the nightly pipeline.
        disabled: If set, a human-readable reason the test is disabled.
    """
    return None


# Maps a registration-marker function name (as it appears in test files and is
# found by the AST parser) to the hardware backend it registers tests for.
REGISTER_MAPPING = {
    "register_cpu_ci": HWBackend.CPU,
    "register_cuda_ci": HWBackend.CUDA,
    "register_amd_ci": HWBackend.AMD,
    "register_npu_ci": HWBackend.NPU,
}


Expand Down
9 changes: 8 additions & 1 deletion python/sglang/test/runners.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,15 @@
)

from sglang.srt.entrypoints.engine import Engine
from sglang.srt.utils import load_image
from sglang.srt.utils import is_npu, load_image
from sglang.srt.utils.hf_transformers_utils import get_tokenizer
from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER, calculate_rouge_l

if is_npu():
from sglang.srt.hardware_backend.npu.utils import init_npu_backend

init_npu_backend()

DEFAULT_PROMPTS = [
"Apple is red. Banana is Yellow. " * 800 + "Apple is",
"The capital of the United Kingdom is",
Expand Down Expand Up @@ -72,6 +77,8 @@ def get_dtype_str(torch_dtype):
return "float16"
if torch_dtype is torch.float32:
return "float32"
if torch_dtype is torch.bfloat16:
return "bfloat16"
else:
raise NotImplementedError()

Expand Down
26 changes: 26 additions & 0 deletions scripts/ci/npu_log_print.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash
set -euo pipefail

# Print CI debug information: npu-smi device status, installed packages,
# local sglang / sgl_kernel versions, and the upstream main-branch commit SHAs
# of sglang and sgl-kernel-npu.
npu-smi info
pip list

# get_version FILE NAME — extract __version__ from FILE and print
# "NAME version: vX.Y.Z", or "NAME version: unknown" if FILE is missing
# or unparsable.
get_version() {
    [ -f "$1" ] && python3 -c 'import re, sys; print(sys.argv[2] + " version: v" + re.search(r"__version__\s*=\s*[\"'"'"'](.*?)[\"'"'"']", open(sys.argv[1]).read()).group(1))' "$1" "$2" 2>/dev/null || echo "$2 version: unknown"
}
get_version "./python/sglang/version.py" "sglang"
get_version "./sgl-kernel/python/sgl_kernel/version.py" "sgl_kernel"

SGLANG_URL="https://github.com/sgl-project/sglang.git"
SGL_KERNEL_URL="https://github.com/sgl-project/sgl-kernel-npu.git"
SGLANG_BRANCH="main"
SGL_KERNEL_BRANCH="main"

# get_sha NAME URL BRANCH — print the remote HEAD SHA of BRANCH at URL.
get_sha() {
    local name="$1"
    local url="$2"
    local branch="$3"
    local sha
    sha=$(git ls-remote "$url" "refs/heads/$branch" | cut -f1)
    echo "$name SHA for branch $branch: ${sha:-"Not Found"}"
}
get_sha "sglang" "$SGLANG_URL" "$SGLANG_BRANCH"
get_sha "sgl-kernel" "$SGL_KERNEL_URL" "$SGL_KERNEL_BRANCH"

# (Removed the trailing `chmod +x scripts/ci/npu_log_print.sh`: making the
# script executable *after* it is already running is a no-op; the workflow
# invokes it via `bash` anyway.)
108 changes: 108 additions & 0 deletions test/nightly/ascend/embedding_models/test_ascend_embedding_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import multiprocessing as mp
import unittest
from typing import Optional

import torch
from transformers import AutoConfig, AutoTokenizer

from sglang.test.ci.ci_register import register_npu_ci
from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner
from sglang.test.test_utils import CustomTestCase, get_similarities

# Register this file with the NPU nightly CI. The call is discovered
# statically (AST parsing) by the suite runner; at runtime it is a no-op.
# `disabled` records why the test is currently skipped.
register_npu_ci(
    est_time=400,
    suite="nightly-1-npu-a3",
    nightly=True,
    disabled="embeddings are not all close",
)


# Each entry: (local ModelScope cache path, tensor-parallel size,
# prefill similarity tolerance).
MODELS = [
    ("/root/.cache/modelscope/hub/models/iic/gte_Qwen2-1.5B-instruct", 1, 1e-5),
    ("/root/.cache/modelscope/hub/models/Qwen/Qwen3-Embedding-8B", 1, 1e-5),
]
TORCH_DTYPES = [torch.bfloat16]


class TestEmbeddingModels(CustomTestCase):
    """Nightly NPU test: SRT embedding outputs must match the HF reference.

    For each model in ``MODELS``, both runners embed the same (truncated)
    prompts; cosine similarity between the two embeddings must be within
    the per-model tolerance of 1.
    """

    @classmethod
    def setUpClass(cls):
        # Runners launch worker processes; force "spawn" so children do not
        # inherit state from an already-initialized parent via fork.
        mp.set_start_method("spawn", force=True)

    def _truncate_prompts(self, prompts, model_path):
        """Clip each prompt to the model's positional limit.

        Prompts longer than ``max_position_embeddings`` tokens are cut to
        ``max_length - 1`` tokens and decoded back to text, so both runners
        receive identical, in-range inputs.
        """
        config = AutoConfig.from_pretrained(model_path)
        max_length = getattr(config, "max_position_embeddings", 2048)
        tokenizer = AutoTokenizer.from_pretrained(model_path)

        truncated_prompts = []
        for prompt in prompts:
            input_ids = tokenizer(
                prompt, return_tensors="pt", truncation=False
            ).input_ids[0]
            if len(input_ids) > max_length:
                truncated_prompts.append(
                    tokenizer.decode(
                        input_ids[: max_length - 1], skip_special_tokens=True
                    )
                )
            else:
                truncated_prompts.append(prompt)
        return truncated_prompts

    def assert_close_prefill_logits(
        self,
        prompts,
        model_path,
        tp_size,
        torch_dtype,
        prefill_tolerance,
        matryoshka_dim: Optional[int] = None,
    ) -> None:
        """Embed ``prompts`` with HF and SRT and assert cosine similarity ~ 1.

        Raises:
            AssertionError: if any short prompt's similarity deviates from 1
                by ``prefill_tolerance`` or more.
        """
        truncated_prompts = self._truncate_prompts(prompts, model_path)

        with HFRunner(
            model_path,
            torch_dtype=torch_dtype,
            model_type="embedding",
            matryoshka_dim=matryoshka_dim,
        ) as hf_runner:
            hf_outputs = hf_runner.forward(truncated_prompts)

        with SRTRunner(
            model_path,
            tp_size=tp_size,
            torch_dtype=torch_dtype,
            model_type="embedding",
            attention_backend="ascend",
            json_model_override_args=(
                {"matryoshka_dimensions": [matryoshka_dim]} if matryoshka_dim else None
            ),
        ) as srt_runner:
            srt_outputs = srt_runner.forward(
                truncated_prompts, dimensions=matryoshka_dim
            )

        for i, prompt in enumerate(prompts):
            hf_logits = torch.Tensor(hf_outputs.embed_logits[i])
            srt_logits = torch.Tensor(srt_outputs.embed_logits[i])

            similarity = torch.tensor(get_similarities(hf_logits, srt_logits))
            deviation = abs(similarity - 1)
            print("similarity diff", deviation)

            # Only gate short prompts; long ones accumulate more numeric drift.
            if len(prompt) <= 1000:
                # self.assertTrue instead of a bare `assert`: plain asserts are
                # stripped when Python runs with -O.
                self.assertTrue(
                    bool(torch.all(deviation < prefill_tolerance)),
                    "embeddings are not all close",
                )

    def test_prefill_logits(self):
        """Run the HF-vs-SRT comparison for every model/dtype combination."""
        for model, tp_size, prefill_tolerance in MODELS:
            for torch_dtype in TORCH_DTYPES:
                self.assert_close_prefill_logits(
                    DEFAULT_PROMPTS, model, tp_size, torch_dtype, prefill_tolerance
                )


if __name__ == "__main__":
    # Allow running this file directly, outside the run_suite.py harness.
    unittest.main()
Loading
Loading