diff --git a/.github/workflows/nightly-test-npu.yml b/.github/workflows/nightly-test-npu.yml
new file mode 100644
index 000000000000..68689265d691
--- /dev/null
+++ b/.github/workflows/nightly-test-npu.yml
@@ -0,0 +1,186 @@
+name: Nightly Test (NPU)
+
+on:
+  schedule:
+    - cron: '0 17 * * *'  # 17:00 UTC, i.e. 1:00 a.m. Beijing Time (UTC+8), every day
+  pull_request:
+    branches:
+      - main
+    paths:
+      - ".github/workflows/nightly-test-npu.yml"
+  workflow_dispatch:
+
+concurrency:
+  group: nightly-test-npu-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  nightly-1-npu-a3:
+    if: ${{ (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') }}
+    runs-on: linux-aarch64-a3-2
+    strategy:
+      fail-fast: false  # keep the other partition running when one fails
+      matrix:
+        part: [0, 1]  # one entry per partition; keep in sync with --auto-partition-size below
+    container:
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-a3-ubuntu22.04-py3.11
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          # speed up apt and pip by using the infra cache services
+          CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
+          sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
+          pip config set global.index-url http://${CACHING_URL}/pypi/simple
+          pip config set global.extra-index-url "https://pypi.tuna.tsinghua.edu.cn/simple"
+          pip config set global.trusted-host "${CACHING_URL} pypi.tuna.tsinghua.edu.cn"
+
+          bash scripts/ci/npu_ci_install_dependency.sh a3
+          # copy the required ShareGPT file from our daily cache
+          cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
+          # download the GSM8K test set through the proxy; -f makes an HTTP error fail the step
+          curl -fL -o /tmp/test.jsonl https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
+
+      - name: Print Log Information
+        run: |
+          bash scripts/ci/npu_log_print.sh
+      - name: Run test
+        timeout-minutes: 240
+        env:
+          SGLANG_USE_MODELSCOPE: true
+          SGLANG_IS_IN_CI: true
+          HF_ENDPOINT: https://hf-mirror.com
+          TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
+          PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
+          STREAMS_PER_DEVICE: 32
+        run: |
+          export PATH="/usr/local/Ascend/8.3.RC1/compiler/bishengir/bin:${PATH}"
+          pip install sentence_transformers accelerate
+          cd test
+          python3 run_suite.py --hw npu --suite nightly-1-npu-a3 --nightly --continue-on-error --timeout-per-file 3600 --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
+
+  nightly-2-npu-a3:
+    if: ${{ (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') }}
+    runs-on: linux-aarch64-a3-2
+    strategy:
+      fail-fast: false
+      matrix:
+        part: [0]  # one entry per partition; keep in sync with --auto-partition-size below
+    container:
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-a3-ubuntu22.04-py3.11
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          # speed up apt and pip by using the infra cache services
+          CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
+          sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
+          pip config set global.index-url http://${CACHING_URL}/pypi/simple
+          pip config set global.extra-index-url "https://pypi.tuna.tsinghua.edu.cn/simple"
+          pip config set global.trusted-host "${CACHING_URL} pypi.tuna.tsinghua.edu.cn"
+
+          bash scripts/ci/npu_ci_install_dependency.sh a3
+          # copy the required ShareGPT file from our daily cache
+          cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
+          # download the GSM8K test set through the proxy; -f makes an HTTP error fail the step
+          curl -fL -o /tmp/test.jsonl https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
+
+      - name: Print Log Information
+        run: |
+          bash scripts/ci/npu_log_print.sh
+      - name: Run test
+        timeout-minutes: 240
+        env:
+          SGLANG_USE_MODELSCOPE: true
+          SGLANG_IS_IN_CI: true
+          HF_ENDPOINT: https://hf-mirror.com
+          TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
+          PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
+          STREAMS_PER_DEVICE: 32
+        run: |
+          export PATH="/usr/local/Ascend/8.3.RC1/compiler/bishengir/bin:${PATH}"
+          pip install sentence_transformers accelerate
+          cd test
+          python3 run_suite.py --hw npu --suite nightly-2-npu-a3 --nightly --continue-on-error --timeout-per-file 3600 --auto-partition-id ${{ matrix.part }} --auto-partition-size 1
+
+  nightly-4-npu-a3:
+    if: ${{ (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') }}
+    runs-on: linux-aarch64-a3-4
+    strategy:
+      fail-fast: false
+      matrix:
+        part: [0]  # one entry per partition; keep in sync with --auto-partition-size below
+    container:
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-a3-ubuntu22.04-py3.11
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      - name: Install dependencies
+        run: |
+          # speed up apt and pip by using the infra cache services
+          CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
+          sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
+          pip config set global.index-url http://${CACHING_URL}/pypi/simple
+          pip config set global.extra-index-url "https://pypi.tuna.tsinghua.edu.cn/simple"
+          pip config set global.trusted-host "${CACHING_URL} pypi.tuna.tsinghua.edu.cn"
+
+          bash scripts/ci/npu_ci_install_dependency.sh a3
+          # copy the required ShareGPT file from our daily cache
+          cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
+          # download the GSM8K test set through the proxy; -f makes an HTTP error fail the step
+          curl -fL -o /tmp/test.jsonl https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
+
+      - name: Print Log Information
+        run: |
+          bash scripts/ci/npu_log_print.sh
+
+      - name: Run test
+        timeout-minutes: 240
+        env:
+          SGLANG_USE_MODELSCOPE: true
+          SGLANG_IS_IN_CI: true
+          HF_ENDPOINT: https://hf-mirror.com
+          TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
+          PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
+          STREAMS_PER_DEVICE: 32
+        run: |
+          export PATH="/usr/local/Ascend/8.3.RC1/compiler/bishengir/bin:${PATH}"
+          hf download lmms-lab/MMMU --repo-type dataset
+          pip install sentence_transformers torchaudio==2.8.0 torch_npu==2.8.0
+          # specifiers containing ">" are quoted; unquoted, the shell treats ">=x.y" as an output redirection
+          pip install protobuf==6.31.1 zss pre-commit "wandb>=0.16.0" tenacity==8.3.0 loguru openpyxl latex2sympy2 zstandard transformers-stream-generator tqdm-multiprocess pycocoevalcap
+          pip install yt-dlp sentencepiece==0.1.99 nltk av ftfy sqlitedict==2.1.0 "sacrebleu>=1.5.0" pytablewriter peft==0.2.0 black==24.1.0 isort==5.13.2 "accelerate>=0.29.1"
+          pip install jsonlines httpx==0.25.0 "evaluate>=0.4.0" datasets==2.16.1 numexpr xgrammar==0.1.25 numpy==1.26.4 dotenv
+          git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
+          cd ./lmms-eval
+          nohup pip install . > lmmslog.txt 2>&1 &
+          sleep 120  # NOTE(review): fixed wait; the background install above may still be running
+          export PYTHONPATH=$PYTHONPATH:$(pwd)
+          cd ../test
+          python3 run_suite.py --hw npu --suite nightly-4-npu-a3 --nightly --continue-on-error --timeout-per-file 3600 --auto-partition-id ${{ matrix.part }} --auto-partition-size 1
+
+  check-all-jobs:
+    if: github.repository == 'sgl-project/sglang' && always()  # always(): run even when a needed job failed
+    needs:  # every nightly suite job must be listed here, otherwise its result is not checked
+      - nightly-1-npu-a3
+      - nightly-2-npu-a3
+      - nightly-4-npu-a3
+    runs-on: ubuntu-latest
+    container: docker.m.daocloud.io/ubuntu:22.04
+    steps:
+      - name: Check if any job failed
+        run: |
+          if [[ "${{ contains(needs.*.result, 'failure') }}" == "true" ]]; then
+            echo "One or more nightly test jobs failed"
+            exit 1
+          fi
+          if [[ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]]; then
+            echo "One or more nightly test jobs were cancelled"
+            exit 1
+          fi
+          echo "All nightly test jobs passed"
diff --git a/python/sglang/test/ci/ci_register.py b/python/sglang/test/ci/ci_register.py
index 45023fc12b2e..fdd72de44955 100644
--- a/python/sglang/test/ci/ci_register.py
+++ b/python/sglang/test/ci/ci_register.py
@@ -11,6 +11,7 @@
     "register_cpu_ci",
     "register_cuda_ci",
     "register_amd_ci",
+    "register_npu_ci",
     "ut_parse_one_file",
 ]
 
@@ -22,6 +23,7 @@ class HWBackend(Enum):
     CPU = auto()
     CUDA = auto()
     AMD = auto()
+    NPU = auto()
 
 
 @dataclass
@@ -58,10 +60,21 @@ def register_amd_ci(
     return None
 
 
+def register_npu_ci(
+    est_time: float,  # estimated run time of the file; presumably seconds — TODO confirm
+    suite: str,  # CI suite the file is registered under, e.g. "nightly-1-npu-a3"
+    nightly: bool = False,
+    disabled: Optional[str] = None,  # presumably the reason the file is skipped; confirm in run_suite
+):
+    """Marker for NPU CI registration (parsed via AST; runtime no-op)."""
+    return None
+
+
 REGISTER_MAPPING = {
     "register_cpu_ci": HWBackend.CPU,
     "register_cuda_ci": HWBackend.CUDA,
     "register_amd_ci": HWBackend.AMD,
+    "register_npu_ci": HWBackend.NPU,
 }
 
 
diff --git a/python/sglang/test/runners.py b/python/sglang/test/runners.py
index e174eb0c2f39..6d31b5868a47 100644
--- a/python/sglang/test/runners.py
+++ b/python/sglang/test/runners.py
@@ -31,10 +31,15 @@
 )
 
 from sglang.srt.entrypoints.engine import Engine
-from sglang.srt.utils import load_image
+from sglang.srt.utils import is_npu, load_image
 from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER, calculate_rouge_l
 
+if is_npu():
+    from sglang.srt.hardware_backend.npu.utils import init_npu_backend
+
+    init_npu_backend()
+
 DEFAULT_PROMPTS = [
     "Apple is red. Banana is Yellow. " * 800 + "Apple is",
     "The capital of the United Kingdom is",
@@ -72,6 +77,8 @@ def get_dtype_str(torch_dtype):
         return "float16"
     if torch_dtype is torch.float32:
         return "float32"
+    if torch_dtype is torch.bfloat16:
+        return "bfloat16"
     else:
         raise NotImplementedError()
 
diff --git a/scripts/ci/npu_log_print.sh b/scripts/ci/npu_log_print.sh
new file mode 100755
index 000000000000..ad59bc594cbc
--- /dev/null
+++ b/scripts/ci/npu_log_print.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+set -euo pipefail
+
+# Print log information: npu-smi info, pip list, sglang and sgl_kernel versions, and the head SHAs of the upstream main branches.
+npu-smi info
+pip list
+# Print "<$2> version: v<X>" using __version__ from file $1, or "<$2> version: unknown" if it cannot be read.
+get_version() {
+    [ -f "$1" ] && python3 -c 'import re, sys; print(sys.argv[2] + " version: v" + re.search(r"__version__\s*=\s*[\"'"'"'](.*?)[\"'"'"']", open(sys.argv[1]).read()).group(1))' "$1" "$2" 2>/dev/null || echo "$2 version: unknown"
+}
+get_version "./python/sglang/version.py" "sglang"
+get_version "./sgl-kernel/python/sgl_kernel/version.py" "sgl_kernel"
+# Remote repositories and branches whose current head SHAs are reported.
+SGLANG_URL="https://github.com/sgl-project/sglang.git"
+SGL_KERNEL_URL="https://github.com/sgl-project/sgl-kernel-npu.git"
+SGLANG_BRANCH="main"
+SGL_KERNEL_BRANCH="main"
+# Print the head SHA of branch $3 in remote repository $2, labelled $1 ("Not Found" on failure).
+get_sha() {
+    local name="$1" url="$2" branch="$3" sha
+    # Best effort: without the fallback, `set -e` + pipefail would abort this log-only script when ls-remote fails.
+    sha=$(git ls-remote "$url" "refs/heads/$branch" | cut -f1) || sha=""
+    echo "$name SHA for branch $branch: ${sha:-"Not Found"}"
+}
+get_sha "sglang" "$SGLANG_URL" "$SGLANG_BRANCH"
+get_sha "sgl-kernel" "$SGL_KERNEL_URL" "$SGL_KERNEL_BRANCH"
diff --git a/test/nightly/ascend/embedding_models/test_ascend_embedding_models.py b/test/nightly/ascend/embedding_models/test_ascend_embedding_models.py
new file mode 100644
index 000000000000..33c5e3cfd91e
--- /dev/null
+++ b/test/nightly/ascend/embedding_models/test_ascend_embedding_models.py
@@ -0,0 +1,108 @@
+import multiprocessing as mp
+import unittest
+from typing import Optional
+
+import torch
+from transformers import AutoConfig, AutoTokenizer
+
+from sglang.test.ci.ci_register import register_npu_ci
+from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner
+from sglang.test.test_utils import CustomTestCase, get_similarities
+
+register_npu_ci(
+    est_time=400,
+    suite="nightly-1-npu-a3",
+    nightly=True,
+    disabled="embeddings are not all close",
+)
+
+
+MODELS = [
+    ("/root/.cache/modelscope/hub/models/iic/gte_Qwen2-1.5B-instruct", 1, 1e-5),
+    ("/root/.cache/modelscope/hub/models/Qwen/Qwen3-Embedding-8B", 1, 1e-5),
+]
+TORCH_DTYPES = [torch.bfloat16]
+
+
+class TestEmbeddingModels(CustomTestCase):  # compares HF and SRT embeddings for each entry of MODELS
+
+    @classmethod
+    def setUpClass(cls):
+        mp.set_start_method("spawn", force=True)
+
+    def _truncate_prompts(self, prompts, model_path):  # returns one (possibly shortened) prompt per input
+        config = AutoConfig.from_pretrained(model_path)
+        max_length = getattr(config, "max_position_embeddings", 2048)  # 2048 if the field is absent
+
+        tokenizer = AutoTokenizer.from_pretrained(model_path)
+
+        truncated_prompts = []
+        for prompt in prompts:
+            tokens = tokenizer(prompt, return_tensors="pt", truncation=False)
+            if len(tokens.input_ids[0]) > max_length:
+                truncated_text = tokenizer.decode(
+                    tokens.input_ids[0][: max_length - 1], skip_special_tokens=True
+                )
+                truncated_prompts.append(truncated_text)
+            else:
+                truncated_prompts.append(prompt)
+        return truncated_prompts
+
+    def assert_close_prefill_logits(
+        self,
+        prompts,
+        model_path,
+        tp_size,
+        torch_dtype,
+        prefill_tolerance,
+        matryoshka_dim: Optional[int] = None,
+    ) -> None:
+        truncated_prompts = self._truncate_prompts(prompts, model_path)
+
+        with HFRunner(
+            model_path,
+            torch_dtype=torch_dtype,
+            model_type="embedding",
+            matryoshka_dim=matryoshka_dim,
+        ) as hf_runner:
+            hf_outputs = hf_runner.forward(truncated_prompts)
+
+        attention_backend = "ascend"
+        with SRTRunner(
+            model_path,
+            tp_size=tp_size,
+            torch_dtype=torch_dtype,
+            model_type="embedding",
+            attention_backend=attention_backend,
+            json_model_override_args=(
+                {"matryoshka_dimensions": [matryoshka_dim]} if matryoshka_dim else None
+            ),
+        ) as srt_runner:
+            srt_outputs = srt_runner.forward(
+                truncated_prompts, dimensions=matryoshka_dim
+            )
+
+        for i in range(len(prompts)):  # outputs line up with prompts: truncation keeps one entry per prompt
+            hf_logits = torch.Tensor(hf_outputs.embed_logits[i])
+            srt_logits = torch.Tensor(srt_outputs.embed_logits[i])
+
+            similarity = torch.tensor(get_similarities(hf_logits, srt_logits))
+            print("similarity diff", abs(similarity - 1))
+
+            if len(prompts[i]) <= 1000:  # prompts over 1000 characters are printed but not asserted
+                assert torch.all(
+                    abs(similarity - 1) < prefill_tolerance
+                ), "embeddings are not all close"
+
+    def test_prefill_logits(self):  # runs every (model, dtype) pair from MODELS x TORCH_DTYPES
+        models_to_test = MODELS
+
+        for model, tp_size, prefill_tolerance in models_to_test:
+            for torch_dtype in TORCH_DTYPES:
+                self.assert_close_prefill_logits(
+                    DEFAULT_PROMPTS, model, tp_size, torch_dtype, prefill_tolerance
+                )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/nightly/ascend/llm_models/gsm8k_ascend_mixin.py b/test/nightly/ascend/llm_models/gsm8k_ascend_mixin.py
new file mode 100644
index 000000000000..bcf3391f3361
--- /dev/null
+++ b/test/nightly/ascend/llm_models/gsm8k_ascend_mixin.py
@@ -0,0 +1,80 @@
+import os
+from abc import ABC
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_server,
+)
+
+
+class GSM8KAscendMixin(ABC):
+    """Shared GSM8K accuracy test for models served with the Ascend attention backend.
+
+    Mix into a ``CustomTestCase`` subclass and override ``model`` (path of the
+    model to serve), ``accuracy`` (threshold the measured accuracy must exceed)
+    and, if needed, ``other_args`` (server command-line arguments).
+    """
+
+    model = ""
+    accuracy = 0.00
+    other_args = [
+        "--trust-remote-code",
+        "--mem-fraction-static",
+        "0.8",
+        "--attention-backend",
+        "ascend",
+        "--disable-cuda-graph",
+    ]
+
+    @classmethod
+    def setUpClass(cls):
+        """Launch the server for ``cls.model`` with the NPU-related environment set."""
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        # The variables are written to os.environ itself, so they remain set for
+        # the rest of this process and are part of the copy handed to the server.
+        os.environ["PYTORCH_NPU_ALLOC_CONF"] = "expandable_segments:True"
+        os.environ["ASCEND_MF_STORE_URL"] = "tcp://127.0.0.1:24666"
+        os.environ["HCCL_BUFFSIZE"] = "200"
+        os.environ["SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK"] = "24"
+        os.environ["USE_VLLM_CUSTOM_ALLREDUCE"] = "1"
+        os.environ["HCCL_EXEC_TIMEOUT"] = "200"
+        os.environ["STREAMS_PER_DEVICE"] = "32"
+        os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = "1"
+        os.environ["AUTO_USE_UC_MEMORY"] = "0"
+        os.environ["P2P_HCCL_BUFFSIZE"] = "20"
+        env = os.environ.copy()
+
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=cls.other_args,
+            env=env,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        """Kill the process tree of the server started in ``setUpClass``."""
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        """Run 5-shot GSM8K on 200 questions; accuracy must exceed ``accuracy``."""
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval(args)
+        self.assertGreater(
+            metrics["accuracy"],
+            self.accuracy,
+            f'Accuracy of {self.model} is {str(metrics["accuracy"])}, is lower than {self.accuracy}',
+        )
diff --git a/test/nightly/ascend/llm_models/test_ascend_afm_4_5b.py b/test/nightly/ascend/llm_models/test_ascend_afm_4_5b.py
new file mode 100644
index 000000000000..78dddbdd20cd
--- /dev/null
+++ b/test/nightly/ascend/llm_models/test_ascend_afm_4_5b.py
@@ -0,0 +1,17 @@
+import unittest
+
+from gsm8k_ascend_mixin import GSM8KAscendMixin
+
+from sglang.test.ci.ci_register import register_npu_ci
+from sglang.test.test_utils import CustomTestCase
+
+register_npu_ci(est_time=400, suite="nightly-1-npu-a3", nightly=True)
+
+
+class TestMistral7B(GSM8KAscendMixin, CustomTestCase):  # NOTE(review): name copied from the Mistral test; this class tests AFM-4.5B-Base
+    model = "/root/.cache/modelscope/hub/models/arcee-ai/AFM-4.5B-Base"
+    accuracy = 0.00  # exclusive threshold (assertGreater in the mixin): any non-zero accuracy passes
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/nightly/ascend/llm_models/test_ascend_baichuan2_13b_chat.py b/test/nightly/ascend/llm_models/test_ascend_baichuan2_13b_chat.py
new file mode 100644
index 000000000000..c02c08378f5c
--- /dev/null
+++ b/test/nightly/ascend/llm_models/test_ascend_baichuan2_13b_chat.py
@@ -0,0 +1,22 @@
+import unittest
+
+from gsm8k_ascend_mixin import GSM8KAscendMixin
+
+from sglang.test.ci.ci_register import register_npu_ci
+from sglang.test.test_utils import CustomTestCase
+
+register_npu_ci(
+    est_time=400,
+    suite="nightly-1-npu-a3",
+    nightly=True,
+    disabled="The accuracy test result is 0.",
+)
+
+
+class TestMistral7B(GSM8KAscendMixin, CustomTestCase):  # NOTE(review): name copied from the Mistral test; this class tests Baichuan2-13B-Chat
+    model = "/root/.cache/modelscope/hub/models/baichuan-inc/Baichuan2-13B-Chat"
+    accuracy = 0.00  # exclusive threshold (assertGreater in the mixin): any non-zero accuracy passes
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/nightly/ascend/llm_models/test_ascend_c4ai_command_r_v01.py b/test/nightly/ascend/llm_models/test_ascend_c4ai_command_r_v01.py
new file mode 100644
index 000000000000..5adb892fc239
--- /dev/null
+++ b/test/nightly/ascend/llm_models/test_ascend_c4ai_command_r_v01.py
@@ -0,0 +1,101 @@
+import os
+import unittest
+from types import SimpleNamespace
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.ci.ci_register import register_npu_ci
+from sglang.test.few_shot_gsm8k import run_eval
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+register_npu_ci(
+    est_time=400,
+    suite="nightly-2-npu-a3",
+    nightly=True,
+    disabled="The accuracy test result is 0.",
+)
+
+
+class TestC4AI(CustomTestCase):
+    """GSM8K accuracy test for c4ai-command-r-v01 on the Ascend backend (TP=2)."""
+
+    model = "/root/.cache/modelscope/hub/models/CohereForAI/c4ai-command-r-v01"
+    accuracy = 0.05
+
+    @classmethod
+    def setUpClass(cls):
+        """Launch the server with the chat template located next to this file."""
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        # Resolve the template relative to this file rather than the CI workspace
+        # path (/__w/sglang/sglang/...), so the test also works in other checkouts.
+        chat_template_path = os.path.join(
+            os.path.dirname(os.path.abspath(__file__)),
+            "tool_chat_template_c4ai_command_r_v01.jinja",
+        )
+
+        other_args = [
+            "--trust-remote-code",
+            "--mem-fraction-static",
+            "0.8",
+            "--attention-backend",
+            "ascend",
+            "--disable-cuda-graph",
+            "--chat-template",
+            chat_template_path,
+            "--tp-size",
+            "2",
+            "--dtype",
+            "bfloat16",
+        ]
+        env = os.environ.copy()
+        env.update(
+            {
+                "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
+                "ASCEND_MF_STORE_URL": "tcp://127.0.0.1:24666",
+                "HCCL_BUFFSIZE": "200",
+                "SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK": "24",
+                "USE_VLLM_CUSTOM_ALLREDUCE": "1",
+                "HCCL_EXEC_TIMEOUT": "200",
+                "STREAMS_PER_DEVICE": "32",
+                "SGLANG_ENABLE_TORCH_COMPILE": "1",
+            }
+        )
+
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+            env=env,
+        )
+
+    @classmethod
+    def tearDownClass(cls):
+        """Kill the process tree of the server started in ``setUpClass``."""
+        kill_process_tree(cls.process.pid)
+
+    def test_gsm8k(self):
+        """Run 5-shot GSM8K on 200 questions; accuracy must exceed ``accuracy``."""
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+        metrics = run_eval(args)
+        self.assertGreater(
+            metrics["accuracy"],
+            self.accuracy,
+            f'Accuracy of {self.model} is {str(metrics["accuracy"])}, is lower than {self.accuracy}',
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/nightly/ascend/llm_models/test_ascend_charglm2_6b.py b/test/nightly/ascend/llm_models/test_ascend_charglm2_6b.py
new file mode 100644
index 000000000000..2a7760d6cba5
--- /dev/null
+++ b/test/nightly/ascend/llm_models/test_ascend_charglm2_6b.py
@@ -0,0 +1,27 @@
+import unittest
+
+from gsm8k_ascend_mixin import GSM8KAscendMixin
+
+from sglang.test.ci.ci_register import register_npu_ci
+from sglang.test.test_utils import CustomTestCase
+
+register_npu_ci(est_time=400, suite="nightly-1-npu-a3", nightly=True)
+
+
+class TestMistral7B(GSM8KAscendMixin, CustomTestCase):  # NOTE(review): name copied from the Mistral test; this class tests chatglm2-6b
+    model = "/root/.cache/modelscope/hub/models/ZhipuAI/chatglm2-6b"
+    accuracy = 0.25  # GSM8K accuracy must be strictly greater than this
+    other_args = [  # the mixin's default arguments plus "--dtype bfloat16"
+        "--trust-remote-code",
+        "--mem-fraction-static",
+        "0.8",
+        "--attention-backend",
+        "ascend",
+        "--disable-cuda-graph",
+        "--dtype",
+        "bfloat16",
+    ]
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/nightly/ascend/llm_models/test_ascend_exaone_3.py b/test/nightly/ascend/llm_models/test_ascend_exaone_3.py
new file mode 100644
index 000000000000..133ad709f6b1
--- /dev/null
+++ b/test/nightly/ascend/llm_models/test_ascend_exaone_3.py
@@ -0,0 +1,27 @@
+import unittest
+
+from gsm8k_ascend_mixin import GSM8KAscendMixin
+
+from sglang.test.ci.ci_register import register_npu_ci
+from sglang.test.test_utils import CustomTestCase
+
+register_npu_ci(est_time=400, suite="nightly-1-npu-a3", nightly=True)
+
+
+class TestMistral7B(GSM8KAscendMixin, CustomTestCase):  # NOTE(review): name copied from the Mistral test; this class tests EXAONE-3.5-7.8B-Instruct
+    model = "/root/.cache/modelscope/hub/models/LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct"
+    accuracy = 0.00  # exclusive threshold (assertGreater in the mixin): any non-zero accuracy passes
+    other_args = [  # the mixin's default arguments plus "--dtype bfloat16"
+        "--trust-remote-code",
+        "--mem-fraction-static",
+        "0.8",
+        "--attention-backend",
+        "ascend",
+        "--disable-cuda-graph",
+        "--dtype",
+        "bfloat16",
+    ]
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/nightly/ascend/llm_models/test_ascend_gemma_3_1b_it.py b/test/nightly/ascend/llm_models/test_ascend_gemma_3_1b_it.py
new file mode 100644
index 000000000000..f4789090ec9b
--- /dev/null
+++ b/test/nightly/ascend/llm_models/test_ascend_gemma_3_1b_it.py
@@ -0,0 +1,22 @@
+import unittest
+
+from gsm8k_ascend_mixin import GSM8KAscendMixin
+
+from sglang.test.ci.ci_register import register_npu_ci
+from sglang.test.test_utils import CustomTestCase
+
+register_npu_ci(
+    est_time=400,
+    suite="nightly-1-npu-a3",
+    nightly=True,
+    disabled="The accuracy test result is 0.",
+)
+
+
+class TestMistral7B(GSM8KAscendMixin, CustomTestCase):  # NOTE(review): name copied from the Mistral test; this class tests gemma-3-1b-it
+    model = "/root/.cache/modelscope/hub/models/LLM-Research/gemma-3-1b-it"
+    accuracy = 0.00  # exclusive threshold (assertGreater in the mixin): any non-zero accuracy passes
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/nightly/ascend/llm_models/test_ascend_glm4_9b_chat.py b/test/nightly/ascend/llm_models/test_ascend_glm4_9b_chat.py
new file mode 100644
index 000000000000..d3f4c6f70410
--- /dev/null
+++ b/test/nightly/ascend/llm_models/test_ascend_glm4_9b_chat.py
@@ -0,0 +1,17 @@
+import unittest
+
+from gsm8k_ascend_mixin import GSM8KAscendMixin
+
+from sglang.test.ci.ci_register import register_npu_ci
+from sglang.test.test_utils import CustomTestCase
+
+register_npu_ci(est_time=400, suite="nightly-1-npu-a3", nightly=True)
+
+
+class TestGLM49BChat(GSM8KAscendMixin, CustomTestCase):  # GSM8K check for glm-4-9b-chat via the shared mixin
+    model = "/root/.cache/modelscope/hub/models/ZhipuAI/glm-4-9b-chat"
+    accuracy = 0.00  # exclusive threshold (assertGreater in the mixin): any non-zero accuracy passes
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/nightly/ascend/llm_models/test_ascend_granite_3_0_3b_a800m.py b/test/nightly/ascend/llm_models/test_ascend_granite_3_0_3b_a800m.py
new file mode 100644
index 000000000000..759691fc77a8
--- /dev/null
+++ b/test/nightly/ascend/llm_models/test_ascend_granite_3_0_3b_a800m.py
@@ -0,0 +1,19 @@
+import unittest
+
+from gsm8k_ascend_mixin import GSM8KAscendMixin
+
+from sglang.test.ci.ci_register import register_npu_ci
+from sglang.test.test_utils import CustomTestCase
+
+register_npu_ci(est_time=400, suite="nightly-1-npu-a3", nightly=True)
+
+
+class TestMistral7B(GSM8KAscendMixin, CustomTestCase):  # NOTE(review): name copied from the Mistral test; this class tests granite-3.0-3b-a800m-instruct
+    model = (
+        "/root/.cache/modelscope/hub/models/ibm-granite/granite-3.0-3b-a800m-instruct"
+    )
+    accuracy = 0.00  # exclusive threshold (assertGreater in the mixin): any non-zero accuracy passes
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/nightly/ascend/llm_models/test_ascend_granite_3_1_8b.py b/test/nightly/ascend/llm_models/test_ascend_granite_3_1_8b.py
new file mode 100644
index 000000000000..3e487b83c99e
--- /dev/null
+++ b/test/nightly/ascend/llm_models/test_ascend_granite_3_1_8b.py
@@ -0,0 +1,17 @@
+import unittest
+
+from gsm8k_ascend_mixin import GSM8KAscendMixin
+
+from sglang.test.ci.ci_register import register_npu_ci
+from sglang.test.test_utils import CustomTestCase
+
+register_npu_ci(est_time=400, suite="nightly-1-npu-a3", nightly=True)
+
+
+class TestMistral7B(GSM8KAscendMixin, CustomTestCase):  # NOTE(review): name copied from the Mistral test; this class tests granite-3.1-8b-instruct
+    model = "/root/.cache/modelscope/hub/models/ibm-granite/granite-3.1-8b-instruct"
+    accuracy = 0.695  # GSM8K accuracy must be strictly greater than this
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/nightly/ascend/llm_models/test_ascend_internlm2_7b.py b/test/nightly/ascend/llm_models/test_ascend_internlm2_7b.py
new file mode 100644
index 000000000000..be1dca99f6bb
--- /dev/null
+++ b/test/nightly/ascend/llm_models/test_ascend_internlm2_7b.py
@@ -0,0 +1,17 @@
+import unittest
+
+from gsm8k_ascend_mixin import GSM8KAscendMixin
+
+from sglang.test.ci.ci_register import register_npu_ci
+from sglang.test.test_utils import CustomTestCase
+
+register_npu_ci(est_time=400, suite="nightly-1-npu-a3", nightly=True)
+
+
+class TestMistral7B(GSM8KAscendMixin, CustomTestCase):  # NOTE(review): name copied from the Mistral test; this class tests internlm2-7b
+    model = "/root/.cache/modelscope/hub/models/Shanghai_AI_Laboratory/internlm2-7b"
+    accuracy = 0.6  # GSM8K accuracy must be strictly greater than this
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/nightly/ascend/llm_models/test_ascend_ling_lite.py b/test/nightly/ascend/llm_models/test_ascend_ling_lite.py
new file mode 100644
index 000000000000..c2af52fc9359
--- /dev/null
+++ b/test/nightly/ascend/llm_models/test_ascend_ling_lite.py
@@ -0,0 +1,17 @@
+import unittest
+
+from gsm8k_ascend_mixin import GSM8KAscendMixin
+
+from sglang.test.ci.ci_register import register_npu_ci
+from sglang.test.test_utils import CustomTestCase
+
+register_npu_ci(est_time=400, suite="nightly-1-npu-a3", nightly=True)
+
+
+class TestMistral7B(GSM8KAscendMixin, CustomTestCase):  # NOTE(review): name copied from the Mistral test; this class tests Ling-lite
+    model = "/root/.cache/modelscope/hub/models/inclusionAI/Ling-lite"
+    accuracy = 0.75  # GSM8K accuracy must be strictly greater than this
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/nightly/ascend/llm_models/test_ascend_llama_2_7b.py b/test/nightly/ascend/llm_models/test_ascend_llama_2_7b.py
new file mode 100644
index 000000000000..97fefada6d97
--- /dev/null
+++ b/test/nightly/ascend/llm_models/test_ascend_llama_2_7b.py
@@ -0,0 +1,17 @@
+import unittest
+
+from gsm8k_ascend_mixin import GSM8KAscendMixin
+
+from sglang.test.ci.ci_register import register_npu_ci
+from sglang.test.test_utils import CustomTestCase
+
+register_npu_ci(est_time=400, suite="nightly-1-npu-a3", nightly=True)
+
+
+class TestMistral7B(GSM8KAscendMixin, CustomTestCase):  # NOTE(review): name copied from the Mistral test; this class tests Llama-2-7B
+    model = "/root/.cache/modelscope/hub/models/LLM-Research/Llama-2-7B"
+    accuracy = 0.18  # GSM8K accuracy must be strictly greater than this
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/nightly/ascend/llm_models/test_ascend_mimo_7b_rl.py b/test/nightly/ascend/llm_models/test_ascend_mimo_7b_rl.py
new file mode 100644
index 000000000000..c025523cc731
--- /dev/null
+++ b/test/nightly/ascend/llm_models/test_ascend_mimo_7b_rl.py
@@ -0,0 +1,17 @@
+import unittest
+
+from gsm8k_ascend_mixin import GSM8KAscendMixin
+
+from sglang.test.ci.ci_register import register_npu_ci
+from sglang.test.test_utils import CustomTestCase
+
+register_npu_ci(est_time=400, suite="nightly-1-npu-a3", nightly=True)
+
+
+class TestMistral7B(GSM8KAscendMixin, CustomTestCase):  # NOTE(review): name copied from the Mistral test; this class tests MiMo-7B-RL
+    model = "/root/.cache/modelscope/hub/models/XiaomiMiMo/MiMo-7B-RL"
+    accuracy = 0.75  # GSM8K accuracy must be strictly greater than this
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/nightly/ascend/llm_models/test_ascend_mistral_7b.py b/test/nightly/ascend/llm_models/test_ascend_mistral_7b.py
new file mode 100644
index 000000000000..072f86474d64
--- /dev/null
+++ b/test/nightly/ascend/llm_models/test_ascend_mistral_7b.py
@@ -0,0 +1,17 @@
+import unittest
+
+from gsm8k_ascend_mixin import GSM8KAscendMixin
+
+from sglang.test.ci.ci_register import register_npu_ci
+from sglang.test.test_utils import CustomTestCase
+
+register_npu_ci(est_time=400, suite="nightly-1-npu-a3", nightly=True)
+
+
+class TestMistral7B(GSM8KAscendMixin, CustomTestCase):  # GSM8K check for Mistral-7B-Instruct-v0.2 via the shared mixin
+    model = "/root/.cache/modelscope/hub/models/mistralai/Mistral-7B-Instruct-v0.2"
+    accuracy = 0.375  # GSM8K accuracy must be strictly greater than this
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/nightly/ascend/llm_models/test_ascend_persimmon_8b_chat.py b/test/nightly/ascend/llm_models/test_ascend_persimmon_8b_chat.py
new file mode 100644
index 000000000000..174f14dfafb8
--- /dev/null
+++ b/test/nightly/ascend/llm_models/test_ascend_persimmon_8b_chat.py
@@ -0,0 +1,25 @@
+import unittest
+
+from gsm8k_ascend_mixin import GSM8KAscendMixin
+
+from sglang.test.ci.ci_register import register_npu_ci
+from sglang.test.test_utils import CustomTestCase
+
+# Register this file with the NPU CI runner; est_time is presumably in seconds.
+register_npu_ci(est_time=400, suite="nightly-1-npu-a3", nightly=True)
+
+
+class TestPersimmon8BChat(GSM8KAscendMixin, CustomTestCase):
+    """GSM8K nightly check for Howeee/persimmon-8b-chat on Ascend NPU.
+
+    The class body defines no test methods; they come from the base classes.
+    It only supplies the checkpoint path and the accuracy value, which is
+    presumably the minimum score ``GSM8KAscendMixin`` accepts (confirm there).
+    """
+
+    model = "/root/.cache/modelscope/hub/models/Howeee/persimmon-8b-chat"
+    accuracy = 0.17
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/nightly/ascend/llm_models/test_ascend_phi_4_multimodal.py b/test/nightly/ascend/llm_models/test_ascend_phi_4_multimodal.py
new file mode 100644
index 000000000000..d4ebeca485ea
--- /dev/null
+++ b/test/nightly/ascend/llm_models/test_ascend_phi_4_multimodal.py
@@ -0,0 +1,25 @@
+import unittest
+
+from gsm8k_ascend_mixin import GSM8KAscendMixin
+
+from sglang.test.ci.ci_register import register_npu_ci
+from sglang.test.test_utils import CustomTestCase
+
+# Register this file with the NPU CI runner; est_time is presumably in seconds.
+register_npu_ci(est_time=400, suite="nightly-1-npu-a3", nightly=True)
+
+
+class TestPhi4Multimodal(GSM8KAscendMixin, CustomTestCase):
+    """GSM8K nightly check for LLM-Research/Phi-4-multimodal-instruct on Ascend NPU.
+
+    The class body defines no test methods; they come from the base classes.
+    It only supplies the checkpoint path and the accuracy value, which is
+    presumably the minimum score ``GSM8KAscendMixin`` accepts (confirm there).
+    """
+
+    model = "/root/.cache/modelscope/hub/models/LLM-Research/Phi-4-multimodal-instruct"
+    accuracy = 0.8
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/nightly/ascend/llm_models/test_ascend_smollm_1_7b.py b/test/nightly/ascend/llm_models/test_ascend_smollm_1_7b.py
new file mode 100644
index 000000000000..652934b4ac9f
--- /dev/null
+++ b/test/nightly/ascend/llm_models/test_ascend_smollm_1_7b.py
@@ -0,0 +1,36 @@
+import unittest
+
+from gsm8k_ascend_mixin import GSM8KAscendMixin
+
+from sglang.test.ci.ci_register import register_npu_ci
+from sglang.test.test_utils import CustomTestCase
+
+# Register this file with the NPU CI runner; est_time is presumably in seconds.
+register_npu_ci(est_time=400, suite="nightly-1-npu-a3", nightly=True)
+
+
+class TestSmolLM1_7B(GSM8KAscendMixin, CustomTestCase):
+    """GSM8K nightly check for HuggingFaceTB/SmolLM-1.7B on Ascend NPU.
+
+    The class body defines no test methods; they come from the base classes.
+    It supplies the checkpoint path, the accuracy value (presumably the
+    minimum score ``GSM8KAscendMixin`` accepts -- confirm there) and its own
+    ``other_args``, presumably replacing the mixin's default server flags.
+    """
+
+    model = "/root/.cache/modelscope/hub/models/HuggingFaceTB/SmolLM-1.7B"
+    accuracy = 0.05
+    other_args = [
+        "--trust-remote-code",
+        "--mem-fraction-static",
+        "0.8",
+        "--attention-backend",
+        "ascend",
+        "--disable-cuda-graph",
+        "--dtype",
+        "bfloat16",
+    ]
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/nightly/ascend/llm_models/tool_chat_template_c4ai_command_r_v01.jinja b/test/nightly/ascend/llm_models/tool_chat_template_c4ai_command_r_v01.jinja
new file mode 100644
index 000000000000..638ce5ef2fb9
--- /dev/null
+++ b/test/nightly/ascend/llm_models/tool_chat_template_c4ai_command_r_v01.jinja
@@ -0,0 +1 @@
+{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>'  + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}
diff --git a/test/nightly/ascend/rerank_models/test_ascend_cross_encoder_models.py b/test/nightly/ascend/rerank_models/test_ascend_cross_encoder_models.py
new file mode 100644
index 000000000000..8593bf367cc8
--- /dev/null
+++ b/test/nightly/ascend/rerank_models/test_ascend_cross_encoder_models.py
@@ -0,0 +1,117 @@
+import multiprocessing as mp
+import unittest
+
+import torch
+
+from sglang.test.ci.ci_register import register_npu_ci
+from sglang.test.runners import TEST_RERANK_QUERY_DOCS, HFRunner, SRTRunner
+from sglang.test.test_utils import CustomTestCase
+
+# Registered with ``disabled`` set (presumably skipped by the CI runner); the
+# reason string matches the assertion message used in this test.
+register_npu_ci(
+    est_time=400,
+    suite="nightly-1-npu-a3",
+    nightly=True,
+    disabled="cross encoder scores are not all close",
+)
+
+# Each entry: (model path, tp_size, score tolerance).
+MODELS = [
+    ("/root/.cache/modelscope/hub/models/BAAI/bge-reranker-v2-m3", 1, 1e-2),
+]
+ATTENTION_BACKEND = ["ascend"]
+TORCH_DTYPES = [torch.bfloat16]
+
+
+class TestCrossEncoderModels(CustomTestCase):
+    """Compare SRTRunner cross-encoder scores against HFRunner scores."""
+
+    @classmethod
+    def setUpClass(cls):
+        # Force the "spawn" multiprocessing start method for this test process.
+        mp.set_start_method("spawn", force=True)
+
+    def assert_close_prefill_logits(
+        self,
+        prompts,
+        model_path,
+        tp_size,
+        torch_dtype,
+        score_tolerance,
+        attention_backend,
+    ) -> None:
+        """Score ``prompts`` with both runners and require close results.
+
+        Args:
+            prompts: ``[query, document]`` pairs to score.
+            model_path: Checkpoint loaded by both runners.
+            tp_size: ``tp_size`` passed to ``SRTRunner``.
+            torch_dtype: Dtype passed to both runners.
+            score_tolerance: Exclusive upper bound on the absolute difference
+                between corresponding scores.
+            attention_backend: ``attention_backend`` passed to ``SRTRunner``.
+
+        Raises:
+            AssertionError: If the runners return a different number of
+                scores, or any pair differs by ``score_tolerance`` or more.
+        """
+        with HFRunner(
+            model_path,
+            torch_dtype=torch_dtype,
+            model_type="cross_encoder",
+        ) as hf_runner:
+            hf_scores = hf_runner.forward(prompts).scores
+
+        with SRTRunner(
+            model_path,
+            tp_size=tp_size,
+            torch_dtype=torch_dtype,
+            model_type="cross_encoder",
+            attention_backend=attention_backend,
+            chunked_prefill_size=-1,
+            disable_radix_cache=True,
+        ) as srt_runner:
+            srt_scores = srt_runner.forward(prompts).scores
+
+        # Both runners scored the same prompts, so a length mismatch is itself
+        # a failure; iterating over one list alone would hide missing scores.
+        self.assertEqual(
+            len(hf_scores),
+            len(srt_scores),
+            "HF and SRT runners returned a different number of scores",
+        )
+
+        for i, (hf_score, srt_score) in enumerate(zip(hf_scores, srt_scores)):
+            # A unittest assertion (unlike a bare ``assert``) still runs under
+            # ``python -O`` and lets the message carry the offending values.
+            self.assertLess(
+                abs(hf_score - srt_score),
+                score_tolerance,
+                f"cross encoder scores are not all close: prompt {i}, "
+                f"hf={hf_score}, srt={srt_score}, model={model_path}",
+            )
+
+    def preprocess_prompts(self, prompt):
+        """Expand one rerank case into a list of ``[query, document]`` pairs."""
+        query = prompt["query"]
+        return [[query, document] for document in prompt["documents"]]
+
+    def test_prefill_logits(self):
+        for model, tp_size, prefill_tolerance in MODELS:
+            for attention_backend in ATTENTION_BACKEND:
+                for query_docs in TEST_RERANK_QUERY_DOCS:
+                    prompts = self.preprocess_prompts(query_docs)
+                    for torch_dtype in TORCH_DTYPES:
+                        self.assert_close_prefill_logits(
+                            prompts,
+                            model,
+                            tp_size,
+                            torch_dtype,
+                            prefill_tolerance,
+                            attention_backend,
+                        )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/nightly/ascend/vlm_models/mmmu-val.yaml b/test/nightly/ascend/vlm_models/mmmu-val.yaml
new file mode 100644
index 000000000000..e63c76e08a40
--- /dev/null
+++ b/test/nightly/ascend/vlm_models/mmmu-val.yaml
@@ -0,0 +1 @@
+dataset_path: /root/.cache/huggingface/hub/datasets--lmms-lab--MMMU/snapshots/364f2e2eb107b36e07ff4c5a15f5947a759cef47
diff --git a/test/nightly/ascend/vlm_models/test_ascend_gemma_3_4b_it.py b/test/nightly/ascend/vlm_models/test_ascend_gemma_3_4b_it.py
new file mode 100644
index 000000000000..ee68f4b4df0d
--- /dev/null
+++ b/test/nightly/ascend/vlm_models/test_ascend_gemma_3_4b_it.py
@@ -0,0 +1,27 @@
+import unittest
+
+from test_vlm_utils import TestVLMModels
+
+from sglang.test.ci.ci_register import register_npu_ci
+
+# Register this file with the NPU CI runner; est_time is presumably in seconds.
+register_npu_ci(est_time=400, suite="nightly-4-npu-a3", nightly=True)
+
+
+class TestGemmaModels(TestVLMModels):
+    """MMMU nightly check for google/gemma-3-4b-it on Ascend NPU.
+
+    Server launch, evaluation and the accuracy assertion are implemented by
+    ``TestVLMModels._run_vlm_mmmu_test``; ``mmmu_accuracy`` is the minimum
+    score that method accepts.
+    """
+
+    model = "/root/.cache/modelscope/hub/models/google/gemma-3-4b-it"
+    mmmu_accuracy = 0.2
+
+    def test_vlm_mmmu_benchmark(self):
+        self._run_vlm_mmmu_test()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/nightly/ascend/vlm_models/test_ascend_janus_pro_1b.py b/test/nightly/ascend/vlm_models/test_ascend_janus_pro_1b.py
new file mode 100644
index 000000000000..da6c4d13d74a
--- /dev/null
+++ b/test/nightly/ascend/vlm_models/test_ascend_janus_pro_1b.py
@@ -0,0 +1,27 @@
+import unittest
+
+from test_vlm_utils import TestVLMModels
+
+from sglang.test.ci.ci_register import register_npu_ci
+
+# Register this file with the NPU CI runner; est_time is presumably in seconds.
+register_npu_ci(est_time=400, suite="nightly-4-npu-a3", nightly=True)
+
+
+class TestJanusPro1B(TestVLMModels):
+    """MMMU nightly check for deepseek-ai/Janus-Pro-1B on Ascend NPU.
+
+    Server launch, evaluation and the accuracy assertion are implemented by
+    ``TestVLMModels._run_vlm_mmmu_test``; ``mmmu_accuracy`` is the minimum
+    score that method accepts.
+    """
+
+    model = "/root/.cache/modelscope/hub/models/deepseek-ai/Janus-Pro-1B"
+    mmmu_accuracy = 0.2
+
+    def test_vlm_mmmu_benchmark(self):
+        self._run_vlm_mmmu_test()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/nightly/ascend/vlm_models/test_ascend_janus_pro_7b.py b/test/nightly/ascend/vlm_models/test_ascend_janus_pro_7b.py
new file mode 100644
index 000000000000..7e85b4f30b22
--- /dev/null
+++ b/test/nightly/ascend/vlm_models/test_ascend_janus_pro_7b.py
@@ -0,0 +1,27 @@
+import unittest
+
+from test_vlm_utils import TestVLMModels
+
+from sglang.test.ci.ci_register import register_npu_ci
+
+# Register this file with the NPU CI runner; est_time is presumably in seconds.
+register_npu_ci(est_time=400, suite="nightly-4-npu-a3", nightly=True)
+
+
+class TestJanusPro7B(TestVLMModels):
+    """MMMU nightly check for deepseek-ai/Janus-Pro-7B on Ascend NPU.
+
+    Server launch, evaluation and the accuracy assertion are implemented by
+    ``TestVLMModels._run_vlm_mmmu_test``; ``mmmu_accuracy`` is the minimum
+    score that method accepts.
+    """
+
+    model = "/root/.cache/modelscope/hub/models/deepseek-ai/Janus-Pro-7B"
+    mmmu_accuracy = 0.2
+
+    def test_vlm_mmmu_benchmark(self):
+        self._run_vlm_mmmu_test()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/nightly/ascend/vlm_models/test_ascend_mimo_vl_7b_rl.py b/test/nightly/ascend/vlm_models/test_ascend_mimo_vl_7b_rl.py
new file mode 100644
index 000000000000..fccc5bb4105a
--- /dev/null
+++ b/test/nightly/ascend/vlm_models/test_ascend_mimo_vl_7b_rl.py
@@ -0,0 +1,27 @@
+import unittest
+
+from test_vlm_utils import TestVLMModels
+
+from sglang.test.ci.ci_register import register_npu_ci
+
+# Register this file with the NPU CI runner; est_time is presumably in seconds.
+register_npu_ci(est_time=400, suite="nightly-4-npu-a3", nightly=True)
+
+
+class TestMiMoVL7BRL(TestVLMModels):
+    """MMMU nightly check for XiaomiMiMo/MiMo-VL-7B-RL on Ascend NPU.
+
+    Server launch, evaluation and the accuracy assertion are implemented by
+    ``TestVLMModels._run_vlm_mmmu_test``; ``mmmu_accuracy`` is the minimum
+    score that method accepts.
+    """
+
+    model = "/root/.cache/modelscope/hub/models/XiaomiMiMo/MiMo-VL-7B-RL"
+    mmmu_accuracy = 0.2
+
+    def test_vlm_mmmu_benchmark(self):
+        self._run_vlm_mmmu_test()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/nightly/ascend/vlm_models/test_ascend_minicpm_o_2_6.py b/test/nightly/ascend/vlm_models/test_ascend_minicpm_o_2_6.py
new file mode 100644
index 000000000000..048669280347
--- /dev/null
+++ b/test/nightly/ascend/vlm_models/test_ascend_minicpm_o_2_6.py
@@ -0,0 +1,27 @@
+import unittest
+
+from test_vlm_utils import TestVLMModels
+
+from sglang.test.ci.ci_register import register_npu_ci
+
+# Register this file with the NPU CI runner; est_time is presumably in seconds.
+register_npu_ci(est_time=400, suite="nightly-4-npu-a3", nightly=True)
+
+
+class TestMiniCPMO2_6(TestVLMModels):
+    """MMMU nightly check for openbmb/MiniCPM-o-2_6 on Ascend NPU.
+
+    Server launch, evaluation and the accuracy assertion are implemented by
+    ``TestVLMModels._run_vlm_mmmu_test``; ``mmmu_accuracy`` is the minimum
+    score that method accepts.
+    """
+
+    model = "/root/.cache/modelscope/hub/models/openbmb/MiniCPM-o-2_6"
+    mmmu_accuracy = 0.2
+
+    def test_vlm_mmmu_benchmark(self):
+        self._run_vlm_mmmu_test()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/nightly/ascend/vlm_models/test_ascend_minicpm_v_2_6.py b/test/nightly/ascend/vlm_models/test_ascend_minicpm_v_2_6.py
new file mode 100644
index 000000000000..3ed6fb15fec4
--- /dev/null
+++ b/test/nightly/ascend/vlm_models/test_ascend_minicpm_v_2_6.py
@@ -0,0 +1,27 @@
+import unittest
+
+from test_vlm_utils import TestVLMModels
+
+from sglang.test.ci.ci_register import register_npu_ci
+
+# Register this file with the NPU CI runner; est_time is presumably in seconds.
+register_npu_ci(est_time=400, suite="nightly-4-npu-a3", nightly=True)
+
+
+class TestMiniCPMV2_6(TestVLMModels):
+    """MMMU nightly check for openbmb/MiniCPM-V-2_6 on Ascend NPU.
+
+    Server launch, evaluation and the accuracy assertion are implemented by
+    ``TestVLMModels._run_vlm_mmmu_test``; ``mmmu_accuracy`` is the minimum
+    score that method accepts.
+    """
+
+    model = "/root/.cache/modelscope/hub/models/openbmb/MiniCPM-V-2_6"
+    mmmu_accuracy = 0.2
+
+    def test_vlm_mmmu_benchmark(self):
+        self._run_vlm_mmmu_test()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/nightly/ascend/vlm_models/test_ascend_phi4_multimodal_instruct.py b/test/nightly/ascend/vlm_models/test_ascend_phi4_multimodal_instruct.py
new file mode 100644
index 000000000000..191b541a6760
--- /dev/null
+++ b/test/nightly/ascend/vlm_models/test_ascend_phi4_multimodal_instruct.py
@@ -0,0 +1,27 @@
+import unittest
+
+from test_vlm_utils import TestVLMModels
+
+from sglang.test.ci.ci_register import register_npu_ci
+
+# Register this file with the NPU CI runner; est_time is presumably in seconds.
+register_npu_ci(est_time=400, suite="nightly-4-npu-a3", nightly=True)
+
+
+class TestPhi4MultimodalInstruct(TestVLMModels):
+    """MMMU nightly check for microsoft/Phi-4-multimodal-instruct on Ascend NPU.
+
+    Server launch, evaluation and the accuracy assertion are implemented by
+    ``TestVLMModels._run_vlm_mmmu_test``; ``mmmu_accuracy`` is the minimum
+    score that method accepts.
+    """
+
+    model = "/root/.cache/modelscope/hub/models/microsoft/Phi-4-multimodal-instruct"
+    mmmu_accuracy = 0.2
+
+    def test_vlm_mmmu_benchmark(self):
+        self._run_vlm_mmmu_test()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/nightly/ascend/vlm_models/test_ascend_qwen2_5_vl_3b_instruct.py b/test/nightly/ascend/vlm_models/test_ascend_qwen2_5_vl_3b_instruct.py
new file mode 100644
index 000000000000..dda18933bcf8
--- /dev/null
+++ b/test/nightly/ascend/vlm_models/test_ascend_qwen2_5_vl_3b_instruct.py
@@ -0,0 +1,27 @@
+import unittest
+
+from test_vlm_utils import TestVLMModels
+
+from sglang.test.ci.ci_register import register_npu_ci
+
+# Register this file with the NPU CI runner; est_time is presumably in seconds.
+register_npu_ci(est_time=400, suite="nightly-4-npu-a3", nightly=True)
+
+
+class TestQwen2_5VL3BInstruct(TestVLMModels):
+    """MMMU nightly check for Qwen/Qwen2.5-VL-3B-Instruct on Ascend NPU.
+
+    Server launch, evaluation and the accuracy assertion are implemented by
+    ``TestVLMModels._run_vlm_mmmu_test``; ``mmmu_accuracy`` is the minimum
+    score that method accepts.
+    """
+
+    model = "/root/.cache/modelscope/hub/models/Qwen/Qwen2.5-VL-3B-Instruct"
+    mmmu_accuracy = 0.2
+
+    def test_vlm_mmmu_benchmark(self):
+        self._run_vlm_mmmu_test()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/nightly/ascend/vlm_models/test_vlm_utils.py b/test/nightly/ascend/vlm_models/test_vlm_utils.py
new file mode 100644
index 000000000000..6c5eae64b2dd
--- /dev/null
+++ b/test/nightly/ascend/vlm_models/test_vlm_utils.py
@@ -0,0 +1,260 @@
+import glob
+import json
+import os
+import subprocess
+
+from sglang.srt.utils import kill_process_tree
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    CustomTestCase,
+    popen_launch_server,
+)
+
+
+class TestVLMModels(CustomTestCase):
+    """Shared base for the Ascend VLM nightly tests (MMMU via lmms-eval).
+
+    Subclasses set ``model`` and ``mmmu_accuracy`` and call
+    ``_run_vlm_mmmu_test()`` from a ``test_*`` method; this class defines no
+    ``test_*`` method of its own.
+    """
+
+    # Model passed to ``popen_launch_server`` and to lmms-eval; set by subclasses.
+    model = ""
+    # Minimum ``mmmu_acc,none`` score on ``mmmu_val`` required to pass.
+    mmmu_accuracy = 0.00
+    # Extra server arguments.  Every entry is a string so the list can be used
+    # as process argv without any conversion.
+    other_args = [
+        "--trust-remote-code",
+        "--cuda-graph-max-bs",
+        "32",
+        "--enable-multimodal",
+        "--mem-fraction-static",
+        "0.35",
+        "--log-level",
+        "info",
+        "--attention-backend",
+        "ascend",
+        "--disable-cuda-graph",
+        "--tp-size",
+        "4",
+    ]
+
+    @classmethod
+    def setUpClass(cls):
+        """Set the server URL, API key and launch timeout used by the tests."""
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        cls.api_key = "sk-123456"
+        cls.time_out = DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
+
+        # Set OpenAI API key and base URL environment variables. Needed for lmm-evals to work.
+        os.environ["OPENAI_API_KEY"] = cls.api_key
+        os.environ["OPENAI_API_BASE"] = f"{cls.base_url}/v1"
+
+    def run_mmmu_eval(
+        self,
+        model_version: str,
+        output_path: str,
+        limit: str,
+        *,
+        env: dict | None = None,
+    ):
+        """
+        Evaluate a VLM on the MMMU validation set with lmms-eval.
+
+        Only the validation set is used due to resource constraints; every
+        setting other than the arguments below is fixed.
+
+        Args:
+            model_version: Value passed to lmms-eval as ``model_version``.
+            output_path: Directory for the results; created if missing.
+            limit: Value passed to ``--limit``, as a string.
+            env: Optional environment for the lmms-eval process.  ``None``
+                (the default) inherits the current environment.
+
+        Raises:
+            subprocess.CalledProcessError: If lmms-eval exits non-zero.
+            subprocess.TimeoutExpired: If the run exceeds 3600 seconds.
+        """
+        # -------- fixed settings --------
+        model = "openai_compatible"
+        tp = 1
+        tasks = "mmmu_val"
+        batch_size = 2
+        log_suffix = "openai_compatible"
+        os.makedirs(output_path, exist_ok=True)
+        # Resolve the task config next to this file instead of assuming one
+        # fixed CI checkout location.
+        config_path = os.path.join(
+            os.path.dirname(os.path.abspath(__file__)), "mmmu-val.yaml"
+        )
+
+        # -------- compose --model_args --------
+        model_args = f'model_version="{model_version}",' f"tp={tp}"
+
+        # -------- build command list --------
+        cmd = [
+            "python3",
+            "-m",
+            "lmms_eval",
+            "--model",
+            model,
+            "--model_args",
+            model_args,
+            "--tasks",
+            tasks,
+            "--batch_size",
+            str(batch_size),
+            "--log_samples",
+            "--log_samples_suffix",
+            log_suffix,
+            "--output_path",
+            str(output_path),
+            "--limit",
+            limit,
+            "--config",
+            config_path,
+        ]
+
+        subprocess.run(
+            cmd,
+            check=True,
+            timeout=3600,
+            env=env,
+        )
+
+    def _run_vlm_mmmu_test(
+        self,
+        output_path="./logs",
+        test_name="",
+        custom_env=None,
+        capture_output=False,
+        limit="50",
+    ):
+        """
+        Launch the server for ``self.model``, run MMMU and check the score.
+
+        Args:
+            output_path: Path for output logs
+            test_name: Optional test name for logging
+            custom_env: Optional custom environment variables for the server
+            capture_output: Whether to capture server stdout/stderr
+            limit: Value passed to lmms-eval ``--limit``
+
+        Returns:
+            The captured server output if ``capture_output`` is true, else "".
+        """
+        print(f"\nTesting model: {self.model}{test_name}")
+
+        process = None
+        server_output = ""
+        # Bound before ``try`` so the ``finally`` clause can always read them.
+        stdout_file = None
+        stderr_file = None
+
+        try:
+            # Prepare environment variables
+            process_env = os.environ.copy()
+            if custom_env:
+                process_env.update(custom_env)
+
+            # Prepare stdout/stderr redirection if needed
+            if capture_output:
+                stdout_file = open("/tmp/server_stdout.log", "w")
+                stderr_file = open("/tmp/server_stderr.log", "w")
+
+            process = popen_launch_server(
+                self.model,
+                base_url=self.base_url,
+                timeout=self.time_out,
+                api_key=self.api_key,
+                other_args=self.other_args,
+                env=process_env,
+                return_stdout_stderr=(
+                    (stdout_file, stderr_file) if capture_output else None
+                ),
+            )
+
+            # Run evaluation
+            self.run_mmmu_eval(self.model, output_path, limit)
+
+            # Get the result file.  ``output_path`` may still hold results of
+            # an earlier run, so read the most recently written one.
+            result_files = glob.glob(f"{output_path}/*.json")
+            if not result_files:
+                raise FileNotFoundError(
+                    f"no lmms-eval result JSON found in {output_path}"
+                )
+            result_file_path = max(result_files, key=os.path.getmtime)
+
+            with open(result_file_path, "r") as f:
+                result = json.load(f)
+                print(f"Result{test_name}\n: {result}")
+
+            # Process the result
+            mmmu_accuracy = result["results"]["mmmu_val"]["mmmu_acc,none"]
+            print(
+                f"Model {self.model} achieved accuracy{test_name}: {mmmu_accuracy:.4f}"
+            )
+
+            # Capture server output if requested
+            if capture_output and process:
+                server_output = self._read_output_from_files()
+
+            # Assert performance meets expected threshold
+            self.assertGreaterEqual(
+                mmmu_accuracy,
+                self.mmmu_accuracy,
+                f"Model {self.model} accuracy ({mmmu_accuracy:.4f}) below expected threshold ({self.mmmu_accuracy:.4f}){test_name}",
+            )
+
+            return server_output
+
+        except Exception as e:
+            # Any error, including a failed assertion above, is reported as
+            # a test failure for this model.
+            print(f"Error testing {self.model}{test_name}: {e}")
+            self.fail(f"Test failed for {self.model}{test_name}: {e}")
+
+        finally:
+            # Ensure process cleanup happens regardless of success/failure
+            if process is not None and process.poll() is None:
+                print(f"Cleaning up process {process.pid}")
+                try:
+                    kill_process_tree(process.pid)
+                except Exception as e:
+                    print(f"Error killing process: {e}")
+
+            # clean up temporary files
+            if capture_output:
+                if stdout_file:
+                    stdout_file.close()
+                if stderr_file:
+                    stderr_file.close()
+                for filename in ["/tmp/server_stdout.log", "/tmp/server_stderr.log"]:
+                    try:
+                        if os.path.exists(filename):
+                            os.remove(filename)
+                    except Exception as e:
+                        print(f"Error removing {filename}: {e}")
+
+    def _read_output_from_files(self):
+        """Return both server log files as one string, each line tagged."""
+        output_lines = []
+
+        log_files = [
+            ("/tmp/server_stdout.log", "[STDOUT]"),
+            ("/tmp/server_stderr.log", "[STDERR]"),
+        ]
+        for filename, tag in log_files:
+            try:
+                if os.path.exists(filename):
+                    with open(filename, "r") as f:
+                        for line in f:
+                            output_lines.append(f"{tag} {line.rstrip()}")
+            except Exception as e:
+                print(f"Error reading {tag.lower()} file: {e}")
+
+        return "\n".join(output_lines)
diff --git a/test/run_suite.py b/test/run_suite.py
index c903f1ab23bf..461a30aa0736 100644
--- a/test/run_suite.py
+++ b/test/run_suite.py
@@ -10,6 +10,7 @@
     "cpu": HWBackend.CPU,
     "cuda": HWBackend.CUDA,
     "amd": HWBackend.AMD,
+    "npu": HWBackend.NPU,
 }
 
 # Per-commit test suites (run on every PR)
@@ -17,6 +18,7 @@
     HWBackend.CPU: ["default"],
     HWBackend.AMD: ["stage-a-test-1"],
     HWBackend.CUDA: ["stage-a-test-1"],
+    HWBackend.NPU: [],
 }
 
 # Nightly test suites (run nightly, organized by GPU configuration)
@@ -33,6 +35,12 @@
     ],
     HWBackend.AMD: ["nightly-amd"],
     HWBackend.CPU: [],
+    HWBackend.NPU: [
+        "nightly-1-npu-a3",
+        "nightly-2-npu-a3",
+        "nightly-4-npu-a3",
+        "nightly-16-npu-a3",
+    ],
 }