# Nightly regression tests on Ascend NPU (A3) runners.
#
# Three test jobs run the `nightly-*-npu-a3` suites through test/run_suite.py;
# `check-all-jobs` is the aggregate gate that fails when any of them failed or
# was cancelled.
name: Nightly Test (NPU)

on:
  schedule:
    - cron: '0 17 * * *'  # 17:00 UTC == 1:00 a.m. Beijing Time every day
  pull_request:
    branches:
      - main
    paths:
      - ".github/workflows/nightly-test-npu.yml"
  workflow_dispatch:

concurrency:
  group: nightly-test-npu-${{ github.ref }}
  cancel-in-progress: true

jobs:
  nightly-1-npu-a3:
    if: ${{ (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') }}
    runs-on: linux-aarch64-a3-2
    strategy:
      fail-fast: false
      matrix:
        part: [0, 1]
    container:
      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-a3-ubuntu22.04-py3.11
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install dependencies
        run: |
          # speed up by using infra cache services
          CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
          sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
          pip config set global.index-url http://${CACHING_URL}/pypi/simple
          pip config set global.extra-index-url "https://pypi.tuna.tsinghua.edu.cn/simple"
          pip config set global.trusted-host "${CACHING_URL} pypi.tuna.tsinghua.edu.cn"

          bash scripts/ci/npu_ci_install_dependency.sh a3
          # copy required file from our daily cache
          cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
          # copy download through proxy
          curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl

      - name: Print Log Information
        run: |
          bash scripts/ci/npu_log_print.sh

      - name: Run test
        timeout-minutes: 240
        env:
          SGLANG_USE_MODELSCOPE: true
          SGLANG_IS_IN_CI: true
          HF_ENDPOINT: https://hf-mirror.com
          TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
          PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
          STREAMS_PER_DEVICE: 32
        run: |
          export PATH="/usr/local/Ascend/8.3.RC1/compiler/bishengir/bin:${PATH}"
          pip install sentence_transformers accelerate
          cd test
          python3 run_suite.py --hw npu --suite nightly-1-npu-a3 --nightly --continue-on-error --timeout-per-file 3600 --auto-partition-id ${{ matrix.part }} --auto-partition-size 2

  nightly-2-npu-a3:
    if: ${{ (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') }}
    runs-on: linux-aarch64-a3-2
    strategy:
      fail-fast: false
      matrix:
        part: [0]
    container:
      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-a3-ubuntu22.04-py3.11
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install dependencies
        run: |
          # speed up by using infra cache services
          CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
          sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
          pip config set global.index-url http://${CACHING_URL}/pypi/simple
          pip config set global.extra-index-url "https://pypi.tuna.tsinghua.edu.cn/simple"
          pip config set global.trusted-host "${CACHING_URL} pypi.tuna.tsinghua.edu.cn"

          bash scripts/ci/npu_ci_install_dependency.sh a3
          # copy required file from our daily cache
          cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
          # copy download through proxy
          curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl

      - name: Print Log Information
        run: |
          bash scripts/ci/npu_log_print.sh

      - name: Run test
        timeout-minutes: 240
        env:
          SGLANG_USE_MODELSCOPE: true
          SGLANG_IS_IN_CI: true
          HF_ENDPOINT: https://hf-mirror.com
          TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
          PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
          STREAMS_PER_DEVICE: 32
        run: |
          export PATH="/usr/local/Ascend/8.3.RC1/compiler/bishengir/bin:${PATH}"
          pip install sentence_transformers accelerate
          cd test
          python3 run_suite.py --hw npu --suite nightly-2-npu-a3 --nightly --continue-on-error --timeout-per-file 3600 --auto-partition-id ${{ matrix.part }} --auto-partition-size 1

  nightly-4-npu-a3:
    if: ${{ (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') }}
    runs-on: linux-aarch64-a3-4
    strategy:
      fail-fast: false
      matrix:
        part: [0]
    container:
      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-a3-ubuntu22.04-py3.11
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install dependencies
        run: |
          # speed up by using infra cache services
          CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
          sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
          pip config set global.index-url http://${CACHING_URL}/pypi/simple
          pip config set global.extra-index-url "https://pypi.tuna.tsinghua.edu.cn/simple"
          pip config set global.trusted-host "${CACHING_URL} pypi.tuna.tsinghua.edu.cn"

          bash scripts/ci/npu_ci_install_dependency.sh a3
          # copy required file from our daily cache
          cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
          # copy download through proxy
          curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl

      - name: Print Log Information
        run: |
          bash scripts/ci/npu_log_print.sh

      - name: Run test
        timeout-minutes: 240
        env:
          SGLANG_USE_MODELSCOPE: true
          SGLANG_IS_IN_CI: true
          HF_ENDPOINT: https://hf-mirror.com
          TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
          PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
          STREAMS_PER_DEVICE: 32
        run: |
          export PATH="/usr/local/Ascend/8.3.RC1/compiler/bishengir/bin:${PATH}"
          hf download lmms-lab/MMMU --repo-type dataset
          pip install sentence_transformers torchaudio==2.8.0 torch_npu==2.8.0
          # Requirement specifiers containing '>' must be quoted: unquoted, the
          # shell treats e.g. `wandb>=0.16.0` as `wandb` plus a redirection of
          # stdout into a file named `=0.16.0`, silently dropping the constraint.
          pip install protobuf==6.31.1 zss pre-commit "wandb>=0.16.0" tenacity==8.3.0 loguru openpyxl latex2sympy2 zstandard transformers-stream-generator tqdm-multiprocess pycocoevalcap
          # `peft==0.2.0` already satisfies the former duplicate `peft>=0.2.0`.
          pip install yt-dlp sentencepiece==0.1.99 nltk av ftfy sqlitedict==2.1.0 "sacrebleu>=1.5.0" pytablewriter peft==0.2.0 black==24.1.0 isort==5.13.2 "accelerate>=0.29.1"
          pip install jsonlines httpx==0.25.0 "evaluate>=0.4.0" datasets==2.16.1 numexpr xgrammar==0.1.25 numpy==1.26.4 dotenv
          git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
          cd ./lmms-eval
          # NOTE(review): the install runs in the background and is only given a
          # fixed 120 s head start; confirm this is intentional rather than a
          # workaround that should become a foreground `pip install .`.
          nohup pip install . > lmmslog.txt 2>&1 &
          sleep 120
          export PYTHONPATH=$PYTHONPATH:$(pwd)
          cd ../
          cd test
          python3 run_suite.py --hw npu --suite nightly-4-npu-a3 --nightly --continue-on-error --timeout-per-file 3600 --auto-partition-id ${{ matrix.part }} --auto-partition-size 1

  check-all-jobs:
    if: github.repository == 'sgl-project/sglang' && always()
    # Every test job must be listed here, otherwise its result is not part of
    # `needs.*.result` and a failure would go unnoticed.
    needs:
      - nightly-1-npu-a3
      - nightly-2-npu-a3
      - nightly-4-npu-a3
    runs-on: ubuntu-latest
    container:
      image: docker.m.daocloud.io/ubuntu:22.04
    steps:
      - name: Check if any job failed
        run: |
          if [[ "${{ contains(needs.*.result, 'failure') }}" == "true" ]]; then
            echo "One or more nightly test jobs failed"
            exit 1
          fi
          if [[ "${{ contains(needs.*.result, 'cancelled') }}" == "true" ]]; then
            echo "One or more nightly test jobs were cancelled"
            exit 1
          fi
          echo "All nightly test jobs passed"
def register_npu_ci(
    est_time: float,
    suite: str,
    nightly: bool = False,
    disabled: Optional[str] = None,
) -> None:
    """Declare an NPU CI registration for the calling test file.

    The call is a marker only: it is discovered by parsing the test file's
    AST, and executing it at runtime does nothing.

    Args:
        est_time: Estimated run time of the test file (units not stated
            here -- presumably seconds; confirm against the suite runner).
        suite: Name of the suite the file belongs to.
        nightly: Whether the file is part of the nightly run.
        disabled: Optional free-text reason; when given, the file is marked
            as disabled.
    """
    # Intentionally empty: all arguments are consumed statically, not here.
# Device status and installed Python packages.
npu-smi info
pip list

# get_version FILE NAME
# Print "NAME version: v<version>" using the __version__ assignment found in
# FILE, or "NAME version: unknown" when FILE is missing or cannot be parsed.
get_version() {
    [ -f "$1" ] && python3 -c 'import re, sys; print(sys.argv[2] + " version: v" + re.search(r"__version__\s*=\s*[\"'"'"'](.*?)[\"'"'"']", open(sys.argv[1]).read()).group(1))' "$1" "$2" 2>/dev/null || echo "$2 version: unknown"
}
get_version "./python/sglang/version.py" "sglang"
get_version "./sgl-kernel/python/sgl_kernel/version.py" "sgl_kernel"

SGLANG_URL="https://github.com/sgl-project/sglang.git"
SGL_KERNEL_URL="https://github.com/sgl-project/sgl-kernel-npu.git"
SGLANG_BRANCH="main"
SGL_KERNEL_BRANCH="main"

# get_sha NAME URL BRANCH
# Print the commit SHA that BRANCH currently points to in the remote
# repository at URL, or "Not Found" when it cannot be determined.
get_sha() {
    local name="$1"
    local url="$2"
    local branch="$3"
    local sha
    # This script runs under `set -euo pipefail`: without `|| true`, a failing
    # `git ls-remote` (e.g. no network) would abort the whole script here and
    # the "Not Found" fallback below could never be printed.
    sha=$(git ls-remote "$url" "refs/heads/$branch" | cut -f1) || true
    echo "$name SHA for branch $branch: ${sha:-"Not Found"}"
}
get_sha "sglang" "$SGLANG_URL" "$SGLANG_BRANCH"
get_sha "sgl-kernel" "$SGL_KERNEL_URL" "$SGL_KERNEL_BRANCH"
def _truncate_prompts(self, prompts, model_path):
    """Return ``prompts`` with over-long entries shortened for ``model_path``.

    The token limit is the model config's ``max_position_embeddings``
    (2048 when the config has no such attribute).  A prompt whose token
    count exceeds the limit is replaced by the decoded text of its first
    ``limit - 1`` tokens; every other prompt is returned unchanged.
    """
    limit = getattr(
        AutoConfig.from_pretrained(model_path), "max_position_embeddings", 2048
    )
    tok = AutoTokenizer.from_pretrained(model_path)

    def _fit(text):
        # Tokenize without truncation so the real length can be measured.
        ids = tok(text, return_tensors="pt", truncation=False).input_ids[0]
        if len(ids) <= limit:
            return text
        return tok.decode(ids[: limit - 1], skip_special_tokens=True)

    return [_fit(text) for text in prompts]
class GSM8KAscendMixin(ABC):
    """Shared GSM8K few-shot accuracy check for a model served with the Ascend backend.

    Combine this mixin with a test-case base class and override ``model``
    (and, where needed, ``accuracy`` and ``other_args``).  The server is
    launched once per class in ``setUpClass`` and its process tree is killed
    in ``tearDownClass``.
    """

    # Model identifier/path handed to ``popen_launch_server``.
    model = ""
    # ``test_gsm8k`` requires the measured accuracy to be strictly greater.
    accuracy = 0.00
    # Extra command-line arguments for the server launch.
    other_args = [
        "--trust-remote-code",
        "--mem-fraction-static",
        "0.8",
        "--attention-backend",
        "ascend",
        "--disable-cuda-graph",
    ]

    # Environment overrides for the server process only.
    _SERVER_ENV = {
        "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
        "ASCEND_MF_STORE_URL": "tcp://127.0.0.1:24666",
        "HCCL_BUFFSIZE": "200",
        "SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK": "24",
        "USE_VLLM_CUSTOM_ALLREDUCE": "1",
        "HCCL_EXEC_TIMEOUT": "200",
        "STREAMS_PER_DEVICE": "32",
        # Was misspelled "SGLANG_ENBLE_TORCH_COMILE", a name nothing reads.
        "SGLANG_ENABLE_TORCH_COMPILE": "1",
        "AUTO_USE_UC_MEMORY": "0",
        "P2P_HCCL_BUFFSIZE": "20",
    }

    @classmethod
    def _build_server_env(cls):
        """Return a copy of the current environment with the server overrides applied.

        ``os.environ`` itself is left untouched so the settings do not leak
        into other test classes running in the same process.
        """
        env = os.environ.copy()
        env.update(cls._SERVER_ENV)
        return env

    @classmethod
    def setUpClass(cls):
        """Launch the server for ``cls.model`` and keep its process handle."""
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=cls.other_args,
            env=cls._build_server_env(),
        )

    @classmethod
    def tearDownClass(cls):
        """Kill the launched server together with its child processes."""
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        """Run the 5-shot GSM8K evaluation and compare against ``accuracy``."""
        args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            # The port is the last ':'-separated component of the base URL.
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval(args)
        # assertGreater also fails on equality, so the message says
        # "not greater than" rather than "lower than".
        self.assertGreater(
            metrics["accuracy"],
            self.accuracy,
            f'Accuracy of {self.model} is {metrics["accuracy"]}, '
            f"which is not greater than {self.accuracy}",
        )
class TestC4AI(CustomTestCase):
    """GSM8K few-shot accuracy check for c4ai-command-r-v01 on the Ascend backend.

    The server is launched once per class with tensor parallel size 2 and a
    custom chat template that lives next to this test file.
    """

    model = "/root/.cache/modelscope/hub/models/CohereForAI/c4ai-command-r-v01"
    # ``test_gsm8k`` requires the measured accuracy to be strictly greater.
    accuracy = 0.05

    @classmethod
    def setUpClass(cls):
        """Launch the server for ``cls.model`` and keep its process handle."""
        cls.base_url = DEFAULT_URL_FOR_TEST
        # Resolve the template relative to this file instead of hard-coding
        # the CI container's checkout directory (/__w/sglang/sglang/...), so
        # the test also works from any other checkout location.
        chat_template_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "tool_chat_template_c4ai_command_r_v01.jinja",
        )

        other_args = [
            "--trust-remote-code",
            "--mem-fraction-static",
            "0.8",
            "--attention-backend",
            "ascend",
            "--disable-cuda-graph",
            "--chat-template",
            chat_template_path,
            "--tp-size",
            "2",
            "--dtype",
            "bfloat16",
        ]
        # Overrides are applied to a copy, leaving os.environ untouched.
        env = os.environ.copy()
        env.update(
            {
                "PYTORCH_NPU_ALLOC_CONF": "expandable_segments:True",
                "ASCEND_MF_STORE_URL": "tcp://127.0.0.1:24666",
                "HCCL_BUFFSIZE": "200",
                "SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK": "24",
                "USE_VLLM_CUSTOM_ALLREDUCE": "1",
                "HCCL_EXEC_TIMEOUT": "200",
                "STREAMS_PER_DEVICE": "32",
                "SGLANG_ENABLE_TORCH_COMPILE": "1",
            }
        )

        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=other_args,
            env=env,
        )

    @classmethod
    def tearDownClass(cls):
        """Kill the launched server together with its child processes."""
        kill_process_tree(cls.process.pid)

    def test_gsm8k(self):
        """Run the 5-shot GSM8K evaluation and compare against ``accuracy``."""
        args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            # The port is the last ':'-separated component of the base URL.
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval(args)
        # assertGreater also fails on equality, so the message says
        # "not greater than" rather than "lower than".
        self.assertGreater(
            metrics["accuracy"],
            self.accuracy,
            f'Accuracy of {self.model} is {metrics["accuracy"]}, '
            f"which is not greater than {self.accuracy}",
        )
class TestGLM49BChat(GSM8KAscendMixin, CustomTestCase):
    """GSM8K accuracy check (via ``GSM8KAscendMixin``) for glm-4-9b-chat."""

    # Threshold of 0: any strictly positive accuracy passes.
    accuracy = 0.00
    model = "/root/.cache/modelscope/hub/models/ZhipuAI/glm-4-9b-chat"
class TestMistral7B(GSM8KAscendMixin, CustomTestCase):
    """GSM8K accuracy check (via ``GSM8KAscendMixin``) for granite-3.1-8b-instruct.

    NOTE(review): the class name does not match the model under test; it is
    kept unchanged here so the test identifier stays stable.
    """

    # Measured accuracy must be strictly greater than this value.
    accuracy = 0.695
    model = "/root/.cache/modelscope/hub/models/ibm-granite/granite-3.1-8b-instruct"
class TestMistral7B(GSM8KAscendMixin, CustomTestCase):
    """GSM8K accuracy check (via ``GSM8KAscendMixin``) for Llama-2-7B.

    NOTE(review): the class name does not match the model under test; it is
    kept unchanged here so the test identifier stays stable.
    """

    # Measured accuracy must be strictly greater than this value.
    accuracy = 0.18
    model = "/root/.cache/modelscope/hub/models/LLM-Research/Llama-2-7B"
class TestMistral7B(GSM8KAscendMixin, CustomTestCase):
    """GSM8K accuracy check (via ``GSM8KAscendMixin``) for Mistral-7B-Instruct-v0.2."""

    # Measured accuracy must be strictly greater than this value.
    accuracy = 0.375
    model = "/root/.cache/modelscope/hub/models/mistralai/Mistral-7B-Instruct-v0.2"
class TestMistral7B(GSM8KAscendMixin, CustomTestCase):
    """GSM8K accuracy check (via ``GSM8KAscendMixin``) for SmolLM-1.7B.

    NOTE(review): the class name does not match the model under test; it is
    kept unchanged here so the test identifier stays stable.
    """

    # Measured accuracy must be strictly greater than this value.
    accuracy = 0.05
    model = "/root/.cache/modelscope/hub/models/HuggingFaceTB/SmolLM-1.7B"
    # Server launch arguments; compared with the mixin's list these add an
    # explicit bfloat16 dtype.
    other_args = [
        "--trust-remote-code",
        "--mem-fraction-static", "0.8",
        "--attention-backend", "ascend",
        "--disable-cuda-graph",
        "--dtype", "bfloat16",
    ]
class TestCrossEncoderModels(CustomTestCase):
    """Compare cross-encoder (rerank) scores from ``HFRunner`` and ``SRTRunner``."""

    @classmethod
    def setUpClass(cls):
        # Force the "spawn" start method for multiprocessing.
        mp.set_start_method("spawn", force=True)

    def assert_close_prefill_logits(
        self,
        prompts,
        model_path,
        tp_size,
        torch_dtype,
        score_tolerance,
        attention_backend,
    ) -> None:
        """Assert both runners score ``prompts`` within ``score_tolerance``.

        The HF runner is run and closed first, then the SRT runner; each SRT
        score is compared with the HF score at the same index.
        """
        hf_runner_ctx = HFRunner(
            model_path,
            torch_dtype=torch_dtype,
            model_type="cross_encoder",
        )
        with hf_runner_ctx as hf_runner:
            hf_scores = hf_runner.forward(prompts).scores

        srt_runner_ctx = SRTRunner(
            model_path,
            tp_size=tp_size,
            torch_dtype=torch_dtype,
            model_type="cross_encoder",
            attention_backend=attention_backend,
            chunked_prefill_size=-1,
            disable_radix_cache=True,
        )
        with srt_runner_ctx as srt_runner:
            srt_scores = srt_runner.forward(prompts).scores

        for idx, srt_score in enumerate(srt_scores):
            gap = abs(hf_scores[idx] - srt_score)
            assert gap < score_tolerance, "cross encoder scores are not all close"

    def preprocess_prompts(self, prompt):
        """Expand one query/documents entry into ``[query, document]`` pairs."""
        query = prompt["query"]
        return [[query, document] for document in prompt["documents"]]

    def test_prefill_logits(self):
        """Check every model / backend / query set / dtype combination."""
        for model_path, tp_size, tolerance in MODELS:
            for backend in ATTENTION_BACKEND:
                for query_docs in TEST_RERANK_QUERY_DOCS:
                    pairs = self.preprocess_prompts(query_docs)
                    for dtype in TORCH_DTYPES:
                        self.assert_close_prefill_logits(
                            pairs,
                            model_path,
                            tp_size,
                            dtype,
                            tolerance,
                            backend,
                        )
# --- test/nightly/ascend/vlm_models/test_ascend_gemma_3_4b_it.py ---
class TestGemmaModels(TestVLMModels):
    """MMMU validation benchmark case for the gemma-3-4b-it checkpoint."""

    # Local ModelScope cache path of the checkpoint under test.
    model = "/root/.cache/modelscope/hub/models/google/gemma-3-4b-it"
    # Minimum accepted MMMU accuracy; the inherited runner asserts that the
    # measured accuracy is >= this value.
    mmmu_accuracy = 0.2

    def test_vlm_mmmu_benchmark(self):
        """Run the inherited MMMU benchmark flow with this class's settings."""
        self._run_vlm_mmmu_test()


# --- test/nightly/ascend/vlm_models/test_ascend_janus_pro_1b.py ---
# NOTE(review): the class name looks copied from the Gemma test although the
# model is Janus-Pro-1B; consider renaming (the Janus-Pro-7B test uses
# TestJanusPro7B).
class TestGemmaModels(TestVLMModels):
    """MMMU validation benchmark case for the Janus-Pro-1B checkpoint."""

    # Local ModelScope cache path of the checkpoint under test.
    model = "/root/.cache/modelscope/hub/models/deepseek-ai/Janus-Pro-1B"
    # Minimum accepted MMMU accuracy; the inherited runner asserts that the
    # measured accuracy is >= this value.
    mmmu_accuracy = 0.2

    def test_vlm_mmmu_benchmark(self):
        """Run the inherited MMMU benchmark flow with this class's settings."""
        self._run_vlm_mmmu_test()


# --- test/nightly/ascend/vlm_models/test_ascend_janus_pro_7b.py ---
class TestJanusPro7B(TestVLMModels):
    """MMMU validation benchmark case for the Janus-Pro-7B checkpoint."""

    # Local ModelScope cache path of the checkpoint under test.
    model = "/root/.cache/modelscope/hub/models/deepseek-ai/Janus-Pro-7B"
    # Minimum accepted MMMU accuracy; the inherited runner asserts that the
    # measured accuracy is >= this value.
    mmmu_accuracy = 0.2

    def test_vlm_mmmu_benchmark(self):
        """Run the inherited MMMU benchmark flow with this class's settings."""
        self._run_vlm_mmmu_test()
# --- test/nightly/ascend/vlm_models/test_ascend_mimo_vl_7b_rl.py ---
# NOTE(review): the class name looks copied from the Gemma test although the
# model is MiMo-VL-7B-RL; consider renaming.
class TestGemmaModels(TestVLMModels):
    """MMMU validation benchmark case for the MiMo-VL-7B-RL checkpoint."""

    # Local ModelScope cache path of the checkpoint under test.
    model = "/root/.cache/modelscope/hub/models/XiaomiMiMo/MiMo-VL-7B-RL"
    # Minimum accepted MMMU accuracy; the inherited runner asserts that the
    # measured accuracy is >= this value.
    mmmu_accuracy = 0.2

    def test_vlm_mmmu_benchmark(self):
        """Run the inherited MMMU benchmark flow with this class's settings."""
        self._run_vlm_mmmu_test()


# --- test/nightly/ascend/vlm_models/test_ascend_minicpm_o_2_6.py ---
# NOTE(review): the class name looks copied from the Gemma test although the
# model is MiniCPM-o-2_6; consider renaming.
class TestGemmaModels(TestVLMModels):
    """MMMU validation benchmark case for the MiniCPM-o-2_6 checkpoint."""

    # Local ModelScope cache path of the checkpoint under test.
    model = "/root/.cache/modelscope/hub/models/openbmb/MiniCPM-o-2_6"
    # Minimum accepted MMMU accuracy; the inherited runner asserts that the
    # measured accuracy is >= this value.
    mmmu_accuracy = 0.2

    def test_vlm_mmmu_benchmark(self):
        """Run the inherited MMMU benchmark flow with this class's settings."""
        self._run_vlm_mmmu_test()


# --- test/nightly/ascend/vlm_models/test_ascend_minicpm_v_2_6.py ---
# NOTE(review): the class name looks copied from the Gemma test although the
# model is MiniCPM-V-2_6; consider renaming.
class TestGemmaModels(TestVLMModels):
    """MMMU validation benchmark case for the MiniCPM-V-2_6 checkpoint."""

    # Local ModelScope cache path of the checkpoint under test.
    model = "/root/.cache/modelscope/hub/models/openbmb/MiniCPM-V-2_6"
    # Minimum accepted MMMU accuracy; the inherited runner asserts that the
    # measured accuracy is >= this value.
    mmmu_accuracy = 0.2

    def test_vlm_mmmu_benchmark(self):
        """Run the inherited MMMU benchmark flow with this class's settings."""
        self._run_vlm_mmmu_test()
# --- test/nightly/ascend/vlm_models/test_ascend_phi4_multimodal_instruct.py ---
# NOTE(review): the class name looks copied from the Gemma test although the
# model is Phi-4-multimodal-instruct; consider renaming.
class TestGemmaModels(TestVLMModels):
    """MMMU validation benchmark case for the Phi-4-multimodal-instruct checkpoint."""

    # Local ModelScope cache path of the checkpoint under test.
    model = "/root/.cache/modelscope/hub/models/microsoft/Phi-4-multimodal-instruct"
    # Minimum accepted MMMU accuracy; the inherited runner asserts that the
    # measured accuracy is >= this value.
    mmmu_accuracy = 0.2

    def test_vlm_mmmu_benchmark(self):
        """Run the inherited MMMU benchmark flow with this class's settings."""
        self._run_vlm_mmmu_test()


# --- test/nightly/ascend/vlm_models/test_ascend_qwen2_5_vl_3b_instruct.py ---
# NOTE(review): the class name looks copied from the Gemma test although the
# model is Qwen2.5-VL-3B-Instruct; consider renaming.
class TestGemmaModels(TestVLMModels):
    """MMMU validation benchmark case for the Qwen2.5-VL-3B-Instruct checkpoint."""

    # Local ModelScope cache path of the checkpoint under test.
    model = "/root/.cache/modelscope/hub/models/Qwen/Qwen2.5-VL-3B-Instruct"
    # Minimum accepted MMMU accuracy; the inherited runner asserts that the
    # measured accuracy is >= this value.
    mmmu_accuracy = 0.2

    def test_vlm_mmmu_benchmark(self):
        """Run the inherited MMMU benchmark flow with this class's settings."""
        self._run_vlm_mmmu_test()
class TestVLMModels(CustomTestCase):
    """Shared base for VLM nightly tests that score a model on MMMU (val).

    Subclasses set ``model`` and ``mmmu_accuracy`` and call
    ``_run_vlm_mmmu_test`` from a test method. The helper launches a server
    for ``model``, runs ``lmms_eval`` against it through the
    OpenAI-compatible endpoint, and asserts the reported accuracy.
    """

    # Model path/name passed to the server launcher; set by subclasses.
    model = ""
    # Minimum accepted "mmmu_acc,none" value; set by subclasses.
    mmmu_accuracy = 0.00
    # Extra server CLI arguments. Every entry is a string so the list is a
    # valid argv fragment without relying on the launcher to convert values.
    other_args = [
        "--trust-remote-code",
        "--cuda-graph-max-bs",
        "32",
        "--enable-multimodal",
        "--mem-fraction-static",
        "0.35",
        "--log-level",
        "info",
        "--attention-backend",
        "ascend",
        "--disable-cuda-graph",
        "--tp-size",
        "4",
    ]

    @classmethod
    def setUpClass(cls):
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        cls.time_out = DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH

        # Set OpenAI API key and base URL environment variables. Needed for lmm-evals to work.
        os.environ["OPENAI_API_KEY"] = cls.api_key
        os.environ["OPENAI_API_BASE"] = f"{cls.base_url}/v1"

    def run_mmmu_eval(
        self,
        model_version: str,
        output_path: str,
        limit: str,
        *,
        env: dict | None = None,
    ):
        """Evaluate a VLM on the MMMU validation set with lmms-eval.

        Only the validation set is used, due to resource constraints.

        Args:
            model_version: Value of the ``model_version`` entry in
                ``--model_args``.
            output_path: Directory for lmms-eval output; created if missing.
            limit: Value passed to ``--limit``.
            env: Environment for the lmms-eval subprocess. ``None`` inherits
                the current process environment.

        Raises:
            subprocess.CalledProcessError: If lmms-eval exits non-zero.
            subprocess.TimeoutExpired: If lmms-eval runs longer than 3600 s.
        """
        # -------- fixed settings --------
        model = "openai_compatible"
        tp = 1
        tasks = "mmmu_val"
        batch_size = 2
        log_suffix = "openai_compatible"
        os.makedirs(output_path, exist_ok=True)

        # -------- compose --model_args --------
        model_args = f'model_version="{model_version}",tp={tp}'

        # The task config sits next to this module; resolving it from
        # __file__ keeps the command valid outside the CI workspace layout.
        config_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "mmmu-val.yaml"
        )

        # -------- build command list --------
        cmd = [
            "python3",
            "-m",
            "lmms_eval",
            "--model",
            model,
            "--model_args",
            model_args,
            "--tasks",
            tasks,
            "--batch_size",
            str(batch_size),
            "--log_samples",
            "--log_samples_suffix",
            log_suffix,
            "--output_path",
            str(output_path),
            "--limit",
            str(limit),
            "--config",
            config_path,
        ]

        subprocess.run(
            cmd,
            check=True,
            timeout=3600,
            env=env,
        )

    def _run_vlm_mmmu_test(
        self,
        output_path="./logs",
        test_name="",
        custom_env=None,
        capture_output=False,
        limit="50",
    ):
        """Launch the server for ``self.model``, run MMMU, and check accuracy.

        Args:
            output_path: Directory where lmms-eval writes its result JSON.
            test_name: Optional suffix appended to log and failure messages.
            custom_env: Optional extra environment variables for the server
                process.
            capture_output: Whether to redirect server stdout/stderr to
                temporary log files and return their contents.
            limit: Value passed to lmms-eval ``--limit``.

        Returns:
            The captured server output when ``capture_output`` is true,
            otherwise an empty string.

        Any exception (including a failed accuracy assertion) is reported
        through ``self.fail``; the server process is always cleaned up.
        """
        print(f"\nTesting model: {self.model}{test_name}")

        process = None
        server_output = ""
        # Bound before ``try`` so the ``finally`` clause can always read them.
        stdout_file = None
        stderr_file = None

        try:
            # Prepare environment variables
            process_env = os.environ.copy()
            if custom_env:
                process_env.update(custom_env)

            # Prepare stdout/stderr redirection if needed
            if capture_output:
                stdout_file = open("/tmp/server_stdout.log", "w")
                stderr_file = open("/tmp/server_stderr.log", "w")

            process = popen_launch_server(
                self.model,
                base_url=self.base_url,
                timeout=self.time_out,
                api_key=self.api_key,
                other_args=self.other_args,
                env=process_env,
                return_stdout_stderr=(
                    (stdout_file, stderr_file) if capture_output else None
                ),
            )

            # Run evaluation
            self.run_mmmu_eval(self.model, output_path, limit)

            # Get the result file. Prefer the newest one so that a stale
            # result left by an earlier run in the same directory is not read.
            result_files = glob.glob(f"{output_path}/*.json")
            if not result_files:
                raise FileNotFoundError(
                    f"No lmms-eval result file (*.json) found in {output_path}"
                )
            result_file_path = max(result_files, key=os.path.getmtime)

            with open(result_file_path, "r") as f:
                result = json.load(f)
                print(f"Result{test_name}\n: {result}")

            # Process the result
            mmmu_accuracy = result["results"]["mmmu_val"]["mmmu_acc,none"]
            print(
                f"Model {self.model} achieved accuracy{test_name}: {mmmu_accuracy:.4f}"
            )

            # Capture server output if requested
            if capture_output and process:
                server_output = self._read_output_from_files()

            # Assert performance meets expected threshold
            self.assertGreaterEqual(
                mmmu_accuracy,
                self.mmmu_accuracy,
                f"Model {self.model} accuracy ({mmmu_accuracy:.4f}) below expected threshold ({self.mmmu_accuracy:.4f}){test_name}",
            )

            return server_output

        except Exception as e:
            print(f"Error testing {self.model}{test_name}: {e}")
            self.fail(f"Test failed for {self.model}{test_name}: {e}")

        finally:
            # Ensure process cleanup happens regardless of success/failure
            if process is not None and process.poll() is None:
                print(f"Cleaning up process {process.pid}")
                try:
                    kill_process_tree(process.pid)
                except Exception as e:
                    print(f"Error killing process: {e}")

            # clean up temporary files
            if capture_output:
                if stdout_file:
                    stdout_file.close()
                if stderr_file:
                    stderr_file.close()
                for filename in ["/tmp/server_stdout.log", "/tmp/server_stderr.log"]:
                    try:
                        if os.path.exists(filename):
                            os.remove(filename)
                    except Exception as e:
                        print(f"Error removing {filename}: {e}")

    def _read_output_from_files(self):
        """Return the captured server logs as one string.

        Each line is prefixed with ``[STDOUT]`` or ``[STDERR]``. Missing
        files are skipped; read errors are printed and otherwise ignored.
        """
        output_lines = []

        log_files = [
            ("/tmp/server_stdout.log", "[STDOUT]"),
            ("/tmp/server_stderr.log", "[STDERR]"),
        ]
        for filename, tag in log_files:
            try:
                if os.path.exists(filename):
                    with open(filename, "r") as f:
                        for line in f:
                            output_lines.append(f"{tag} {line.rstrip()}")
            except Exception as e:
                print(f"Error reading {tag.lower()} file: {e}")

        return "\n".join(output_lines)
c903f1ab23bf..461a30aa0736 100644 --- a/test/run_suite.py +++ b/test/run_suite.py @@ -10,6 +10,7 @@ "cpu": HWBackend.CPU, "cuda": HWBackend.CUDA, "amd": HWBackend.AMD, + "npu": HWBackend.NPU, } # Per-commit test suites (run on every PR) @@ -17,6 +18,7 @@ HWBackend.CPU: ["default"], HWBackend.AMD: ["stage-a-test-1"], HWBackend.CUDA: ["stage-a-test-1"], + HWBackend.NPU: [], } # Nightly test suites (run nightly, organized by GPU configuration) @@ -33,6 +35,12 @@ ], HWBackend.AMD: ["nightly-amd"], HWBackend.CPU: [], + HWBackend.NPU: [ + "nightly-1-npu-a3", + "nightly-2-npu-a3", + "nightly-4-npu-a3", + "nightly-16-npu-a3", + ], }