41 changes: 41 additions & 0 deletions .github/configs/nvidia-master.yaml
@@ -1727,6 +1727,29 @@ dsv4-fp4-b200-vllm:
- { tp: 8, conc-start: 1, conc-end: 32 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 1024 }

# DeepSeek-V4-Pro TRTLLM bring-up. This uses a TensorRT-LLM image built from
# NVIDIA/TensorRT-LLM@feat/deepseek_v4; the benchmark script keeps a guarded
# source-build fallback if the image is missing the required DSv4 support.
dsv4-fp4-b200-trt:
image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b200-dsv4
precision: fp4
framework: trt
multinode: false
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 32 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 32 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 }

# MTP variant of dsv4-fp4-b200-vllm. Mirrors the base search space and adds
# --speculative-config '{"method":"mtp","num_speculative_tokens":2}'.
dsv4-fp4-b200-vllm-mtp:
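As a point of reference for the comment above, the -mtp variant presumably differs from the base config only in the server launch flags; a minimal sketch, assuming a stock vllm serve invocation (every argument other than --speculative-config is an illustrative placeholder, not taken from this repo):

```bash
# Hypothetical vLLM launch for the -mtp variant; only the --speculative-config
# value comes from the comment above, everything else is a placeholder.
vllm serve deepseek-ai/DeepSeek-V4-Pro \
  --tensor-parallel-size 8 \
  --speculative-config '{"method":"mtp","num_speculative_tokens":2}'
```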
@@ -2565,6 +2588,24 @@ dsv4-fp4-b300-vllm:
- { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }

dsv4-fp4-b300-trt:
image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b300
precision: fp4
framework: trt
multinode: false
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 8 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 8 }

dsv4-fp4-b300-vllm-mtp:
image: vllm/vllm-openai:v0.20.0-cu130
model: deepseek-ai/DeepSeek-V4-Pro
7 changes: 7 additions & 0 deletions .github/workflows/benchmark-tmpl.yml
@@ -89,6 +89,8 @@ env:
DISAGG: ${{ inputs.disagg }}
RUN_EVAL: ${{ inputs.run-eval }}
EVAL_ONLY: ${{ inputs.eval-only }}
GHCR_TOKEN: ${{ secrets.GHCR_TOKEN }}
GHCR_USER: ${{ secrets.GHCR_USER || github.actor }}
PYTHONDONTWRITEBYTECODE: '1'
PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache

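The GHCR credentials added above are presumably consumed by a registry login step elsewhere in the workflow (not shown in this hunk); a minimal sketch of such a step, assuming the runner pulls the benchmark image with Docker:

```bash
# Hypothetical login step; GHCR_USER and GHCR_TOKEN come from the env block above.
echo "$GHCR_TOKEN" | docker login ghcr.io --username "$GHCR_USER" --password-stdin
```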
@@ -138,6 +140,11 @@ jobs:
rm -f results*.json || true
rm -f sample*.jsonl || true

- name: Cleanup stale benchmark outputs (pre-run)
run: |
rm -f server.log || true
rm -f gpu_metrics.csv || true

- name: Launch job script
env:
RUNNER_NAME: ${{ runner.name }}
12 changes: 11 additions & 1 deletion benchmarks/benchmark_lib.sh
@@ -165,11 +165,12 @@ wait_for_server_ready() {
}

# Run benchmark serving with standardized parameters
# All parameters are required except --use-chat-template, --dsv4, and --trust-remote-code
# All parameters are required except --endpoint, --use-chat-template, --dsv4, and --trust-remote-code
# Parameters:
# --model: Model name
# --port: Server port
# --backend: Backend type - e.g., 'vllm' or 'openai'
# --endpoint: Optional API endpoint override
# --input-len: Random input sequence length
# --output-len: Random output sequence length
# --random-range-ratio: Random range ratio
@@ -194,6 +195,7 @@ run_benchmark_serving() {
local model=""
local port=""
local backend=""
local endpoint=""
local input_len=""
local output_len=""
local random_range_ratio=""
@@ -221,6 +223,10 @@ run_benchmark_serving() {
backend="$2"
shift 2
;;
--endpoint)
endpoint="$2"
shift 2
;;
--input-len)
input_len="$2"
shift 2
@@ -356,6 +362,10 @@
--result-dir "$result_dir"
--result-filename "$result_filename.json"
)

if [[ -n "$endpoint" ]]; then
benchmark_cmd+=(--endpoint "$endpoint")
fi

# Add --use-chat-template if requested
if [[ "$use_chat_template" == true ]]; then
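For reference, a minimal sketch of calling the helper with the new --endpoint override; the argument values below are illustrative placeholders, and the real invocation appears in the new dsv4_fp4_b200_trt.sh further down:

```bash
# Illustrative call only; flag names match the parameter list documented above,
# values are placeholders.
run_benchmark_serving \
  --model deepseek-ai/DeepSeek-V4-Pro \
  --port 8888 \
  --backend openai-chat \
  --endpoint /v1/chat/completions \
  --input-len 1024 \
  --output-len 1024 \
  --random-range-ratio 0.8 \
  --num-prompts 320 \
  --max-concurrency 32 \
  --result-filename dsv4_example \
  --result-dir "$PWD/" \
  --server-pid "$SERVER_PID"
```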
139 changes: 139 additions & 0 deletions benchmarks/single_node/dsv4_fp4_b200_trt.sh
@@ -0,0 +1,139 @@
#!/usr/bin/env bash

# DeepSeek-V4-Pro single-node TRTLLM bring-up recipe for NVIDIA/TensorRT-LLM
# feat/deepseek_v4. The configured image should already contain this branch;
# bootstrap_trtllm_dsv4 verifies that and only builds the pinned branch as a
# fallback.

source "$(dirname "$0")/../benchmark_lib.sh"
source "$(dirname "$0")/trtllm_dsv4_bootstrap.sh"

check_env_vars \
MODEL \
TP \
CONC \
ISL \
OSL \
MAX_MODEL_LEN \
RANDOM_RANGE_RATIO \
RESULT_FILENAME \
DP_ATTENTION \
EP_SIZE

if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"

export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}"
echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE"

bootstrap_trtllm_dsv4 || exit 1

if [[ "$MODEL" != /* ]]; then
hf download "$MODEL"
fi

nvidia-smi

SERVER_LOG="$PWD/server.log"
PORT=${PORT:-8888}
EXTRA_CONFIG_FILE="dsv4-fp4-trt.yml"

MOE_BACKEND="TRTLLM"
MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 ))
CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE"
KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.50}"

ATTENTION_DP_CONFIG=""
if [[ "$DP_ATTENTION" == "true" ]]; then
ATTENTION_DP_CONFIG="
attention_dp_config:
batching_wait_iters: 0
enable_balance: true
timeout_iters: 60"
fi

cat > "$EXTRA_CONFIG_FILE" << EOF
cuda_graph_config:
enable_padding: true
max_batch_size: $CUDA_GRAPH_MAX_BATCH_SIZE
enable_attention_dp: $DP_ATTENTION$ATTENTION_DP_CONFIG
print_iter_log: true
kv_cache_config:
tokens_per_block: 128
dtype: fp8
free_gpu_memory_fraction: $KV_CACHE_FREE_MEM_FRACTION
enable_block_reuse: false
stream_interval: 10
num_postprocess_workers: 4
moe_config:
backend: $MOE_BACKEND
EOF

echo "Generated config file contents:"
cat "$EXTRA_CONFIG_FILE"

MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 ))
MAX_NUM_TOKENS=$(( ISL + OSL + 256 ))
MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 ))

if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN"
fi

start_gpu_monitor --output "$PWD/gpu_metrics.csv"

set -x
SERVE_CMD=(
trtllm-serve "$MODEL" \
--host 0.0.0.0 \
--port "$PORT" \
--trust_remote_code \
--backend pytorch \
--max_batch_size "$MAX_BATCH_SIZE" \
--max_seq_len "$MAX_MODEL_LEN" \
--max_num_tokens "$MAX_NUM_TOKENS" \
--tp_size "$TP" \
--ep_size "$EP_SIZE" \
--custom_tokenizer deepseek_v4 \
--config "$EXTRA_CONFIG_FILE"
)

if [[ "${TRTLLM_DSV4_USE_MPIRUN:-1}" == "0" ]]; then
"${SERVE_CMD[@]}" > "$SERVER_LOG" 2>&1 &
else
mpirun -n 1 --oversubscribe --allow-run-as-root \
"${SERVE_CMD[@]}" \
> "$SERVER_LOG" 2>&1 &
fi

SERVER_PID=$!

wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

run_benchmark_serving \
--model "$MODEL" \
--port "$PORT" \
--backend openai-chat \
--endpoint /v1/chat/completions \
--input-len "$ISL" \
--output-len "$OSL" \
--random-range-ratio "$RANDOM_RANGE_RATIO" \
--num-prompts "$(( CONC * 10 ))" \
--max-concurrency "$CONC" \
--result-filename "$RESULT_FILENAME" \
--result-dir "$PWD/" \
--trust-remote-code \
--server-pid "$SERVER_PID"

if [ "${RUN_EVAL}" = "true" ]; then
run_eval --framework lm-eval --port "$PORT"
append_lm_eval_summary
fi

stop_gpu_monitor
set +x
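For context, each search-space row in nvidia-master.yaml presumably expands into one invocation of this script with the matching environment; a hedged sketch of one sweep point (tp 8, ep 8, dp-attn, conc 32 at 1k/1k). The MAX_MODEL_LEN, RANDOM_RANGE_RATIO, and RESULT_FILENAME values are assumptions, not taken from the config:

```bash
# Hypothetical harness-side expansion of one search-space entry; variable names
# mirror check_env_vars above, values flagged as assumptions in the lead-in.
MODEL=deepseek-ai/DeepSeek-V4-Pro \
TP=8 EP_SIZE=8 DP_ATTENTION=true \
CONC=32 ISL=1024 OSL=1024 \
MAX_MODEL_LEN=16384 RANDOM_RANGE_RATIO=0.8 \
RESULT_FILENAME=dsv4_fp4_b200_trt_tp8_conc32 \
bash benchmarks/single_node/dsv4_fp4_b200_trt.sh
```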
10 changes: 10 additions & 0 deletions benchmarks/single_node/dsv4_fp4_b300_trt.sh
@@ -0,0 +1,10 @@
#!/usr/bin/env bash

# B300 uses the same low-concurrency TRTLLM bring-up recipe as B200. The B300
# runner may rewrite MODEL to the pre-staged /data/models/dsv4-pro path before
# this script is invoked. The job itself is already launched under srun/pyxis;
# avoid nested mpirun because this cluster's OpenMPI build lacks Slurm PMIx support.

export TRTLLM_DSV4_USE_MPIRUN="${TRTLLM_DSV4_USE_MPIRUN:-0}"

bash "$(dirname "$0")/dsv4_fp4_b200_trt.sh"
113 changes: 113 additions & 0 deletions benchmarks/single_node/trtllm_dsv4_bootstrap.sh
@@ -0,0 +1,113 @@
#!/usr/bin/env bash

# Verify TensorRT-LLM DeepSeek-V4 support and, if needed, build/install the
# pinned feature branch at runtime as a fallback.

trtllm_dsv4_supported() {
python3 - <<'PY'
import importlib
import sys

try:
import tensorrt_llm # noqa: F401
import torch

importlib.import_module("tensorrt_llm._torch.models.modeling_deepseekv4")
importlib.import_module(
"tensorrt_llm._torch.attention_backend.sparse.deepseek_v4.deepseek_v4"
)
getattr(torch.ops.trtllm, "compressor_prefill_reduction")
getattr(torch.ops.trtllm, "compressor_paged_kv_compress")
getattr(torch.ops.trtllm, "compressor_postprocess_scatter")
except Exception as exc:
print(f"TensorRT-LLM DeepSeek-V4 support check failed: {exc}", file=sys.stderr)
raise SystemExit(1)
PY
}

bootstrap_trtllm_dsv4() {
if [[ "${TRTLLM_DSV4_BOOTSTRAP:-auto}" == "0" ]]; then
echo "TRTLLM_DSV4_BOOTSTRAP=0; skipping TensorRT-LLM DeepSeek-V4 bootstrap"
return 0
fi

if [[ "${TRTLLM_DSV4_BOOTSTRAP:-auto}" != "force" ]] && trtllm_dsv4_supported; then
echo "TensorRT-LLM DeepSeek-V4 support already available"
return 0
fi

local repo="${TRTLLM_DSV4_REPO:-https://github.com/NVIDIA/TensorRT-LLM.git}"
local branch="${TRTLLM_DSV4_BRANCH:-feat/deepseek_v4}"
local ref="${TRTLLM_DSV4_REF:-f1c5fe143febb70cd74f0fb4ccca1516206268d7}"
local src="${TRTLLM_DSV4_SRC:-/tmp/trtllm-dsv4-src}"
local build_dir="${TRTLLM_DSV4_BUILD_DIR:-/tmp/trtllm-dsv4-build}"
local dist_dir="${TRTLLM_DSV4_DIST_DIR:-/tmp/trtllm-dsv4-wheel}"
local archs="${TRTLLM_DSV4_CUDA_ARCHITECTURES:-100-real;103-real}"
local lock_file="${TRTLLM_DSV4_LOCK_FILE:-/tmp/trtllm-dsv4-bootstrap.lock}"

echo "Bootstrapping TensorRT-LLM DeepSeek-V4 support"
echo " repo: $repo"
echo " branch: $branch"
echo " ref: $ref"
echo " archs: $archs"

if ! command -v git >/dev/null 2>&1; then
if command -v apt-get >/dev/null 2>&1; then
apt-get update
apt-get install -y git
else
echo "git is required to bootstrap TensorRT-LLM DeepSeek-V4 support" >&2
return 1
fi
fi

(
set -euo pipefail
flock 9

if [[ "${TRTLLM_DSV4_BOOTSTRAP:-auto}" != "force" ]] && trtllm_dsv4_supported; then
echo "TensorRT-LLM DeepSeek-V4 support became available while waiting for bootstrap lock"
exit 0
fi

if [[ ! -d "$src/.git" ]]; then
rm -rf "$src"
git clone \
--filter=blob:none \
--single-branch \
--branch "$branch" \
"$repo" "$src"
fi

cd "$src"
git fetch origin "$branch" --depth 1
git fetch origin "$ref" --depth 1 || true
git checkout "$ref"
git submodule update --init --recursive --depth 1

if command -v git-lfs >/dev/null 2>&1; then
git lfs install --local
git lfs pull
else
echo "git-lfs not found; continuing without LFS pull"
fi

rm -rf "$dist_dir"
mkdir -p "$dist_dir"

# setup.py sanity-checks for the generated bindings/ stubs directory.
    # Do not use --skip-stubs here, or wheel packaging fails after the C++ build.
python3 scripts/build_wheel.py \
--cuda_architectures "$archs" \
--build_dir "$build_dir" \
--dist_dir "$dist_dir" \
--clean \
${TRTLLM_DSV4_BUILD_ARGS:-}

local wheel
wheel="$(ls -t "$dist_dir"/tensorrt_llm*.whl | head -1)"
python3 -m pip install --force-reinstall --no-deps "$wheel"
) 9>"$lock_file"

trtllm_dsv4_supported
}
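A short usage note on the knobs defined above (assumed invocation context: the benchmark harness already exports MODEL, TP, CONC, and the other required variables):

```bash
# Trust the image and skip the support check/build entirely:
export TRTLLM_DSV4_BOOTSTRAP=0

# Or force a source rebuild, optionally pinning a different commit on the feature branch:
# export TRTLLM_DSV4_BOOTSTRAP=force
# export TRTLLM_DSV4_REF=<commit-sha>   # placeholder

bash benchmarks/single_node/dsv4_fp4_b200_trt.sh
```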
8 changes: 8 additions & 0 deletions perf-changelog.yaml
@@ -2038,3 +2038,11 @@
- updated sglang container image
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1027

- config-keys:
- dsv4-fp4-b200-trt
- dsv4-fp4-b300-trt
description:
- "Add DeepSeek-V4-Pro FP4 TRTLLM single-node bring-up configs on B200 and B300"
- "Use ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1, built from NVIDIA/TensorRT-LLM@feat/deepseek_v4 (f1c5fe1), with a guarded source-build fallback"
- "Serve DeepSeek-V4-Pro through TRTLLM's deepseek_v4 tokenizer and /v1/chat/completions with TP8, EP8, STP-only conc 1-8 for 1k1k and 8k1k"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1233