41 changes: 41 additions & 0 deletions .github/configs/nvidia-master.yaml
@@ -1727,6 +1727,29 @@ dsv4-fp4-b200-vllm:
- { tp: 8, conc-start: 1, conc-end: 32 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 1024 }

# DeepSeek-V4-Pro TRTLLM bring-up. This uses a TensorRT-LLM image built from
# NVIDIA/TensorRT-LLM@feat/deepseek_v4; the benchmark script keeps a guarded
# source-build fallback if the image is missing the required DSv4 support.
dsv4-fp4-b200-trt:
image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b200-dsv4
precision: fp4
framework: trt
multinode: false
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 32 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 32 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 32, conc-end: 256 }

# MTP variant of dsv4-fp4-b200-vllm. Mirrors the base search space and adds
# --speculative-config '{"method":"mtp","num_speculative_tokens":2}'.
dsv4-fp4-b200-vllm-mtp:
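As a point of reference for the comment above, the -mtp variant presumably differs from the base config only in the server launch flags; a minimal sketch, assuming a stock vllm serve invocation (every argument other than --speculative-config is an illustrative placeholder, not taken from this repo):

```bash
# Hypothetical vLLM launch for the -mtp variant; only the --speculative-config
# value comes from the comment above, everything else is a placeholder.
vllm serve deepseek-ai/DeepSeek-V4-Pro \
  --tensor-parallel-size 8 \
  --speculative-config '{"method":"mtp","num_speculative_tokens":2}'
```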
@@ -2565,6 +2588,24 @@ dsv4-fp4-b300-vllm:
- { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }

dsv4-fp4-b300-trt:
image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b300
precision: fp4
framework: trt
multinode: false
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 8 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 8, dp-attn: true, conc-start: 1, conc-end: 8 }

dsv4-fp4-b300-vllm-mtp:
image: vllm/vllm-openai:v0.20.0-cu130
model: deepseek-ai/DeepSeek-V4-Pro
7 changes: 7 additions & 0 deletions .github/workflows/benchmark-tmpl.yml
@@ -89,6 +89,8 @@ env:
DISAGG: ${{ inputs.disagg }}
RUN_EVAL: ${{ inputs.run-eval }}
EVAL_ONLY: ${{ inputs.eval-only }}
GHCR_TOKEN: ${{ secrets.GHCR_TOKEN }}
GHCR_USER: ${{ secrets.GHCR_USER || github.actor }}
PYTHONDONTWRITEBYTECODE: '1'
PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache

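The GHCR credentials added above are presumably consumed by a registry login step elsewhere in the workflow (not shown in this hunk); a minimal sketch of such a step, assuming the runner pulls the benchmark image with Docker:

```bash
# Hypothetical login step; GHCR_USER and GHCR_TOKEN come from the env block above.
echo "$GHCR_TOKEN" | docker login ghcr.io --username "$GHCR_USER" --password-stdin
```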
@@ -138,6 +140,11 @@ jobs:
rm -f results*.json || true
rm -f sample*.jsonl || true

- name: Cleanup stale benchmark outputs (pre-run)
run: |
rm -f server.log || true
rm -f gpu_metrics.csv || true

- name: Launch job script
env:
RUNNER_NAME: ${{ runner.name }}
12 changes: 11 additions & 1 deletion benchmarks/benchmark_lib.sh
@@ -165,11 +165,12 @@ wait_for_server_ready() {
}

# Run benchmark serving with standardized parameters
# All parameters are required except --use-chat-template, --dsv4, and --trust-remote-code
# All parameters are required except --endpoint, --use-chat-template, --dsv4, and --trust-remote-code
# Parameters:
# --model: Model name
# --port: Server port
# --backend: Backend type - e.g., 'vllm' or 'openai'
# --endpoint: Optional API endpoint override
# --input-len: Random input sequence length
# --output-len: Random output sequence length
# --random-range-ratio: Random range ratio
@@ -194,6 +195,7 @@ run_benchmark_serving() {
local model=""
local port=""
local backend=""
local endpoint=""
local input_len=""
local output_len=""
local random_range_ratio=""
@@ -221,6 +223,10 @@ run_benchmark_serving() {
backend="$2"
shift 2
;;
--endpoint)
endpoint="$2"
shift 2
;;
--input-len)
input_len="$2"
shift 2
@@ -356,6 +362,10 @@
--result-dir "$result_dir"
--result-filename "$result_filename.json"
)

if [[ -n "$endpoint" ]]; then
benchmark_cmd+=(--endpoint "$endpoint")
fi

# Add --use-chat-template if requested
if [[ "$use_chat_template" == true ]]; then
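For reference, a minimal sketch of calling the helper with the new --endpoint override; the argument values below are illustrative placeholders, and the real invocation appears in the new dsv4_fp4_b200_trt.sh further down:

```bash
# Illustrative call only; flag names match the parameter list documented above,
# values are placeholders.
run_benchmark_serving \
  --model deepseek-ai/DeepSeek-V4-Pro \
  --port 8888 \
  --backend openai-chat \
  --endpoint /v1/chat/completions \
  --input-len 1024 \
  --output-len 1024 \
  --random-range-ratio 0.8 \
  --num-prompts 320 \
  --max-concurrency 32 \
  --result-filename dsv4_example \
  --result-dir "$PWD/" \
  --server-pid "$SERVER_PID"
```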
139 changes: 139 additions & 0 deletions benchmarks/single_node/dsv4_fp4_b200_trt.sh
@@ -0,0 +1,139 @@
#!/usr/bin/env bash

# DeepSeek-V4-Pro single-node TRTLLM bring-up recipe for NVIDIA/TensorRT-LLM
# feat/deepseek_v4. The configured image should already contain this branch;
# bootstrap_trtllm_dsv4 verifies that and only builds the pinned branch as a
# fallback.

source "$(dirname "$0")/../benchmark_lib.sh"
source "$(dirname "$0")/trtllm_dsv4_bootstrap.sh"

check_env_vars \
MODEL \
TP \
CONC \
ISL \
OSL \
MAX_MODEL_LEN \
RANDOM_RANGE_RATIO \
RESULT_FILENAME \
DP_ATTENTION \
EP_SIZE

if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"

export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}"
echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE"

bootstrap_trtllm_dsv4 || exit 1

if [[ "$MODEL" != /* ]]; then
hf download "$MODEL"
fi

nvidia-smi

SERVER_LOG="$PWD/server.log"
PORT=${PORT:-8888}
EXTRA_CONFIG_FILE="dsv4-fp4-trt.yml"

MOE_BACKEND="TRTLLM"
MAX_BATCH_SIZE=$(( CONC > 16 ? CONC : 16 ))
CUDA_GRAPH_MAX_BATCH_SIZE="$MAX_BATCH_SIZE"
KV_CACHE_FREE_MEM_FRACTION="${KV_CACHE_FREE_MEM_FRACTION:-0.50}"

ATTENTION_DP_CONFIG=""
if [[ "$DP_ATTENTION" == "true" ]]; then
ATTENTION_DP_CONFIG="
attention_dp_config:
batching_wait_iters: 0
enable_balance: true
timeout_iters: 60"
fi

cat > "$EXTRA_CONFIG_FILE" << EOF
cuda_graph_config:
enable_padding: true
max_batch_size: $CUDA_GRAPH_MAX_BATCH_SIZE
enable_attention_dp: $DP_ATTENTION$ATTENTION_DP_CONFIG
print_iter_log: true
kv_cache_config:
tokens_per_block: 128
dtype: fp8
free_gpu_memory_fraction: $KV_CACHE_FREE_MEM_FRACTION
enable_block_reuse: false
stream_interval: 10
num_postprocess_workers: 4
moe_config:
backend: $MOE_BACKEND
EOF

echo "Generated config file contents:"
cat "$EXTRA_CONFIG_FILE"

MAX_MODEL_LEN=$(( MAX_MODEL_LEN > 8192 ? MAX_MODEL_LEN : 8192 ))
MAX_NUM_TOKENS=$(( ISL + OSL + 256 ))
MAX_NUM_TOKENS=$(( MAX_NUM_TOKENS > 8192 ? MAX_NUM_TOKENS : 8192 ))

if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
MAX_NUM_TOKENS="$EVAL_MAX_MODEL_LEN"
fi

start_gpu_monitor --output "$PWD/gpu_metrics.csv"

set -x
SERVE_CMD=(
trtllm-serve "$MODEL" \
--host 0.0.0.0 \
--port "$PORT" \
--trust_remote_code \
--backend pytorch \
--max_batch_size "$MAX_BATCH_SIZE" \
--max_seq_len "$MAX_MODEL_LEN" \
--max_num_tokens "$MAX_NUM_TOKENS" \
--tp_size "$TP" \
--ep_size "$EP_SIZE" \
--custom_tokenizer deepseek_v4 \
--config "$EXTRA_CONFIG_FILE"
)

if [[ "${TRTLLM_DSV4_USE_MPIRUN:-1}" == "0" ]]; then
"${SERVE_CMD[@]}" > "$SERVER_LOG" 2>&1 &
else
mpirun -n 1 --oversubscribe --allow-run-as-root \
"${SERVE_CMD[@]}" \
> "$SERVER_LOG" 2>&1 &
fi

SERVER_PID=$!

wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

run_benchmark_serving \
--model "$MODEL" \
--port "$PORT" \
--backend openai-chat \
--endpoint /v1/chat/completions \
--input-len "$ISL" \
--output-len "$OSL" \
--random-range-ratio "$RANDOM_RANGE_RATIO" \
--num-prompts "$(( CONC * 10 ))" \
--max-concurrency "$CONC" \
--result-filename "$RESULT_FILENAME" \
--result-dir "$PWD/" \
--trust-remote-code \
--server-pid "$SERVER_PID"

if [ "${RUN_EVAL}" = "true" ]; then
run_eval --framework lm-eval --port "$PORT"
append_lm_eval_summary
fi

stop_gpu_monitor
set +x
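For context, each search-space row in nvidia-master.yaml presumably expands into one invocation of this script with the matching environment; a hedged sketch of one sweep point (tp 8, ep 8, dp-attn, conc 32 at 1k/1k). The MAX_MODEL_LEN, RANDOM_RANGE_RATIO, and RESULT_FILENAME values are assumptions, not taken from the config:

```bash
# Hypothetical harness-side expansion of one search-space entry; variable names
# mirror check_env_vars above, values flagged as assumptions in the lead-in.
MODEL=deepseek-ai/DeepSeek-V4-Pro \
TP=8 EP_SIZE=8 DP_ATTENTION=true \
CONC=32 ISL=1024 OSL=1024 \
MAX_MODEL_LEN=16384 RANDOM_RANGE_RATIO=0.8 \
RESULT_FILENAME=dsv4_fp4_b200_trt_tp8_conc32 \
bash benchmarks/single_node/dsv4_fp4_b200_trt.sh
```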
10 changes: 10 additions & 0 deletions benchmarks/single_node/dsv4_fp4_b300_trt.sh
@@ -0,0 +1,10 @@
#!/usr/bin/env bash

# B300 uses the same low-concurrency TRTLLM bring-up recipe as B200. The B300
# runner may rewrite MODEL to the pre-staged /data/models/dsv4-pro path before
# this script is invoked. The job itself is already launched under srun/pyxis;
# avoid nested mpirun because this cluster's OpenMPI build lacks Slurm PMIx support.

export TRTLLM_DSV4_USE_MPIRUN="${TRTLLM_DSV4_USE_MPIRUN:-0}"

bash "$(dirname "$0")/dsv4_fp4_b200_trt.sh"
113 changes: 113 additions & 0 deletions benchmarks/single_node/trtllm_dsv4_bootstrap.sh
@@ -0,0 +1,113 @@
#!/usr/bin/env bash

# Verify TensorRT-LLM DeepSeek-V4 support and, if needed, build/install the
# pinned feature branch at runtime as a fallback.

trtllm_dsv4_supported() {
python3 - <<'PY'
import importlib
import sys

try:
import tensorrt_llm # noqa: F401
import torch

importlib.import_module("tensorrt_llm._torch.models.modeling_deepseekv4")
importlib.import_module(
"tensorrt_llm._torch.attention_backend.sparse.deepseek_v4.deepseek_v4"
)
getattr(torch.ops.trtllm, "compressor_prefill_reduction")
getattr(torch.ops.trtllm, "compressor_paged_kv_compress")
getattr(torch.ops.trtllm, "compressor_postprocess_scatter")
except Exception as exc:
print(f"TensorRT-LLM DeepSeek-V4 support check failed: {exc}", file=sys.stderr)
raise SystemExit(1)
PY
}

bootstrap_trtllm_dsv4() {
if [[ "${TRTLLM_DSV4_BOOTSTRAP:-auto}" == "0" ]]; then
echo "TRTLLM_DSV4_BOOTSTRAP=0; skipping TensorRT-LLM DeepSeek-V4 bootstrap"
return 0
fi

if [[ "${TRTLLM_DSV4_BOOTSTRAP:-auto}" != "force" ]] && trtllm_dsv4_supported; then
echo "TensorRT-LLM DeepSeek-V4 support already available"
return 0
fi

local repo="${TRTLLM_DSV4_REPO:-https://github.com/NVIDIA/TensorRT-LLM.git}"
local branch="${TRTLLM_DSV4_BRANCH:-feat/deepseek_v4}"
local ref="${TRTLLM_DSV4_REF:-f1c5fe143febb70cd74f0fb4ccca1516206268d7}"
local src="${TRTLLM_DSV4_SRC:-/tmp/trtllm-dsv4-src}"
local build_dir="${TRTLLM_DSV4_BUILD_DIR:-/tmp/trtllm-dsv4-build}"
local dist_dir="${TRTLLM_DSV4_DIST_DIR:-/tmp/trtllm-dsv4-wheel}"
local archs="${TRTLLM_DSV4_CUDA_ARCHITECTURES:-100-real;103-real}"
local lock_file="${TRTLLM_DSV4_LOCK_FILE:-/tmp/trtllm-dsv4-bootstrap.lock}"

echo "Bootstrapping TensorRT-LLM DeepSeek-V4 support"
echo " repo: $repo"
echo " branch: $branch"
echo " ref: $ref"
echo " archs: $archs"

if ! command -v git >/dev/null 2>&1; then
if command -v apt-get >/dev/null 2>&1; then
apt-get update
apt-get install -y git
else
echo "git is required to bootstrap TensorRT-LLM DeepSeek-V4 support" >&2
return 1
fi
fi

(
set -euo pipefail
flock 9

if [[ "${TRTLLM_DSV4_BOOTSTRAP:-auto}" != "force" ]] && trtllm_dsv4_supported; then
echo "TensorRT-LLM DeepSeek-V4 support became available while waiting for bootstrap lock"
exit 0
fi

if [[ ! -d "$src/.git" ]]; then
rm -rf "$src"
git clone \
--filter=blob:none \
--single-branch \
--branch "$branch" \
"$repo" "$src"
fi

cd "$src"
git fetch origin "$branch" --depth 1
git fetch origin "$ref" --depth 1 || true
git checkout "$ref"
git submodule update --init --recursive --depth 1

if command -v git-lfs >/dev/null 2>&1; then
git lfs install --local
git lfs pull
else
echo "git-lfs not found; continuing without LFS pull"
fi

rm -rf "$dist_dir"
mkdir -p "$dist_dir"

# setup.py sanity-checks for the generated bindings/ stubs directory.
    # Do not use --skip-stubs here, or wheel packaging fails after the C++ build.
python3 scripts/build_wheel.py \
--cuda_architectures "$archs" \
--build_dir "$build_dir" \
--dist_dir "$dist_dir" \
--clean \
${TRTLLM_DSV4_BUILD_ARGS:-}

local wheel
wheel="$(ls -t "$dist_dir"/tensorrt_llm*.whl | head -1)"
python3 -m pip install --force-reinstall --no-deps "$wheel"
) 9>"$lock_file"

trtllm_dsv4_supported
}
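A short usage note on the knobs defined above (assumed invocation context: the benchmark harness already exports MODEL, TP, CONC, and the other required variables):

```bash
# Trust the image and skip the support check/build entirely:
export TRTLLM_DSV4_BOOTSTRAP=0

# Or force a source rebuild, optionally pinning a different commit on the feature branch:
# export TRTLLM_DSV4_BOOTSTRAP=force
# export TRTLLM_DSV4_REF=<commit-sha>   # placeholder

bash benchmarks/single_node/dsv4_fp4_b200_trt.sh
```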
8 changes: 8 additions & 0 deletions perf-changelog.yaml
@@ -2038,3 +2038,11 @@
- updated sglang container image
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1027

- config-keys:
- dsv4-fp4-b200-trt
- dsv4-fp4-b300-trt
description:
- "Add DeepSeek-V4-Pro FP4 TRTLLM single-node bring-up configs on B200 and B300"
- "Use ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-f1c5fe1, built from NVIDIA/TensorRT-LLM@feat/deepseek_v4 (f1c5fe1), with a guarded source-build fallback"
- "Serve DeepSeek-V4-Pro through TRTLLM's deepseek_v4 tokenizer and /v1/chat/completions with TP8, EP8, STP-only conc 1-8 for 1k1k and 8k1k"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1233