-
Notifications
You must be signed in to change notification settings - Fork 95
How can we reproduce the fact that using kvcached is 2-28 times faster than not using kvcached in TTFT? #233
Description
Where are the service startup and benchmark scripts located?
GPU: H800 (80 GB), model: /test1/weights/Qwen3-8B
I'm using the bench_latency_benefit folder to start and test the service.
With kvcached:
# Launch the kvcached-enabled instances via the controller, using the YAML config below.
# NOTE(review): the config filename says "llama-8b" but the model used is Qwen3-8B — confirm this is the right file.
python controller/launch.py --config benchmarks/bench_latency_benefit/bench-config-vllm-llama-8b-with-kvcached.yaml
# Global configuration (nesting restored — the pasted version was flattened,
# which is not valid YAML for this config format).
kvcached: # kvcached environment variables for all instances
  kvcached_gpu_utilization: 0.95
  kvcached_page_prealloc_enabled: true
  kvcached_min_reserved_pages: 5
  kvcached_max_reserved_pages: 10
  kvcached_sanity_check: false
  kvcached_log_level: INFO
router: # router configuration
  enable_router: true
  router_port: 8000
  router_host: localhost
sleep_manager: # sleep configuration
  idle_threshold_seconds: 300 # 5 minutes - time before putting idle models to sleep
  check_interval_seconds: 60 # Check every minute for idle models
  auto_sleep_enabled: false # Enable automatic sleep mode for idle models
  wakeup_on_request: true # Automatically wake models when requests come in
  min_sleep_duration: 80 # Minimum time to keep model asleep (80 seconds; previous comment said "2 minutes", which did not match the value)
  #launch_delay_seconds: 30 # Delay between launching each instance
# Instance list (reconstructed to valid YAML):
# - the first kvcached_env item had been fused onto the key's comment line,
# - engine_env:/engine_args: had a block-sequence item on the key line,
#   which is invalid YAML; items are now proper nested list entries.
instances: # instances configuration
  - name: instance1
    model: /test1/weights/Qwen3-8B
    engine: vllm
    using_venv: true
    venv_path: /workspace/kvcached/engine_integration/vllm-source-venv
    kvcached_env: # kvcached environment variables for this instance
      - "ENABLE_KVCACHED=true"
      - "KVCACHED_AUTOPATCH=1"
    engine_env: # engine environment variables
      - "VLLM_USE_V1=1"
      - "VLLM_ATTENTION_BACKEND=FLASH_ATTN"
      - "VLLM_SERVER_DEV_MODE=1" # To enable model sleep
    engine_args: # engine CLI arguments
      - "--disable-log-requests"
      - "--no-enable-prefix-caching"
      - "--host=localhost"
      - "--port=12346"
      - "--gpu-memory-utilization 0.31"
      - "--max-model-len 2048"
  - name: instance2
    #model: /test1/weights/DeepSeek-R1-Distill-Qwen-1.5B
    model: /test1/weights/Qwen3-8B
    engine: vllm
    using_venv: true
    venv_path: /workspace/kvcached/engine_integration/vllm-source-venv
    kvcached_env: # kvcached environment variables for this instance
      - "ENABLE_KVCACHED=true"
      - "KVCACHED_AUTOPATCH=1"
    engine_env: # engine environment variables
      - "VLLM_USE_V1=1"
      - "VLLM_ATTENTION_BACKEND=FLASH_ATTN"
      - "VLLM_SERVER_DEV_MODE=1" # To enable model sleep
    engine_args: # engine CLI arguments
      - "--disable-log-requests"
      - "--no-enable-prefix-caching"
      - "--host=localhost"
      - "--port=30000"
      - "--gpu-memory-utilization 0.31"
      - "--max-model-len 2048"
  - name: instance3
    #model: /test1/weights/DeepSeek-R1-Distill-Qwen-1.5B
    model: /test1/weights/Qwen3-8B
    engine: vllm
    using_venv: true
    venv_path: /workspace/kvcached/engine_integration/vllm-source-venv
    kvcached_env: # kvcached environment variables for this instance
      - "ENABLE_KVCACHED=true"
      - "KVCACHED_AUTOPATCH=1"
    engine_env: # engine environment variables
      - "VLLM_USE_V1=1"
      - "VLLM_ATTENTION_BACKEND=FLASH_ATTN"
      - "VLLM_SERVER_DEV_MODE=1" # To enable model sleep
    engine_args: # engine CLI arguments
      - "--disable-log-requests"
      - "--no-enable-prefix-caching"
      - "--host=localhost"
      - "--port=40000"
      - "--gpu-memory-utilization 0.31"
      - "--max-model-len 2048"
Without kvcached (three separate vLLM servers, each started in its own terminal):
# Baseline server 1 (no kvcached) — run in its own terminal.
source /workspace/kvcached/engine_integration/vllm-source-venv/bin/activate
export VLLM_USE_V1=1
export VLLM_ATTENTION_BACKEND=FLASH_ATTN
export ENABLE_KVCACHED=false
export KVCACHED_AUTOPATCH=0
# Must be exported (or used as a prefix on the vllm command): a bare
# `CUDA_VISIBLE_DEVICES=7` on its own line only sets a shell variable
# that the vllm process never sees.
export CUDA_VISIBLE_DEVICES=7
# Backslash continuations are required — without them each flag line
# runs as a separate (failing) command instead of a vllm argument.
vllm serve /test1/weights/Qwen3-8B \
  --disable-log-requests \
  --no-enable-prefix-caching \
  --host=localhost \
  --port=22347 \
  --gpu-memory-utilization 0.3 \
  --tensor-parallel-size 1 \
  --max-model-len 2048  # was 2028 — typo; the text below states 2048 was intended
# NOTE(review): kvcached instances use --gpu-memory-utilization 0.31;
# confirm whether the baselines should match (0.3 vs 0.31).
# Baseline server 2 (no kvcached) — run in its own terminal.
source /workspace/kvcached/engine_integration/vllm-source-venv/bin/activate
export VLLM_USE_V1=1
export VLLM_ATTENTION_BACKEND=FLASH_ATTN
export ENABLE_KVCACHED=false
export KVCACHED_AUTOPATCH=0
# Must be exported — a bare assignment on its own line is not inherited by vllm.
export CUDA_VISIBLE_DEVICES=7
# Backslash continuations are required for the multi-line command.
vllm serve /test1/weights/Qwen3-8B \
  --disable-log-requests \
  --no-enable-prefix-caching \
  --host=localhost \
  --port=22348 \
  --gpu-memory-utilization 0.3 \
  --tensor-parallel-size 1 \
  --max-model-len 2048  # was 2028 — typo; the text below states 2048 was intended
# Baseline server 3 (no kvcached) — run in its own terminal.
source /workspace/kvcached/engine_integration/vllm-source-venv/bin/activate
export VLLM_USE_V1=1
export VLLM_ATTENTION_BACKEND=FLASH_ATTN
export ENABLE_KVCACHED=false
export KVCACHED_AUTOPATCH=0
# Must be exported — a bare assignment on its own line is not inherited by vllm.
export CUDA_VISIBLE_DEVICES=7
# Backslash continuations are required for the multi-line command.
vllm serve /test1/weights/Qwen3-8B \
  --disable-log-requests \
  --no-enable-prefix-caching \
  --host=localhost \
  --port=22349 \
  --gpu-memory-utilization 0.3 \
  --tensor-parallel-size 1 \
  --max-model-len 2048
I set --max-model-len 2048 to ensure the smooth startup of all three models.
Benchmark: bash benchmarks/bench_latency_benefit/run_benchmark.sh
I'm using benchmarks/bench_latency_benefit/run_benchmark.sh for benchmark testing, but all the results show that TTFT without kvcached is faster than with kvcached.
My goal is to reproduce the claimed 2-28x TTFT speedup.