-
Notifications
You must be signed in to change notification settings - Fork 95
How can we reproduce the fact that using kvcached is 2-28 times faster than not using kvcached in TTFT? #233
Description
Where are the service startup and benchmark scripts located?
GPU: H800 (80 GB), model: /test1/weights/Qwen3-8B
I'm using the bench_latency_benefit folder to start and test the service.
With kvcached:
# Launch the kvcached-enabled instances via the controller, using the YAML config below.
# NOTE(review): the config filename says "llama-8b" but the model used is Qwen3-8B — confirm this is the right file.
python controller/launch.py --config benchmarks/bench_latency_benefit/bench-config-vllm-llama-8b-with-kvcached.yaml
# Global configuration (nesting restored — the pasted version was flattened,
# which is not valid YAML for this config format).
kvcached: # kvcached environment variables for all instances
  kvcached_gpu_utilization: 0.95
  kvcached_page_prealloc_enabled: true
  kvcached_min_reserved_pages: 5
  kvcached_max_reserved_pages: 10
  kvcached_sanity_check: false
  kvcached_log_level: INFO
router: # router configuration
  enable_router: true
  router_port: 8000
  router_host: localhost
sleep_manager: # sleep configuration
  idle_threshold_seconds: 300 # 5 minutes - time before putting idle models to sleep
  check_interval_seconds: 60 # Check every minute for idle models
  auto_sleep_enabled: false # Enable automatic sleep mode for idle models
  wakeup_on_request: true # Automatically wake models when requests come in
  min_sleep_duration: 80 # Minimum time to keep model asleep (80 seconds; previous comment said "2 minutes", which did not match the value)
  #launch_delay_seconds: 30 # Delay between launching each instance
# Instance list (reconstructed to valid YAML):
# - the first kvcached_env item had been fused onto the key's comment line,
# - engine_env:/engine_args: had a block-sequence item on the key line,
#   which is invalid YAML; items are now proper nested list entries.
instances: # instances configuration
  - name: instance1
    model: /test1/weights/Qwen3-8B
    engine: vllm
    using_venv: true
    venv_path: /workspace/kvcached/engine_integration/vllm-source-venv
    kvcached_env: # kvcached environment variables for this instance
      - "ENABLE_KVCACHED=true"
      - "KVCACHED_AUTOPATCH=1"
    engine_env: # engine environment variables
      - "VLLM_USE_V1=1"
      - "VLLM_ATTENTION_BACKEND=FLASH_ATTN"
      - "VLLM_SERVER_DEV_MODE=1" # To enable model sleep
    engine_args: # engine CLI arguments
      - "--disable-log-requests"
      - "--no-enable-prefix-caching"
      - "--host=localhost"
      - "--port=12346"
      - "--gpu-memory-utilization 0.31"
      - "--max-model-len 2048"
  - name: instance2
    #model: /test1/weights/DeepSeek-R1-Distill-Qwen-1.5B
    model: /test1/weights/Qwen3-8B
    engine: vllm
    using_venv: true
    venv_path: /workspace/kvcached/engine_integration/vllm-source-venv
    kvcached_env: # kvcached environment variables for this instance
      - "ENABLE_KVCACHED=true"
      - "KVCACHED_AUTOPATCH=1"
    engine_env: # engine environment variables
      - "VLLM_USE_V1=1"
      - "VLLM_ATTENTION_BACKEND=FLASH_ATTN"
      - "VLLM_SERVER_DEV_MODE=1" # To enable model sleep
    engine_args: # engine CLI arguments
      - "--disable-log-requests"
      - "--no-enable-prefix-caching"
      - "--host=localhost"
      - "--port=30000"
      - "--gpu-memory-utilization 0.31"
      - "--max-model-len 2048"
  - name: instance3
    #model: /test1/weights/DeepSeek-R1-Distill-Qwen-1.5B
    model: /test1/weights/Qwen3-8B
    engine: vllm
    using_venv: true
    venv_path: /workspace/kvcached/engine_integration/vllm-source-venv
    kvcached_env: # kvcached environment variables for this instance
      - "ENABLE_KVCACHED=true"
      - "KVCACHED_AUTOPATCH=1"
    engine_env: # engine environment variables
      - "VLLM_USE_V1=1"
      - "VLLM_ATTENTION_BACKEND=FLASH_ATTN"
      - "VLLM_SERVER_DEV_MODE=1" # To enable model sleep
    engine_args: # engine CLI arguments
      - "--disable-log-requests"
      - "--no-enable-prefix-caching"
      - "--host=localhost"
      - "--port=40000"
      - "--gpu-memory-utilization 0.31"
      - "--max-model-len 2048"
Without kvcached (three separate vLLM servers, each started in its own terminal):
# Baseline server 1 (no kvcached) — run in its own terminal.
source /workspace/kvcached/engine_integration/vllm-source-venv/bin/activate
export VLLM_USE_V1=1
export VLLM_ATTENTION_BACKEND=FLASH_ATTN
export ENABLE_KVCACHED=false
export KVCACHED_AUTOPATCH=0
# Must be exported (or used as a prefix on the vllm command): a bare
# `CUDA_VISIBLE_DEVICES=7` on its own line only sets a shell variable
# that the vllm process never sees.
export CUDA_VISIBLE_DEVICES=7
# Backslash continuations are required — without them each flag line
# runs as a separate (failing) command instead of a vllm argument.
vllm serve /test1/weights/Qwen3-8B \
  --disable-log-requests \
  --no-enable-prefix-caching \
  --host=localhost \
  --port=22347 \
  --gpu-memory-utilization 0.3 \
  --tensor-parallel-size 1 \
  --max-model-len 2048  # was 2028 — typo; the text below states 2048 was intended
# NOTE(review): kvcached instances use --gpu-memory-utilization 0.31;
# confirm whether the baselines should match (0.3 vs 0.31).
# Baseline server 2 (no kvcached) — run in its own terminal.
source /workspace/kvcached/engine_integration/vllm-source-venv/bin/activate
export VLLM_USE_V1=1
export VLLM_ATTENTION_BACKEND=FLASH_ATTN
export ENABLE_KVCACHED=false
export KVCACHED_AUTOPATCH=0
# Must be exported — a bare assignment on its own line is not inherited by vllm.
export CUDA_VISIBLE_DEVICES=7
# Backslash continuations are required for the multi-line command.
vllm serve /test1/weights/Qwen3-8B \
  --disable-log-requests \
  --no-enable-prefix-caching \
  --host=localhost \
  --port=22348 \
  --gpu-memory-utilization 0.3 \
  --tensor-parallel-size 1 \
  --max-model-len 2048  # was 2028 — typo; the text below states 2048 was intended
# Baseline server 3 (no kvcached) — run in its own terminal.
source /workspace/kvcached/engine_integration/vllm-source-venv/bin/activate
export VLLM_USE_V1=1
export VLLM_ATTENTION_BACKEND=FLASH_ATTN
export ENABLE_KVCACHED=false
export KVCACHED_AUTOPATCH=0
# Must be exported — a bare assignment on its own line is not inherited by vllm.
export CUDA_VISIBLE_DEVICES=7
# Backslash continuations are required for the multi-line command.
vllm serve /test1/weights/Qwen3-8B \
  --disable-log-requests \
  --no-enable-prefix-caching \
  --host=localhost \
  --port=22349 \
  --gpu-memory-utilization 0.3 \
  --tensor-parallel-size 1 \
  --max-model-len 2048
I set --max-model-len 2048 to ensure the smooth startup of all three models.
Benchmark: bash benchmarks/bench_latency_benefit/run_benchmark.sh
I'm using benchmarks/bench_latency_benefit/run_benchmark.sh for benchmark testing, but all the results show that TTFT without kvcached is faster than with kvcached.
My goal is to reproduce the claimed 2-28x TTFT speedup.