213 changes: 149 additions & 64 deletions python/sglang/multimodal_gen/benchmarks/compare_perf.py
@@ -1,5 +1,6 @@
import argparse
import json
import os
import re
from datetime import datetime
from typing import Any, Dict, List, Tuple
@@ -108,44 +109,40 @@ def _load_benchmark_file(file_path: str) -> Dict[str, Any]:
return json.load(f)


def compare_benchmarks(
baseline_path: str, new_path: str, output_format: str = "markdown"
):
"""
Compares two benchmark JSON files and prints a report.
"""
try:
base_data = _load_benchmark_file(baseline_path)
new_data = _load_benchmark_file(new_path)
except Exception as e:
print(f"Error loading benchmark files: {e}")
return

base_e2e = base_data.get("total_duration_ms", 0)
new_e2e = new_data.get("total_duration_ms", 0)

diff_ms, diff_pct = calculate_diff(base_e2e, new_e2e)

def _get_status_emoji_from_diff_percent(diff_pct):
if diff_pct < -2.0:
status = "✅"
return "✅"
elif diff_pct > 2.0:
status = "❌"
return "❌"
else:
status = ""
return "⚪️"

# --- Stage Breakdown ---
base_durations, base_order, base_counts = consolidate_steps(
base_data.get("steps", [])

def _print_single_comparison_report(
others_data, base_e2e, combined_order, base_durations, others_processed, base_counts
):
new_data = others_data[0]
new_e2e = new_data.get("total_duration_ms", 0)
diff_ms, diff_pct = calculate_diff(base_e2e, new_e2e)
status = _get_status_emoji_from_diff_percent(diff_pct)

print("#### 1. High-level Summary")
print("| Metric | Baseline | New | Diff | Status |")
print("| :--- | :--- | :--- | :--- | :--- |")
print(
f"| **E2E Latency** | {base_e2e:.2f} ms | {new_e2e:.2f} ms | **{diff_ms:+.2f} ms ({diff_pct:+.1f}%)** | {status} |"
)
new_durations, new_order, new_counts = consolidate_steps(new_data.get("steps", []))
print(
f"| **Throughput** | {1000 / base_e2e if base_e2e else 0:.2f} req/s | {1000 / new_e2e if new_e2e else 0:.2f} req/s | - | - |"
)
print("\n")

# Merge orders: Start with New order (execution order), append any missing from Base
combined_order = list(new_order)
for name in base_order:
if name not in combined_order:
combined_order.append(name)
print("#### 2. Stage Breakdown")
print("| Stage Name | Baseline (ms) | New (ms) | Diff (ms) | Diff (%) | Status |")
print("| :--- | :--- | :--- | :--- | :--- | :--- |")

new_durations, _, new_counts = others_processed[0]

stage_rows = []
for stage in combined_order:
b_val = base_durations.get(stage, 0.0)
n_val = new_durations.get(stage, 0.0)
@@ -154,7 +151,6 @@ def compare_benchmarks(

s_diff, s_pct = calculate_diff(b_val, n_val)

# Format count string if aggregated
count_str = ""
if stage == "Denoising Loop":
count_str = (
@@ -163,54 +159,143 @@ def compare_benchmarks(
else f" ({b_count}->{n_count} steps)"
)

# filter noise: show if diff is > 0.5ms OR if it's a major stage (like Denoising Loop)
# always show Denoising Loop or stages with significant duration/diff
stage_rows.append((stage + count_str, b_val, n_val, s_diff, s_pct))
status_emoji = get_perf_status_emoji(b_val, n_val)
print(
f"| {stage}{count_str} | {b_val:.2f} | {n_val:.2f} | {s_diff:+.2f} | {s_pct:+.1f}% | {status_emoji} |"
)


def _print_multi_comparison_report(
base_e2e,
others_data,
other_labels,
combined_order,
base_durations,
others_processed,
):
print("#### 1. High-level Summary")
header = "| Metric | Baseline | " + " | ".join(other_labels) + " |"
sep = "| :--- | :--- | " + " | ".join([":---"] * len(other_labels)) + " |"
print(header)
print(sep)

# E2E Row
row_e2e = f"| **E2E Latency** | {base_e2e:.2f} ms |"
for i, d in enumerate(others_data):
val = d.get("total_duration_ms", 0)
diff_ms, diff_pct = calculate_diff(base_e2e, val)

status = _get_status_emoji_from_diff_percent(diff_pct)

row_e2e += f" {val:.2f} ms ({diff_pct:+.1f}%) {status} |"
print(row_e2e)
print("\n")

print("#### 2. Stage Breakdown")
# Header: Stage | Baseline | Label1 | Label2 ...
header = "| Stage Name | Baseline | " + " | ".join(other_labels) + " |"
sep = "| :--- | :--- | " + " | ".join([":---"] * len(other_labels)) + " |"
print(header)
print(sep)

for stage in combined_order:
b_val = base_durations.get(stage, 0.0)
row_str = f"| {stage} | {b_val:.2f} |"

for i, (n_durations, _, n_counts) in enumerate(others_processed):
n_val = n_durations.get(stage, 0.0)
_, s_pct = calculate_diff(b_val, n_val)
status_emoji = get_perf_status_emoji(b_val, n_val)

row_str += f" {n_val:.2f} ({s_pct:+.1f}%) {status_emoji} |"
print(row_str)


def compare_benchmarks(file_paths: List[str], output_format: str = "markdown"):
"""
Compares benchmark JSON files and prints a report.
First file is baseline, others will be compared against it.
"""
if len(file_paths) < 2:
print("Error: Need at least 2 files to compare.")
return

try:
data_list = [_load_benchmark_file(f) for f in file_paths]
except Exception as e:
print(f"Error loading benchmark files: {e}")
return

base_data = data_list[0]
others_data = data_list[1:]

# Use filenames as labels if multiple comparisons, else just "New"
other_labels = [os.path.basename(p) for p in file_paths[1:]]

base_e2e = base_data.get("total_duration_ms", 0)

base_durations, base_order, base_counts = consolidate_steps(
base_data.get("steps", [])
)

others_processed = []
for d in others_data:
dur, order, counts = consolidate_steps(d.get("steps", []))
others_processed.append((dur, order, counts))

combined_order = []
# Collect all unique stages maintaining order from newest to baseline
for _, order, _ in reversed(others_processed):
for name in order:
if name not in combined_order:
combined_order.append(name)
for name in base_order:
if name not in combined_order:
combined_order.append(name)

if output_format == "markdown":
print("### Performance Comparison Report\n")

# Summary Table
print("#### 1. High-level Summary")
print("| Metric | Baseline | New | Diff | Status |")
print("| :--- | :--- | :--- | :--- | :--- |")
print(
f"| **E2E Latency** | {base_e2e:.2f} ms | {new_e2e:.2f} ms | **{diff_ms:+.2f} ms ({diff_pct:+.1f}%)** | {status} |"
)
print(
f"| **Throughput** | {1000 / base_e2e if base_e2e else 0:.2f} req/s | {1000 / new_e2e if new_e2e else 0:.2f} req/s | - | - |"
)
print("\n")

# Detailed Breakdown
print("#### 2. Stage Breakdown")
print(
"| Stage Name | Baseline (ms) | New (ms) | Diff (ms) | Diff (%) | Status |"
)
print("| :--- | :--- | :--- | :--- | :--- | :--- |")
for name, b, n, d, p in stage_rows:
name_str = name
status_emoji = get_perf_status_emoji(b, n)
print(
f"| {name_str} | {b:.2f} | {n:.2f} | {d:+.2f} | {p:+.1f}% | {status_emoji} |"
if len(others_data) == 1:
_print_single_comparison_report(
others_data,
base_e2e,
combined_order,
base_durations,
others_processed,
base_counts,
)
else:
_print_multi_comparison_report(
base_e2e,
others_data,
other_labels,
combined_order,
base_durations,
others_processed,
)
print("\n")

print("\n")
# Metadata
print("<details>")
print("<summary>Metadata</summary>\n")
print(f"- Baseline Commit: `{base_data.get('commit_hash', 'N/A')}`")
print(f"- New Commit: `{new_data.get('commit_hash', 'N/A')}`")
for i, d in enumerate(others_data):
label = "New" if len(others_data) == 1 else other_labels[i]
print(f"- {label} Commit: `{d.get('commit_hash', 'N/A')}`")
print(f"- Timestamp: {datetime.now().isoformat()}")
print("</details>")


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Compare two sglang-diffusion performance JSON files."
description="Compare sglang-diffusion performance JSON files."
)
parser.add_argument(
"files",
nargs="+",
help="List of JSON files. First is baseline, others are compared against it.",
)
parser.add_argument("baseline", help="Path to the baseline JSON file")
parser.add_argument("new", help="Path to the new JSON file")
args = parser.parse_args()

compare_benchmarks(args.baseline, args.new)
compare_benchmarks(args.files)
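
The hunks above call `calculate_diff`, `consolidate_steps`, and `get_perf_status_emoji`, which are defined earlier in `compare_perf.py` and are not shown in this diff. For reviewers reading only this page, the following is a minimal sketch of what the call sites imply; the exact signatures, the per-step field names (`name`, `duration_ms`), the ±2% thresholds for per-stage status, and the example report layout are assumptions, not code from the PR.

```python
# Hedged sketch, NOT part of the PR: signatures inferred from the call sites
# in the diff above. Field names ("name", "duration_ms") and thresholds are
# assumptions and may differ from the real compare_perf.py helpers.
from typing import Any, Dict, List, Tuple


def calculate_diff(base: float, new: float) -> Tuple[float, float]:
    """Return (absolute difference in ms, percentage difference relative to base)."""
    diff_ms = new - base
    diff_pct = (diff_ms / base * 100.0) if base else 0.0
    return diff_ms, diff_pct


def consolidate_steps(
    steps: List[Dict[str, Any]],
) -> Tuple[Dict[str, float], List[str], Dict[str, int]]:
    """Aggregate repeated steps (e.g. denoising iterations) into one row per stage.

    Returns per-stage total durations, first-seen execution order, and step counts.
    """
    durations: Dict[str, float] = {}
    order: List[str] = []
    counts: Dict[str, int] = {}
    for step in steps:
        name = step.get("name", "unknown")
        if name not in durations:
            order.append(name)
            durations[name] = 0.0
            counts[name] = 0
        durations[name] += step.get("duration_ms", 0.0)
        counts[name] += 1
    return durations, order, counts


def get_perf_status_emoji(base_val: float, new_val: float) -> str:
    """Per-stage status; assumed to mirror the +/-2% thresholds used for E2E latency."""
    _, pct = calculate_diff(base_val, new_val)
    return "✅" if pct < -2.0 else ("❌" if pct > 2.0 else "⚪️")


# Illustrative shape of a benchmark report this script appears to consume:
example_report = {
    "commit_hash": "abc1234",
    "total_duration_ms": 1532.7,
    "steps": [
        {"name": "Text Encoder", "duration_ms": 85.2},
        {"name": "Denoising Loop", "duration_ms": 21.3},
        {"name": "Denoising Loop", "duration_ms": 21.1},
        {"name": "VAE Decode", "duration_ms": 310.4},
    ],
}
```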
2 changes: 1 addition & 1 deletion python/sglang/multimodal_gen/docs/contributing.md
@@ -39,7 +39,7 @@ For PRs that impact **latency**, **throughput**, or **memory usage**, you **shou

3. **Compare**: run the compare script, which will print a Markdown table to the console
```bash
$ python python/sglang/multimodal_gen/benchmarks/compare_perf.py baseline.json new.json
$ python python/sglang/multimodal_gen/benchmarks/compare_perf.py baseline.json new.json [new2.json ...]
### Performance Comparison Report
...
```
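
With more than one candidate file, the script switches to the multi-column report and labels each column with the candidate file's basename. A minimal programmatic sketch, assuming the module is importable from the repository root (the file paths below are illustrative, not from the PR):

```python
# Illustrative only: file names are placeholders, and importing compare_perf
# as a module assumes it is on PYTHONPATH, which this PR does not guarantee.
from sglang.multimodal_gen.benchmarks.compare_perf import compare_benchmarks

# The first path is the baseline; each remaining path becomes one report column.
compare_benchmarks(["baseline.json", "candidate_a.json", "candidate_b.json"])
```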