Commit f7e7046

Brain97, shuxiguo, and mickqian authored and committed
[diffusion] feat: support comparing batch perf (sgl-project#14738)
Co-authored-by: shuxiguo <shuxiguo@meituan.com>
Co-authored-by: Mick <mickjagger19@icloud.com>
1 parent 7dc8faa commit f7e7046
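This change teaches `compare_perf.py` to accept an arbitrary list of result files instead of exactly two: the first file is treated as the baseline and every remaining file is compared against it. A minimal invocation sketch, reusing the placeholder file names from the updated docs:

```bash
# First argument is the baseline; every following file is compared against it.
# baseline.json, new.json, new2.json are placeholder names, not files in the repo.
python python/sglang/multimodal_gen/benchmarks/compare_perf.py baseline.json new.json new2.json
```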

File tree

2 files changed (+150 / -65 lines)


python/sglang/multimodal_gen/benchmarks/compare_perf.py

Lines changed: 149 additions & 64 deletions
@@ -1,5 +1,6 @@
 import argparse
 import json
+import os
 import re
 from datetime import datetime
 from typing import Any, Dict, List, Tuple
@@ -108,44 +109,40 @@ def _load_benchmark_file(file_path: str) -> Dict[str, Any]:
         return json.load(f)


-def compare_benchmarks(
-    baseline_path: str, new_path: str, output_format: str = "markdown"
-):
-    """
-    Compares two benchmark JSON files and prints a report.
-    """
-    try:
-        base_data = _load_benchmark_file(baseline_path)
-        new_data = _load_benchmark_file(new_path)
-    except Exception as e:
-        print(f"Error loading benchmark files: {e}")
-        return
-
-    base_e2e = base_data.get("total_duration_ms", 0)
-    new_e2e = new_data.get("total_duration_ms", 0)
-
-    diff_ms, diff_pct = calculate_diff(base_e2e, new_e2e)
-
+def _get_status_emoji_from_diff_percent(diff_pct):
     if diff_pct < -2.0:
-        status = "✅"
+        return "✅"
     elif diff_pct > 2.0:
-        status = "❌"
+        return "❌"
     else:
-        status = ""
+        return "⚪️"

-    # --- Stage Breakdown ---
-    base_durations, base_order, base_counts = consolidate_steps(
-        base_data.get("steps", [])
+
+def _print_single_comparison_report(
+    others_data, base_e2e, combined_order, base_durations, others_processed, base_counts
+):
+    new_data = others_data[0]
+    new_e2e = new_data.get("total_duration_ms", 0)
+    diff_ms, diff_pct = calculate_diff(base_e2e, new_e2e)
+    status = _get_status_emoji_from_diff_percent(diff_pct)
+
+    print("#### 1. High-level Summary")
+    print("| Metric | Baseline | New | Diff | Status |")
+    print("| :--- | :--- | :--- | :--- | :--- |")
+    print(
+        f"| **E2E Latency** | {base_e2e:.2f} ms | {new_e2e:.2f} ms | **{diff_ms:+.2f} ms ({diff_pct:+.1f}%)** | {status} |"
     )
-    new_durations, new_order, new_counts = consolidate_steps(new_data.get("steps", []))
+    print(
+        f"| **Throughput** | {1000 / base_e2e if base_e2e else 0:.2f} req/s | {1000 / new_e2e if new_e2e else 0:.2f} req/s | - | - |"
+    )
+    print("\n")

-    # Merge orders: Start with New order (execution order), append any missing from Base
-    combined_order = list(new_order)
-    for name in base_order:
-        if name not in combined_order:
-            combined_order.append(name)
+    print("#### 2. Stage Breakdown")
+    print("| Stage Name | Baseline (ms) | New (ms) | Diff (ms) | Diff (%) | Status |")
+    print("| :--- | :--- | :--- | :--- | :--- | :--- |")
+
+    new_durations, _, new_counts = others_processed[0]

-    stage_rows = []
     for stage in combined_order:
         b_val = base_durations.get(stage, 0.0)
         n_val = new_durations.get(stage, 0.0)
@@ -154,7 +151,6 @@ def compare_benchmarks(

         s_diff, s_pct = calculate_diff(b_val, n_val)

-        # Format count string if aggregated
         count_str = ""
         if stage == "Denoising Loop":
             count_str = (
@@ -163,54 +159,143 @@
                 else f" ({b_count}->{n_count} steps)"
             )

-        # filter noise: show if diff is > 0.5ms OR if it's a major stage (like Denoising Loop)
-        # always show Denoising Loop or stages with significant duration/diff
-        stage_rows.append((stage + count_str, b_val, n_val, s_diff, s_pct))
+        status_emoji = get_perf_status_emoji(b_val, n_val)
+        print(
+            f"| {stage}{count_str} | {b_val:.2f} | {n_val:.2f} | {s_diff:+.2f} | {s_pct:+.1f}% | {status_emoji} |"
+        )
+
+
+def _print_multi_comparison_report(
+    base_e2e,
+    others_data,
+    other_labels,
+    combined_order,
+    base_durations,
+    others_processed,
+):
+    print("#### 1. High-level Summary")
+    header = "| Metric | Baseline | " + " | ".join(other_labels) + " |"
+    sep = "| :--- | :--- | " + " | ".join([":---"] * len(other_labels)) + " |"
+    print(header)
+    print(sep)
+
+    # E2E Row
+    row_e2e = f"| **E2E Latency** | {base_e2e:.2f} ms |"
+    for i, d in enumerate(others_data):
+        val = d.get("total_duration_ms", 0)
+        diff_ms, diff_pct = calculate_diff(base_e2e, val)
+
+        status = _get_status_emoji_from_diff_percent(diff_pct)
+
+        row_e2e += f" {val:.2f} ms ({diff_pct:+.1f}%) {status} |"
+    print(row_e2e)
+    print("\n")
+
+    print("#### 2. Stage Breakdown")
+    # Header: Stage | Baseline | Label1 | Label2 ...
+    header = "| Stage Name | Baseline | " + " | ".join(other_labels) + " |"
+    sep = "| :--- | :--- | " + " | ".join([":---"] * len(other_labels)) + " |"
+    print(header)
+    print(sep)
+
+    for stage in combined_order:
+        b_val = base_durations.get(stage, 0.0)
+        row_str = f"| {stage} | {b_val:.2f} |"
+
+        for i, (n_durations, _, n_counts) in enumerate(others_processed):
+            n_val = n_durations.get(stage, 0.0)
+            _, s_pct = calculate_diff(b_val, n_val)
+            status_emoji = get_perf_status_emoji(b_val, n_val)
+
+            row_str += f" {n_val:.2f} ({s_pct:+.1f}%) {status_emoji} |"
+        print(row_str)
+
+
+def compare_benchmarks(file_paths: List[str], output_format: str = "markdown"):
+    """
+    Compares benchmark JSON files and prints a report.
+    First file is baseline, others will be compared against it.
+    """
+    if len(file_paths) < 2:
+        print("Error: Need at least 2 files to compare.")
+        return
+
+    try:
+        data_list = [_load_benchmark_file(f) for f in file_paths]
+    except Exception as e:
+        print(f"Error loading benchmark files: {e}")
+        return
+
+    base_data = data_list[0]
+    others_data = data_list[1:]
+
+    # Use filenames as labels if multiple comparisons, else just "New"
+    other_labels = [os.path.basename(p) for p in file_paths[1:]]
+
+    base_e2e = base_data.get("total_duration_ms", 0)
+
+    base_durations, base_order, base_counts = consolidate_steps(
+        base_data.get("steps", [])
+    )
+
+    others_processed = []
+    for d in others_data:
+        dur, order, counts = consolidate_steps(d.get("steps", []))
+        others_processed.append((dur, order, counts))
+
+    combined_order = []
+    # Collect all unique stages maintaining order from newest to baseline
+    for _, order, _ in reversed(others_processed):
+        for name in order:
+            if name not in combined_order:
+                combined_order.append(name)
+    for name in base_order:
+        if name not in combined_order:
+            combined_order.append(name)

     if output_format == "markdown":
         print("### Performance Comparison Report\n")

-        # Summary Table
-        print("#### 1. High-level Summary")
-        print("| Metric | Baseline | New | Diff | Status |")
-        print("| :--- | :--- | :--- | :--- | :--- |")
-        print(
-            f"| **E2E Latency** | {base_e2e:.2f} ms | {new_e2e:.2f} ms | **{diff_ms:+.2f} ms ({diff_pct:+.1f}%)** | {status} |"
-        )
-        print(
-            f"| **Throughput** | {1000 / base_e2e if base_e2e else 0:.2f} req/s | {1000 / new_e2e if new_e2e else 0:.2f} req/s | - | - |"
-        )
-        print("\n")
-
-        # Detailed Breakdown
-        print("#### 2. Stage Breakdown")
-        print(
-            "| Stage Name | Baseline (ms) | New (ms) | Diff (ms) | Diff (%) | Status |"
-        )
-        print("| :--- | :--- | :--- | :--- | :--- | :--- |")
-        for name, b, n, d, p in stage_rows:
-            name_str = name
-            status_emoji = get_perf_status_emoji(b, n)
-            print(
-                f"| {name_str} | {b:.2f} | {n:.2f} | {d:+.2f} | {p:+.1f}% | {status_emoji} |"
+        if len(others_data) == 1:
+            _print_single_comparison_report(
+                others_data,
+                base_e2e,
+                combined_order,
+                base_durations,
+                others_processed,
+                base_counts,
+            )
+        else:
+            _print_multi_comparison_report(
+                base_e2e,
+                others_data,
+                other_labels,
+                combined_order,
+                base_durations,
+                others_processed,
             )
-        print("\n")

+        print("\n")
     # Metadata
     print("<details>")
     print("<summary>Metadata</summary>\n")
     print(f"- Baseline Commit: `{base_data.get('commit_hash', 'N/A')}`")
-    print(f"- New Commit: `{new_data.get('commit_hash', 'N/A')}`")
+    for i, d in enumerate(others_data):
+        label = "New" if len(others_data) == 1 else other_labels[i]
+        print(f"- {label} Commit: `{d.get('commit_hash', 'N/A')}`")
     print(f"- Timestamp: {datetime.now().isoformat()}")
     print("</details>")


 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description="Compare two sglang-diffusion performance JSON files."
+        description="Compare sglang-diffusion performance JSON files."
+    )
+    parser.add_argument(
+        "files",
+        nargs="+",
+        help="List of JSON files. First is baseline, others are compared against it.",
     )
-    parser.add_argument("baseline", help="Path to the baseline JSON file")
-    parser.add_argument("new", help="Path to the new JSON file")
     args = parser.parse_args()

-    compare_benchmarks(args.baseline, args.new)
+    compare_benchmarks(args.files)

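To sanity-check the new interface end to end, one can feed the script hand-made result files. The sketch below is not part of the PR; it only uses the JSON keys the script actually reads (`total_duration_ms`, `steps`, `commit_hash`), assumes it is run from the repository root, and assumes `consolidate_steps` tolerates an empty `steps` list. All values and file names are placeholders.

```python
import json
import subprocess
import sys
import tempfile
from pathlib import Path

# Write two toy benchmark results with placeholder numbers.
tmp = Path(tempfile.mkdtemp())
for name, e2e in [("baseline.json", 1200.0), ("new.json", 1150.0)]:
    payload = {"total_duration_ms": e2e, "steps": [], "commit_hash": "placeholder"}
    (tmp / name).write_text(json.dumps(payload))

# First path is the baseline; every following path is compared against it.
subprocess.run(
    [
        sys.executable,
        "python/sglang/multimodal_gen/benchmarks/compare_perf.py",
        str(tmp / "baseline.json"),
        str(tmp / "new.json"),
    ],
    check=True,
)
```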

python/sglang/multimodal_gen/docs/contributing.md

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ For PRs that impact **latency**, **throughput**, or **memory usage**, you **shou

 3. **Compare**: run the compare script, which will print a Markdown table to the console
    ```bash
-   $ python python/sglang/multimodal_gen/benchmarks/compare_perf.py baseline.json new.json
+   $ python python/sglang/multimodal_gen/benchmarks/compare_perf.py baseline.json new.json [new2.json ...]
    ### Performance Comparison Report
    ...
    ```

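For more than two files, the header strings in `_print_multi_comparison_report` imply a report roughly shaped like the sketch below, with one column per extra file (column labels come from `os.path.basename` of each path). The cells are elided here rather than real measurements, and the status emoji in each cell is one of ✅/⚪️/❌ depending on the measured diff:

```
### Performance Comparison Report

#### 1. High-level Summary
| Metric | Baseline | new.json | new2.json |
| :--- | :--- | :--- | :--- |
| **E2E Latency** | ... ms | ... ms (...%) ⚪️ | ... ms (...%) ⚪️ |

#### 2. Stage Breakdown
| Stage Name | Baseline | new.json | new2.json |
| :--- | :--- | :--- | :--- |
| Denoising Loop | ... | ... (...%) ⚪️ | ... (...%) ⚪️ |
```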

Comments (0)