diff --git a/python/sglang/multimodal_gen/benchmarks/compare_perf.py b/python/sglang/multimodal_gen/benchmarks/compare_perf.py
index 2dfb087c79d2..d600d06d2b10 100644
--- a/python/sglang/multimodal_gen/benchmarks/compare_perf.py
+++ b/python/sglang/multimodal_gen/benchmarks/compare_perf.py
@@ -1,5 +1,6 @@
 import argparse
 import json
+import os
 import re
 from datetime import datetime
 from typing import Any, Dict, List, Tuple
@@ -108,44 +109,40 @@ def _load_benchmark_file(file_path: str) -> Dict[str, Any]:
     return json.load(f)
 
 
-def compare_benchmarks(
-    baseline_path: str, new_path: str, output_format: str = "markdown"
-):
-    """
-    Compares two benchmark JSON files and prints a report.
-    """
-    try:
-        base_data = _load_benchmark_file(baseline_path)
-        new_data = _load_benchmark_file(new_path)
-    except Exception as e:
-        print(f"Error loading benchmark files: {e}")
-        return
-
-    base_e2e = base_data.get("total_duration_ms", 0)
-    new_e2e = new_data.get("total_duration_ms", 0)
-
-    diff_ms, diff_pct = calculate_diff(base_e2e, new_e2e)
-
+def _get_status_emoji_from_diff_percent(diff_pct):
     if diff_pct < -2.0:
-        status = "✅"
+        return "✅"
     elif diff_pct > 2.0:
-        status = "❌"
+        return "❌"
     else:
-        status = ""
+        return "⚪️"
 
-    # --- Stage Breakdown ---
-    base_durations, base_order, base_counts = consolidate_steps(
-        base_data.get("steps", [])
+
+def _print_single_comparison_report(
+    others_data, base_e2e, combined_order, base_durations, others_processed, base_counts
+):
+    new_data = others_data[0]
+    new_e2e = new_data.get("total_duration_ms", 0)
+    diff_ms, diff_pct = calculate_diff(base_e2e, new_e2e)
+    status = _get_status_emoji_from_diff_percent(diff_pct)
+
+    print("#### 1. High-level Summary")
+    print("| Metric | Baseline | New | Diff | Status |")
+    print("| :--- | :--- | :--- | :--- | :--- |")
+    print(
+        f"| **E2E Latency** | {base_e2e:.2f} ms | {new_e2e:.2f} ms | **{diff_ms:+.2f} ms ({diff_pct:+.1f}%)** | {status} |"
     )
-    new_durations, new_order, new_counts = consolidate_steps(new_data.get("steps", []))
+    print(
+        f"| **Throughput** | {1000 / base_e2e if base_e2e else 0:.2f} req/s | {1000 / new_e2e if new_e2e else 0:.2f} req/s | - | - |"
+    )
+    print("\n")
 
-    # Merge orders: Start with New order (execution order), append any missing from Base
-    combined_order = list(new_order)
-    for name in base_order:
-        if name not in combined_order:
-            combined_order.append(name)
+    print("#### 2. Stage Breakdown")
+    print("| Stage Name | Baseline (ms) | New (ms) | Diff (ms) | Diff (%) | Status |")
+    print("| :--- | :--- | :--- | :--- | :--- | :--- |")
+
+    new_durations, _, new_counts = others_processed[0]
 
-    stage_rows = []
     for stage in combined_order:
         b_val = base_durations.get(stage, 0.0)
         n_val = new_durations.get(stage, 0.0)
@@ -154,7 +151,6 @@ def compare_benchmarks(
 
         s_diff, s_pct = calculate_diff(b_val, n_val)
 
-        # Format count string if aggregated
         count_str = ""
         if stage == "Denoising Loop":
             count_str = (
@@ -163,54 +159,143 @@ def compare_benchmarks(
                 else f" ({b_count}->{n_count} steps)"
             )
 
-        # filter noise: show if diff is > 0.5ms OR if it's a major stage (like Denoising Loop)
-        # always show Denoising Loop or stages with significant duration/diff
-        stage_rows.append((stage + count_str, b_val, n_val, s_diff, s_pct))
+        status_emoji = get_perf_status_emoji(b_val, n_val)
+        print(
+            f"| {stage}{count_str} | {b_val:.2f} | {n_val:.2f} | {s_diff:+.2f} | {s_pct:+.1f}% | {status_emoji} |"
+        )
+
+
+def _print_multi_comparison_report(
+    base_e2e,
+    others_data,
+    other_labels,
+    combined_order,
+    base_durations,
+    others_processed,
+):
+    print("#### 1. High-level Summary")
+    header = "| Metric | Baseline | " + " | ".join(other_labels) + " |"
+    sep = "| :--- | :--- | " + " | ".join([":---"] * len(other_labels)) + " |"
+    print(header)
+    print(sep)
+
+    # E2E Row
+    row_e2e = f"| **E2E Latency** | {base_e2e:.2f} ms |"
+    for i, d in enumerate(others_data):
+        val = d.get("total_duration_ms", 0)
+        diff_ms, diff_pct = calculate_diff(base_e2e, val)
+
+        status = _get_status_emoji_from_diff_percent(diff_pct)
+
+        row_e2e += f" {val:.2f} ms ({diff_pct:+.1f}%) {status} |"
+    print(row_e2e)
+    print("\n")
+
+    print("#### 2. Stage Breakdown")
+    # Header: Stage | Baseline | Label1 | Label2 ...
+    header = "| Stage Name | Baseline | " + " | ".join(other_labels) + " |"
+    sep = "| :--- | :--- | " + " | ".join([":---"] * len(other_labels)) + " |"
+    print(header)
+    print(sep)
+
+    for stage in combined_order:
+        b_val = base_durations.get(stage, 0.0)
+        row_str = f"| {stage} | {b_val:.2f} |"
+
+        for i, (n_durations, _, n_counts) in enumerate(others_processed):
+            n_val = n_durations.get(stage, 0.0)
+            _, s_pct = calculate_diff(b_val, n_val)
+            status_emoji = get_perf_status_emoji(b_val, n_val)
+
+            row_str += f" {n_val:.2f} ({s_pct:+.1f}%) {status_emoji} |"
+        print(row_str)
+
+
+def compare_benchmarks(file_paths: List[str], output_format: str = "markdown"):
+    """
+    Compares benchmark JSON files and prints a report.
+    First file is baseline, others will be compared against it.
+ """ + if len(file_paths) < 2: + print("Error: Need at least 2 files to compare.") + return + + try: + data_list = [_load_benchmark_file(f) for f in file_paths] + except Exception as e: + print(f"Error loading benchmark files: {e}") + return + + base_data = data_list[0] + others_data = data_list[1:] + + # Use filenames as labels if multiple comparisons, else just "New" + other_labels = [os.path.basename(p) for p in file_paths[1:]] + + base_e2e = base_data.get("total_duration_ms", 0) + + base_durations, base_order, base_counts = consolidate_steps( + base_data.get("steps", []) + ) + + others_processed = [] + for d in others_data: + dur, order, counts = consolidate_steps(d.get("steps", [])) + others_processed.append((dur, order, counts)) + + combined_order = [] + # Collect all unique stages maintaining order from newest to baseline + for _, order, _ in reversed(others_processed): + for name in order: + if name not in combined_order: + combined_order.append(name) + for name in base_order: + if name not in combined_order: + combined_order.append(name) if output_format == "markdown": print("### Performance Comparison Report\n") - # Summary Table - print("#### 1. High-level Summary") - print("| Metric | Baseline | New | Diff | Status |") - print("| :--- | :--- | :--- | :--- | :--- |") - print( - f"| **E2E Latency** | {base_e2e:.2f} ms | {new_e2e:.2f} ms | **{diff_ms:+.2f} ms ({diff_pct:+.1f}%)** | {status} |" - ) - print( - f"| **Throughput** | {1000 / base_e2e if base_e2e else 0:.2f} req/s | {1000 / new_e2e if new_e2e else 0:.2f} req/s | - | - |" - ) - print("\n") - - # Detailed Breakdown - print("#### 2. Stage Breakdown") - print( - "| Stage Name | Baseline (ms) | New (ms) | Diff (ms) | Diff (%) | Status |" - ) - print("| :--- | :--- | :--- | :--- | :--- | :--- |") - for name, b, n, d, p in stage_rows: - name_str = name - status_emoji = get_perf_status_emoji(b, n) - print( - f"| {name_str} | {b:.2f} | {n:.2f} | {d:+.2f} | {p:+.1f}% | {status_emoji} |" + if len(others_data) == 1: + _print_single_comparison_report( + others_data, + base_e2e, + combined_order, + base_durations, + others_processed, + base_counts, + ) + else: + _print_multi_comparison_report( + base_e2e, + others_data, + other_labels, + combined_order, + base_durations, + others_processed, ) - print("\n") + print("\n") # Metadata print("
") print("Metadata\n") print(f"- Baseline Commit: `{base_data.get('commit_hash', 'N/A')}`") - print(f"- New Commit: `{new_data.get('commit_hash', 'N/A')}`") + for i, d in enumerate(others_data): + label = "New" if len(others_data) == 1 else other_labels[i] + print(f"- {label} Commit: `{d.get('commit_hash', 'N/A')}`") print(f"- Timestamp: {datetime.now().isoformat()}") print("
") if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Compare two sglang-diffusion performance JSON files." + description="Compare sglang-diffusion performance JSON files." + ) + parser.add_argument( + "files", + nargs="+", + help="List of JSON files. First is baseline, others are compared against it.", ) - parser.add_argument("baseline", help="Path to the baseline JSON file") - parser.add_argument("new", help="Path to the new JSON file") args = parser.parse_args() - compare_benchmarks(args.baseline, args.new) + compare_benchmarks(args.files) diff --git a/python/sglang/multimodal_gen/docs/contributing.md b/python/sglang/multimodal_gen/docs/contributing.md index fb8b4456b421..33a4699b8fa2 100644 --- a/python/sglang/multimodal_gen/docs/contributing.md +++ b/python/sglang/multimodal_gen/docs/contributing.md @@ -39,7 +39,7 @@ For PRs that impact **latency**, **throughput**, or **memory usage**, you **shou 3. **Compare**: run the compare script, which will print a Markdown table to the console ```bash - $ python python/sglang/multimodal_gen/benchmarks/compare_perf.py baseline.json new.json + $ python python/sglang/multimodal_gen/benchmarks/compare_perf.py baseline.json new.json [new2.json ...] ### Performance Comparison Report ... ```