import argparse
import json
+import os
import re
from datetime import datetime
from typing import Any, Dict, List, Tuple
@@ -108,44 +109,40 @@ def _load_benchmark_file(file_path: str) -> Dict[str, Any]:
        return json.load(f)


-def compare_benchmarks(
-    baseline_path: str, new_path: str, output_format: str = "markdown"
-):
-    """
-    Compares two benchmark JSON files and prints a report.
-    """
-    try:
-        base_data = _load_benchmark_file(baseline_path)
-        new_data = _load_benchmark_file(new_path)
-    except Exception as e:
-        print(f"Error loading benchmark files: {e}")
-        return
-
-    base_e2e = base_data.get("total_duration_ms", 0)
-    new_e2e = new_data.get("total_duration_ms", 0)
-
-    diff_ms, diff_pct = calculate_diff(base_e2e, new_e2e)
-
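+# ✅ = >2% faster, ❌ = >2% slower; anything within ±2% is treated as noise.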
+def _get_status_emoji_from_diff_percent(diff_pct):
    if diff_pct < -2.0:
-        status = "✅"
+        return "✅"
    elif diff_pct > 2.0:
-        status = "❌"
+        return "❌"
    else:
-        status = "⚪️"
+        return "⚪️"

-    # --- Stage Breakdown ---
-    base_durations, base_order, base_counts = consolidate_steps(
-        base_data.get("steps", [])
+
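+# Renders the original two-file report: high-level summary plus per-stage breakdown.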
+def _print_single_comparison_report(
+    others_data, base_e2e, combined_order, base_durations, others_processed, base_counts
+):
+    new_data = others_data[0]
+    new_e2e = new_data.get("total_duration_ms", 0)
+    diff_ms, diff_pct = calculate_diff(base_e2e, new_e2e)
+    status = _get_status_emoji_from_diff_percent(diff_pct)
+
+    print("#### 1. High-level Summary")
+    print("| Metric | Baseline | New | Diff | Status |")
+    print("| :--- | :--- | :--- | :--- | :--- |")
+    print(
+        f"| **E2E Latency** | {base_e2e:.2f} ms | {new_e2e:.2f} ms | **{diff_ms:+.2f} ms ({diff_pct:+.1f}%)** | {status} |"
    )
-    new_durations, new_order, new_counts = consolidate_steps(new_data.get("steps", []))
+    print(
+        f"| **Throughput** | {1000 / base_e2e if base_e2e else 0:.2f} req/s | {1000 / new_e2e if new_e2e else 0:.2f} req/s | - | - |"
+    )
+    print("\n")

-    # Merge orders: Start with New order (execution order), append any missing from Base
-    combined_order = list(new_order)
-    for name in base_order:
-        if name not in combined_order:
-            combined_order.append(name)
+    print("#### 2. Stage Breakdown")
+    print("| Stage Name | Baseline (ms) | New (ms) | Diff (ms) | Diff (%) | Status |")
+    print("| :--- | :--- | :--- | :--- | :--- | :--- |")
+
+    new_durations, _, new_counts = others_processed[0]

-    stage_rows = []
    for stage in combined_order:
        b_val = base_durations.get(stage, 0.0)
        n_val = new_durations.get(stage, 0.0)
@@ -154,7 +151,6 @@ def compare_benchmarks(

        s_diff, s_pct = calculate_diff(b_val, n_val)

-        # Format count string if aggregated
        count_str = ""
        if stage == "Denoising Loop":
            count_str = (
@@ -163,54 +159,143 @@ def compare_benchmarks(
                else f" ({b_count}->{n_count} steps)"
            )

-        # filter noise: show if diff is > 0.5ms OR if it's a major stage (like Denoising Loop)
-        # always show Denoising Loop or stages with significant duration/diff
-        stage_rows.append((stage + count_str, b_val, n_val, s_diff, s_pct))
+        status_emoji = get_perf_status_emoji(b_val, n_val)
+        print(
+            f"| {stage}{count_str} | {b_val:.2f} | {n_val:.2f} | {s_diff:+.2f} | {s_pct:+.1f}% | {status_emoji} |"
+        )
+
+
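+# Renders the N-way report: one column per comparison file, labeled by filename.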
+def _print_multi_comparison_report(
+    base_e2e,
+    others_data,
+    other_labels,
+    combined_order,
+    base_durations,
+    others_processed,
+):
+    print("#### 1. High-level Summary")
+    header = "| Metric | Baseline | " + " | ".join(other_labels) + " |"
+    sep = "| :--- | :--- | " + " | ".join([":---"] * len(other_labels)) + " |"
+    print(header)
+    print(sep)
+
+    # E2E Row
+    row_e2e = f"| **E2E Latency** | {base_e2e:.2f} ms |"
+    for i, d in enumerate(others_data):
+        val = d.get("total_duration_ms", 0)
+        diff_ms, diff_pct = calculate_diff(base_e2e, val)
+
+        status = _get_status_emoji_from_diff_percent(diff_pct)
+
+        row_e2e += f" {val:.2f} ms ({diff_pct:+.1f}%) {status} |"
+    print(row_e2e)
+    print("\n")
+
+    print("#### 2. Stage Breakdown")
+    # Header: Stage | Baseline | Label1 | Label2 ...
+    header = "| Stage Name | Baseline | " + " | ".join(other_labels) + " |"
+    sep = "| :--- | :--- | " + " | ".join([":---"] * len(other_labels)) + " |"
+    print(header)
+    print(sep)
+
+    for stage in combined_order:
+        b_val = base_durations.get(stage, 0.0)
+        row_str = f"| {stage} | {b_val:.2f} |"
+
+        for i, (n_durations, _, n_counts) in enumerate(others_processed):
+            n_val = n_durations.get(stage, 0.0)
+            _, s_pct = calculate_diff(b_val, n_val)
+            status_emoji = get_perf_status_emoji(b_val, n_val)
+
+            row_str += f" {n_val:.2f} ({s_pct:+.1f}%) {status_emoji} |"
+        print(row_str)
+
+
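+# Entry point: loads every file, consolidates step timings, then dispatches to a renderer above.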
+def compare_benchmarks(file_paths: List[str], output_format: str = "markdown"):
+    """
+    Compares benchmark JSON files and prints a report.
+    The first file is the baseline; the others are compared against it.
+    """
+    if len(file_paths) < 2:
+        print("Error: Need at least 2 files to compare.")
+        return
+
+    try:
+        data_list = [_load_benchmark_file(f) for f in file_paths]
+    except Exception as e:
+        print(f"Error loading benchmark files: {e}")
+        return
+
+    base_data = data_list[0]
+    others_data = data_list[1:]
+
+    # Use filenames as labels if multiple comparisons, else just "New"
+    other_labels = [os.path.basename(p) for p in file_paths[1:]]
+
+    base_e2e = base_data.get("total_duration_ms", 0)
+
+    base_durations, base_order, base_counts = consolidate_steps(
+        base_data.get("steps", [])
+    )
+
+    others_processed = []
+    for d in others_data:
+        dur, order, counts = consolidate_steps(d.get("steps", []))
+        others_processed.append((dur, order, counts))
+
+    combined_order = []
+    # Collect all unique stages, maintaining order from newest to baseline
+    for _, order, _ in reversed(others_processed):
+        for name in order:
+            if name not in combined_order:
+                combined_order.append(name)
+    for name in base_order:
+        if name not in combined_order:
+            combined_order.append(name)

    if output_format == "markdown":
        print("### Performance Comparison Report\n")

-        # Summary Table
-        print("#### 1. High-level Summary")
-        print("| Metric | Baseline | New | Diff | Status |")
-        print("| :--- | :--- | :--- | :--- | :--- |")
-        print(
-            f"| **E2E Latency** | {base_e2e:.2f} ms | {new_e2e:.2f} ms | **{diff_ms:+.2f} ms ({diff_pct:+.1f}%)** | {status} |"
-        )
-        print(
-            f"| **Throughput** | {1000 / base_e2e if base_e2e else 0:.2f} req/s | {1000 / new_e2e if new_e2e else 0:.2f} req/s | - | - |"
-        )
-        print("\n")
-
-        # Detailed Breakdown
-        print("#### 2. Stage Breakdown")
-        print(
-            "| Stage Name | Baseline (ms) | New (ms) | Diff (ms) | Diff (%) | Status |"
-        )
-        print("| :--- | :--- | :--- | :--- | :--- | :--- |")
-        for name, b, n, d, p in stage_rows:
-            name_str = name
-            status_emoji = get_perf_status_emoji(b, n)
-            print(
-                f"| {name_str} | {b:.2f} | {n:.2f} | {d:+.2f} | {p:+.1f}% | {status_emoji} |"
+        if len(others_data) == 1:
+            _print_single_comparison_report(
+                others_data,
+                base_e2e,
+                combined_order,
+                base_durations,
+                others_processed,
+                base_counts,
+            )
+        else:
+            _print_multi_comparison_report(
+                base_e2e,
+                others_data,
+                other_labels,
+                combined_order,
+                base_durations,
+                others_processed,
            )
-        print("\n")

+        print("\n")
        # Metadata
        print("<details>")
        print("<summary>Metadata</summary>\n")
        print(f"- Baseline Commit: `{base_data.get('commit_hash', 'N/A')}`")
-        print(f"- New Commit: `{new_data.get('commit_hash', 'N/A')}`")
+        for i, d in enumerate(others_data):
+            label = "New" if len(others_data) == 1 else other_labels[i]
+            print(f"- {label} Commit: `{d.get('commit_hash', 'N/A')}`")
        print(f"- Timestamp: {datetime.now().isoformat()}")
        print("</details>")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
-        description="Compare two sglang-diffusion performance JSON files."
+        description="Compare sglang-diffusion performance JSON files."
+    )
+    parser.add_argument(
+        "files",
+        nargs="+",
+        help="List of JSON files. First is baseline, others are compared against it.",
    )
-    parser.add_argument("baseline", help="Path to the baseline JSON file")
-    parser.add_argument("new", help="Path to the new JSON file")
    args = parser.parse_args()

+    compare_benchmarks(args.files)
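
With this change the CLI accepts any number of result files; a hypothetical invocation (script and file names are placeholders):

    python compare_benchmarks.py baseline.json run_a.json run_b.json

The first path supplies the baseline column; each remaining file is compared against it under its own filename label.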