Skip to content

Commit e5c0f59

Browse files
yhyang201 and mickqian authored
[diffusion] CI: send nightly-test outputs of diffusion to slack for correctness monitoring (#13833)
Co-authored-by: Mick <[email protected]>
1 parent 981ca83 commit e5c0f59

File tree

3 files changed

+292
-0
lines changed

3 files changed

+292
-0
lines changed

.github/workflows/nightly-test-nvidia.yml

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,68 @@ jobs:
193193
run: |
194194
python3 scripts/ci/publish_traces.py --traces-dir test/srt/performance_profiles_vlms
195195
196+
# diffusion performance tests
197+
nightly-test-multimodal-server-1-gpu:
198+
if: github.repository == 'sgl-project/sglang'
199+
runs-on: 1-gpu-runner
200+
strategy:
201+
fail-fast: false
202+
max-parallel: 5
203+
matrix:
204+
part: [0, 1]
205+
steps:
206+
- name: Checkout code
207+
uses: actions/checkout@v4
208+
209+
- name: Install dependencies
210+
run: |
211+
bash scripts/ci/ci_install_dependency.sh diffusion
212+
pip install slack_sdk
213+
214+
- name: Run diffusion server tests
215+
env:
216+
SGLANG_DIFFUSION_SLACK_TOKEN: ${{ secrets.SGLANG_DIFFUSION_SLACK_TOKEN }}
217+
GITHUB_RUN_ID: ${{ github.run_id }}
218+
219+
timeout-minutes: 60
220+
run: |
221+
cd python
222+
python3 sglang/multimodal_gen/test/run_suite.py \
223+
--suite 1-gpu \
224+
--partition-id ${{ matrix.part }} \
225+
--total-partitions 2
226+
227+
228+
nightly-test-multimodal-server-2-gpu:
229+
if: github.repository == 'sgl-project/sglang'
230+
runs-on: 2-gpu-runner
231+
strategy:
232+
fail-fast: false
233+
max-parallel: 5
234+
matrix:
235+
part: [0, 1]
236+
steps:
237+
- name: Checkout code
238+
uses: actions/checkout@v4
239+
240+
- name: Install dependencies
241+
run: |
242+
bash scripts/ci/ci_install_dependency.sh diffusion
243+
pip install slack_sdk
244+
245+
- name: Run diffusion server tests
246+
env:
247+
SGLANG_DIFFUSION_SLACK_TOKEN: ${{ secrets.SGLANG_DIFFUSION_SLACK_TOKEN }}
248+
GITHUB_RUN_ID: ${{ github.run_id }}
249+
250+
timeout-minutes: 60
251+
run: |
252+
cd python
253+
python3 sglang/multimodal_gen/test/run_suite.py \
254+
--suite 2-gpu \
255+
--partition-id ${{ matrix.part }} \
256+
--total-partitions 2
257+
196258
# B200 Performance tests - 4 GPU
197259
nightly-test-perf-4-gpu-b200:
198260
if: github.repository == 'sgl-project/sglang'
@@ -275,6 +337,8 @@ jobs:
275337
- nightly-test-text-perf-2-gpu-runner
276338
- nightly-test-vlm-accuracy-2-gpu-runner
277339
- nightly-test-vlm-perf-2-gpu-runner
340+
- nightly-test-multimodal-server-1-gpu
341+
- nightly-test-multimodal-server-2-gpu
278342
- nightly-test-perf-4-gpu-b200
279343
- nightly-test-perf-8-gpu-b200
280344
runs-on: ubuntu-latest

python/sglang/multimodal_gen/test/server/test_server_common.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
from __future__ import annotations
99

10+
import base64
1011
import os
1112
import time
1213
from pathlib import Path
@@ -32,6 +33,7 @@
3233
PerformanceSummary,
3334
ScenarioConfig,
3435
)
36+
from sglang.multimodal_gen.test.slack_utils import upload_file_to_slack
3537
from sglang.multimodal_gen.test.test_utils import (
3638
get_dynamic_server_port,
3739
read_perf_logs,
@@ -225,6 +227,19 @@ def _create_and_download_video(
225227
resp = client.videos.download_content(video_id=video_id) # type: ignore[attr-defined]
226228
content = resp.read()
227229
validate_openai_video(content)
230+
231+
tmp_path = f"{video_id}.mp4"
232+
with open(tmp_path, "wb") as f:
233+
f.write(content)
234+
upload_file_to_slack(
235+
case_id=case.id,
236+
model=case.model_path,
237+
prompt=case.prompt,
238+
file_path=tmp_path,
239+
origin_file_path=case.image_path,
240+
)
241+
os.remove(tmp_path)
242+
228243
return video_id
229244

230245
# for all tests, seconds = case.seconds or fallback 4 seconds
@@ -248,6 +263,19 @@ def generate_image() -> str:
248263
)
249264
result = response.parse()
250265
validate_image(result.data[0].b64_json)
266+
267+
img_data = base64.b64decode(result.data[0].b64_json)
268+
tmp_path = f"{result.created}.png"
269+
with open(tmp_path, "wb") as f:
270+
f.write(img_data)
271+
upload_file_to_slack(
272+
case_id=case.id,
273+
model=case.model_path,
274+
prompt=case.prompt,
275+
file_path=tmp_path,
276+
)
277+
os.remove(tmp_path)
278+
251279
return str(result.created)
252280

253281
def generate_image_edit() -> str:
@@ -276,6 +304,20 @@ def generate_image_edit() -> str:
276304

277305
result = response.parse()
278306
validate_image(result.data[0].b64_json)
307+
308+
img_data = base64.b64decode(result.data[0].b64_json)
309+
tmp_path = f"{rid}.png"
310+
with open(tmp_path, "wb") as f:
311+
f.write(img_data)
312+
upload_file_to_slack(
313+
case_id=case.id,
314+
model=case.model_path,
315+
prompt=case.edit_prompt,
316+
file_path=tmp_path,
317+
origin_file_path=case.image_path,
318+
)
319+
os.remove(tmp_path)
320+
279321
return rid
280322

281323
# -------------------------
Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
"""
2+
This file uploads the media generated in diffusion-nightly-test to a Slack channel of SGLang
3+
"""
4+
5+
import logging
6+
import os
7+
import tempfile
8+
from datetime import datetime
9+
from urllib.parse import urlparse
10+
from urllib.request import urlopen
11+
12+
logging.basicConfig(level=logging.INFO)
13+
logger = logging.getLogger(__name__)
14+
15+
import inspect
16+
17+
try:
    import sglang.multimodal_gen.test.server.testcase_configs as configs
    from sglang.multimodal_gen.test.server.testcase_configs import DiffusionTestCase

    # Collect every attribute of the config module whose name marks it as a
    # case list (ends with "_CASES" or contains "_CASES_").
    ALL_CASES = []
    for name, value in inspect.getmembers(configs):
        if not (name.endswith("_CASES") or "_CASES_" in name):
            continue
        # Only non-empty lists whose first element is a DiffusionTestCase
        # contribute; an empty list with a matching name adds nothing.
        if (
            isinstance(value, list)
            and value
            and isinstance(value[0], DiffusionTestCase)
        ):
            ALL_CASES.extend(value)

    # Deduplicate cases by ID, keeping the first occurrence of each ID.
    seen_ids = set()
    unique_cases = []
    for c in ALL_CASES:
        if c.id in seen_ids:
            continue
        seen_ids.add(c.id)
        unique_cases.append(c)
    ALL_CASES = unique_cases

except Exception as e:
    # Best effort: if the case definitions cannot be loaded, fall back to an
    # empty case list instead of failing at import time.
    logger.warning(f"Failed to import test cases: {e}")
    ALL_CASES = []
46+
47+
48+
def _get_status_message(run_id, current_case_id, thread_messages=None):
    """Build the text of the parent Slack message for one nightly run.

    The message starts with a header holding today's date (day/month), the
    GitHub run ID and the number of entries in ``ALL_CASES``. When
    ``ALL_CASES`` is empty only that header is returned; otherwise a
    fixed-width table inside a code fence lists every case with a status mark.

    Args:
        run_id: Value rendered after "GitHub Run ID:" in the header.
        current_case_id: Case to mark as done regardless of the thread
            contents; ignored when falsy.
        thread_messages: Optional iterable of message dicts. A case is marked
            done when any message's ``"text"`` contains its
            ``*Case ID:* `<id>``` marker.

    Returns:
        The full message text, lines joined with newlines.
    """
    today = datetime.now().strftime("%d/%m")
    header = (
        f"*🧵 for nightly test of {today}*\n"
        f"*GitHub Run ID:* {run_id}\n"
        f"*Total Tasks:* {len(ALL_CASES)}"
    )
    if not ALL_CASES:
        return header

    pending_mark = "⏳"
    done_mark = "✅"

    # Every known case starts as pending.
    status_by_case = {case.id: pending_mark for case in ALL_CASES}

    # A case counts as done once a message in the thread carries its marker
    # (format: *Case ID:* `case_id`).
    for reply in thread_messages or ():
        body = reply.get("text", "")
        for case in ALL_CASES:
            if f"*Case ID:* `{case.id}`" in body:
                status_by_case[case.id] = done_mark

    if current_case_id:
        status_by_case[current_case_id] = done_mark

    # The first column is as wide as the longest case ID, and never narrower
    # than the column title itself.
    width = max(len("Case ID"), *(len(case.id) for case in ALL_CASES))

    rows = [
        f"| {case.id.ljust(width)} | {status_by_case.get(case.id, pending_mark)} |"
        for case in ALL_CASES
    ]

    # Markdown-style table wrapped in a code block.
    table = [
        "```",
        f"| {'Case ID'.ljust(width)} | Status |",
        f"| {'-' * width} | :----: |",
        *rows,
        "```",
    ]

    return "\n".join([header, "", "*Tasks Status:*", *table])
89+
90+
91+
def upload_file_to_slack(
    case_id: str = None,
    model: str = None,
    prompt: str = None,
    file_path: str = None,
    origin_file_path: str = None,
) -> bool:
    """Upload a generated media file to the nightly-test Slack thread.

    The file is posted to a hard-coded channel as a reply in the thread whose
    parent message mentions the current ``GITHUB_RUN_ID`` (``"local"`` when
    the variable is unset). If no such message is found in the fetched channel
    history, a new parent message is posted first. After the upload, the
    parent message is rewritten with the status table derived from the thread
    replies.

    Args:
        case_id: Test case identifier, written into the upload comment and
            used to mark the case as done in the status table.
        model: Model path shown in the upload comment.
        prompt: Prompt shown in the upload comment.
        file_path: Local path of the generated file; must exist.
        origin_file_path: Optional input file. An ``http://`` / ``https://``
            URL is downloaded to a temporary file first; an existing local
            path is uploaded as is. It is posted before the generated file.

    Returns:
        ``True`` when the upload call succeeded. ``False`` when the
        ``SGLANG_DIFFUSION_SLACK_TOKEN`` environment variable is missing,
        ``file_path`` is missing or does not exist, or any exception occurred
        (exceptions are logged, not raised).
    """
    temp_path = None
    try:
        # Imported lazily so the module can be imported without slack_sdk; a
        # missing package is caught below and reported as a failed upload.
        from slack_sdk import WebClient

        run_id = os.getenv("GITHUB_RUN_ID", "local")

        token = os.environ.get("SGLANG_DIFFUSION_SLACK_TOKEN")
        if not token:
            logger.info("Slack upload failed: no token")
            return False

        if not file_path or not os.path.exists(file_path):
            logger.info("Slack upload failed: no file path")
            return False

        # Check the full scheme prefix: a bare "http" prefix would also match
        # local paths such as "http_cache/input.png".
        if origin_file_path and origin_file_path.startswith(
            ("http://", "https://")
        ):
            suffix = os.path.splitext(urlparse(origin_file_path).path)[1] or ".tmp"
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tf:
                # Record the path before downloading so the `finally` clause
                # removes the file even when the download fails.
                temp_path = tf.name
                # Bound the download so a stalled server cannot hang the test.
                with urlopen(origin_file_path, timeout=60) as response:
                    tf.write(response.read())
            origin_file_path = temp_path

        uploads = [{"file": file_path, "title": "Generated Image"}]
        if origin_file_path and os.path.exists(origin_file_path):
            uploads.insert(0, {"file": origin_file_path, "title": "Original Image"})

        message = (
            f"*Case ID:* `{case_id}`\n" f"*Model:* `{model}`\n" f"*Prompt:* {prompt}"
        )

        client = WebClient(token=token)
        channel_id = "C0A02NDF7UY"
        thread_ts = None

        # Look for an existing message of this run to reply under.
        parent_msg_text = None
        try:
            history = client.conversations_history(channel=channel_id, limit=100)
            for msg in history.get("messages", []):
                if f"*GitHub Run ID:* {run_id}" in msg.get("text", ""):
                    # Use thread_ts if it exists (msg is a reply), otherwise use ts (msg is a parent)
                    thread_ts = msg.get("thread_ts") or msg.get("ts")
                    parent_msg_text = msg.get("text", "")
                    logger.info(f"Found thread_ts: {thread_ts}")
                    break
        except Exception as e:
            logger.warning(f"Failed to search slack history: {e}")

        # No thread for this run yet: start one. If that fails too, the upload
        # below still goes ahead with thread_ts=None.
        if not thread_ts:
            try:
                text = _get_status_message(run_id, case_id)
                response = client.chat_postMessage(channel=channel_id, text=text)
                thread_ts = response["ts"]
            except Exception as e:
                logger.warning(f"Failed to create parent thread: {e}")

        # Upload first to ensure it's in history
        client.files_upload_v2(
            channel=channel_id,
            file_uploads=uploads,
            initial_comment=message,
            thread_ts=thread_ts,
        )

        # Then update status based on thread replies
        if thread_ts:
            try:
                replies = client.conversations_replies(
                    channel=channel_id, ts=thread_ts, limit=200
                )
                messages = replies.get("messages", [])
                new_text = _get_status_message(run_id, case_id, messages)

                # Skip the update call when the text found earlier is unchanged.
                if new_text != parent_msg_text:
                    client.chat_update(channel=channel_id, ts=thread_ts, text=new_text)
            except Exception as e:
                # A stale status table must not turn a successful upload into a failure.
                logger.warning(f"Failed to update parent message: {e}")

        logger.info(f"File uploaded successfully: {os.path.basename(file_path)}")
        return True

    except Exception as e:
        # Best effort: reporting to Slack must never fail the calling test.
        logger.info(f"Slack upload failed: {e}")
        return False
    finally:
        # Remove the downloaded copy of the original file, if one was created.
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)

0 commit comments

Comments
 (0)