diff --git a/.github/workflows/ci-monitor.yml b/.github/workflows/ci-monitor.yml index 89d7ec877960..4ec34585d543 100644 --- a/.github/workflows/ci-monitor.yml +++ b/.github/workflows/ci-monitor.yml @@ -2,7 +2,8 @@ name: CI Monitor on: schedule: - - cron: '0 */12 * * *' + - cron: '0 */12 * * *' # Every 12 hours for main analysis + - cron: '0 6 * * *' # Daily at 6:00 AM UTC for balance analysis workflow_dispatch: inputs: limit: @@ -63,3 +64,38 @@ jobs: scripts/ci_monitor/ci_analysis_*.json scripts/ci_monitor/performance_tables_* retention-days: 30 + + ci-monitor-balance: + if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request' + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.9' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install requests + + - name: Run Test Balance Analysis + env: + GITHUB_TOKEN: ${{ secrets.GH_PAT_FOR_NIGHTLY_CI }} + PYTHONUNBUFFERED: 1 + PYTHONIOENCODING: utf-8 + run: | + cd scripts/ci_monitor + python ci_analyzer_balance.py --token $GITHUB_TOKEN --limit ${{ inputs.limit || '1000' }} --output test_balance_report_$(date +%Y%m%d_%H%M%S).json + + - name: Upload Balance Analysis Results + uses: actions/upload-artifact@v4 + with: + name: test-balance-results-${{ github.run_number }} + path: | + scripts/ci_monitor/test_balance_report_*.json + scripts/ci_monitor/test_balance_report_*.csv + retention-days: 30 diff --git a/scripts/ci_monitor/README.md b/scripts/ci_monitor/README.md index 4e87f7ab140b..c77e60786134 100644 --- a/scripts/ci_monitor/README.md +++ b/scripts/ci_monitor/README.md @@ -2,10 +2,11 @@ > **Note**: This README.md is primarily generated by Claude 4 with some manual adjustments. -A comprehensive toolkit to analyze CI failures and performance trends for the SGLang project. This toolkit includes two main tools: +A comprehensive toolkit to analyze CI failures and performance trends for the SGLang project. This toolkit includes three main tools: 1. **CI Analyzer** (`ci_analyzer.py`): Analyzes CI failures and provides detailed failure pattern analysis 2. **Performance Analyzer** (`ci_analyzer_perf.py`): Tracks performance metrics over time and generates trend charts +3. 
**Test Balance Analyzer** (`ci_analyzer_balance.py`): Analyzes test time gaps between elapsed and estimated times to help balance CI ## Features @@ -26,6 +27,15 @@ A comprehensive toolkit to analyze CI failures and performance trends for the SG - **Comprehensive Metrics**: Track output throughput, E2E latency, TTFT, accept length, and more - **Time-Based Sampling**: Intelligent sampling strategy to cover extended time periods (up to 30 days) with limited API calls +### Test Balance Analyzer (`ci_analyzer_balance.py`) +- **Time Gap Analysis**: Identify GPU tests with large gaps between elapsed and estimated times +- **CI Balancing**: Help optimize CI by identifying tests that need time adjustments +- **Gap Tracking**: Track maximum time gaps for each test across multiple CI runs +- **PR Test Focus**: Only analyzes GPU jobs from pr-test.yml workflow (excludes AMD and other workflows) +- **Ranking System**: Sort tests by time gap severity to prioritize adjustments +- **CSV Export**: Export analysis results in CSV format for easy review +- **GitHub Integration**: Generate GitHub Actions summaries with recommendations + ### Common Features - **Automated Monitoring**: GitHub Actions workflow for continuous CI and performance monitoring @@ -45,6 +55,13 @@ Additional dependencies required for chart generation: pip install requests matplotlib pandas ``` +### For Test Balance Analyzer +No additional dependencies required beyond Python standard library and `requests`: + +```bash +pip install requests +``` + ## Usage ### CI Analyzer @@ -97,6 +114,25 @@ python ci_analyzer_perf.py --token YOUR_GITHUB_TOKEN --start-date $(date -d '7 d python ci_analyzer_perf.py --token YOUR_GITHUB_TOKEN --limit 1000 --upload-to-github ``` +### Test Balance Analyzer + +#### Basic Usage + +```bash +# Analyze PR Test GPU job time gaps from recent CI runs +python ci_analyzer_balance.py --token YOUR_GITHUB_TOKEN +``` + +#### Advanced Usage + +```bash +# Analyze last 1000 PR Test GPU CI runs for comprehensive test balance analysis +python ci_analyzer_balance.py --token YOUR_GITHUB_TOKEN --limit 1000 + +# Custom output file +python ci_analyzer_balance.py --token YOUR_GITHUB_TOKEN --limit 500 --output my_balance_analysis.json +``` + **Important**: Make sure your GitHub token has `repo` and `workflow` permissions, otherwise you'll get 404 errors. ## Data Collection Strategies @@ -183,6 +219,14 @@ Use `--start-date` and `--end-date` parameters to get **ALL** CI runs within a s | `--end-date` | None | End date for date range query (YYYY-MM-DD format) | | `--upload-to-github` | False | Upload results to sglang-bot/sglang-ci-data repository | +### Test Balance Analyzer Parameters + +| Parameter | Default | Description | +|-----------|---------|-------------| +| `--token` | Required | GitHub Personal Access Token | +| `--limit` | 1000 | Number of CI runs to analyze | +| `--output` | test_balance_report.json | Output JSON file for detailed analysis data | + ## Getting GitHub Token 1. 
Go to [GitHub Settings > Personal Access Tokens](https://github.com/settings/tokens) diff --git a/scripts/ci_monitor/ci_analyzer.py b/scripts/ci_monitor/ci_analyzer.py index 922f3ef442e8..47231f50fbe8 100755 --- a/scripts/ci_monitor/ci_analyzer.py +++ b/scripts/ci_monitor/ci_analyzer.py @@ -1,8 +1,4 @@ #!/usr/bin/env python3 -""" -SGLang CI Analyzer -Simple tool to analyze CI failures for SGLang project -""" import argparse import json @@ -17,7 +13,6 @@ class SGLangCIAnalyzer: - """SGLang CI Analyzer""" def __init__(self, token: str): self.token = token @@ -32,7 +27,6 @@ def __init__(self, token: str): self.session.headers.update(self.headers) def get_recent_runs(self, limit: int = 100, branch: str = None) -> List[Dict]: - """Get recent CI run data""" branch_info = f" from branch '{branch}'" if branch else "" print(f"Fetching {limit} recent CI runs{branch_info}...") @@ -61,7 +55,7 @@ def get_recent_runs(self, limit: int = 100, branch: str = None) -> List[Dict]: break page += 1 - time.sleep(0.1) # Avoid API rate limits + time.sleep(0.1) except requests.exceptions.RequestException as e: print(f"Error fetching CI data: {e}") @@ -70,10 +64,10 @@ def get_recent_runs(self, limit: int = 100, branch: str = None) -> List[Dict]: return all_runs[:limit] def analyze_ci_failures(self, runs: List[Dict]) -> Dict: - """Analyze CI failure patterns (CUDA jobs only)""" - print("Analyzing CI failure data (CUDA only)...") + print( + "Analyzing CI failure data (pr-test.yml, vllm-dependency-test.yml, nightly-test.yml jobs only)..." + ) - # SGLang specific job categories (CUDA only) job_categories = { "build": [ "build-test", @@ -137,7 +131,6 @@ def analyze_ci_failures(self, runs: List[Dict]) -> Dict: total_runs = len(runs) for i, run in enumerate(runs, 1): - # Show progress every 10% or every 50 runs, whichever is smaller if i % max(1, min(50, total_runs // 10)) == 0 or i == total_runs: progress = (i / total_runs) * 100 print(f"Progress: {i}/{total_runs} ({progress:.1f}%)") @@ -148,7 +141,6 @@ def analyze_ci_failures(self, runs: List[Dict]) -> Dict: run_number = run.get("run_number") created_at = run.get("created_at") - # Count run status if run_status == "failure": stats["failed_runs"] += 1 elif run_status == "success": @@ -158,7 +150,6 @@ def analyze_ci_failures(self, runs: List[Dict]) -> Dict: elif run_status == "skipped": stats["skipped_runs"] += 1 - # Get detailed job information for all runs jobs = self._get_job_details(run_id) run_url = f"https://github.com/{self.repo}/actions/runs/{run_id}" pr_info = self._get_pr_info(run) @@ -167,27 +158,37 @@ def analyze_ci_failures(self, runs: List[Dict]) -> Dict: job_name = job.get("name", "Unknown") job_conclusion = job.get("conclusion", "unknown") - # Filter out non-specific CI jobs and non-CUDA jobs - # Skip meta jobs and AMD/NPU related jobs - if ( - job_name - not in [ - "check-changes", - "pr-test-finish", - "pr-test-h20-finish", - "pr-test-amd-finish", - "pr-test-b200-finish", - "lint", - "Set up job", - ] - and "-amd" not in job_name.lower() - and "mi300" not in job_name.lower() - and "mi325" not in job_name.lower() - and "gfx" not in job_name.lower() - and "-npu" not in job_name.lower() - and "ascend" not in job_name.lower() - ): - # Record successful jobs (update last success) + target_jobs = [ + "check-changes", + "sgl-kernel-build-wheels", + "sgl-kernel-unit-test", + "sgl-kernel-mla-test", + "sgl-kernel-benchmark-test", + "unit-test-frontend", + "unit-test-backend-1-gpu", + "unit-test-backend-2-gpu", + "unit-test-backend-4-gpu", + 
"unit-test-backend-8-gpu-h200", + "unit-test-backend-8-gpu-h20", + "performance-test-1-gpu-part-1", + "performance-test-1-gpu-part-2", + "performance-test-1-gpu-part-3", + "performance-test-2-gpu", + "accuracy-test-1-gpu", + "accuracy-test-2-gpu", + "unit-test-deepep-4-gpu", + "unit-test-deepep-8-gpu", + "unit-test-backend-8-gpu-deepseek-v32", + "unit-test-backend-4-gpu-b200", + "vllm-dependency-test", + "nightly-test-eval-text-models", + "nightly-test-perf-text-models", + "nightly-test-eval-vlms", + "nightly-test-perf-vlms", + "nightly-test-1-gpu", + ] + + if job_name in target_jobs: if job_conclusion == "success": stats["job_last_success"][job_name] = { "url": run_url, @@ -196,11 +197,9 @@ def analyze_ci_failures(self, runs: List[Dict]) -> Dict: "pr_info": pr_info, } - # Record failed jobs elif job_conclusion == "failure": stats["job_failures"][job_name] += 1 - # Store failure link (keep only last 3 for each job) if len(stats["job_failure_links"][job_name]) < 3: stats["job_failure_links"][job_name].append( { @@ -211,7 +210,6 @@ def analyze_ci_failures(self, runs: List[Dict]) -> Dict: } ) - # Categorize failed jobs for category, jobs_list in job_categories.items(): if any( job_pattern in job_name for job_pattern in jobs_list @@ -219,15 +217,13 @@ def analyze_ci_failures(self, runs: List[Dict]) -> Dict: stats["category_failures"][category] += 1 break - # Analyze failure patterns self._analyze_failure_pattern(job, stats) - time.sleep(0.1) # Avoid API rate limits + time.sleep(0.1) return stats def _get_job_details(self, run_id: int) -> List[Dict]: - """Get job details for a specific run""" url = f"{self.base_url}/repos/{self.repo}/actions/runs/{run_id}/jobs" try: response = self.session.get(url) @@ -237,7 +233,6 @@ def _get_job_details(self, run_id: int) -> List[Dict]: return [] def _get_pr_info(self, run: Dict) -> Dict: - """Get PR information from a run""" pr_info = { "pr_number": None, "author": run.get("head_commit", {}) @@ -247,7 +242,6 @@ def _get_pr_info(self, run: Dict) -> Dict: "head_branch": run.get("head_branch", ""), } - # Try to extract PR number from pull_requests pull_requests = run.get("pull_requests", []) if pull_requests: pr_info["pr_number"] = pull_requests[0].get("number") @@ -255,7 +249,6 @@ def _get_pr_info(self, run: Dict) -> Dict: return pr_info def _analyze_failure_pattern(self, job: Dict, stats: Dict): - """Analyze failure patterns (CUDA jobs only)""" job_name = job.get("name", "") steps = job.get("steps", []) @@ -263,7 +256,6 @@ def _analyze_failure_pattern(self, job: Dict, stats: Dict): if step.get("conclusion") == "failure": step_name = step.get("name", "") - # SGLang specific failure pattern recognition (CUDA only) if "timeout" in step_name.lower(): stats["failure_patterns"]["Timeout"] += 1 elif "build" in step_name.lower() or "build" in job_name.lower(): @@ -296,12 +288,10 @@ def _analyze_failure_pattern(self, job: Dict, stats: Dict): stats["failure_patterns"]["Other"] += 1 def generate_report(self, stats: Dict): - """Generate CI analysis report""" print("\n" + "=" * 60) - print("SGLang CI Analysis Report (CUDA Only)") + print("SGLang CI Analysis Report (Target Workflows Only)") print("=" * 60) - # Overall statistics total = stats["total_runs"] failed = stats["failed_runs"] success = stats["successful_runs"] @@ -317,7 +307,6 @@ def generate_report(self, stats: Dict): print(f" Skipped: {skipped}") print(f" Success rate: {success_rate:.1f}%") - # Category failure statistics if stats["category_failures"]: print(f"\nCategory Failure Statistics:") for category, 
count in sorted( @@ -325,7 +314,6 @@ def generate_report(self, stats: Dict): ): print(f" {category}: {count} failures") - # Most frequently failed jobs with links if stats["job_failures"]: print(f"\nMost Frequently Failed Jobs (Top 50):") for i, (job, count) in enumerate( @@ -336,7 +324,6 @@ def generate_report(self, stats: Dict): ): print(f" {i:2d}. {job}: {count} times") - # Show last successful run if job in stats["job_last_success"]: last_success = stats["job_last_success"][job] success_date = datetime.fromisoformat( @@ -356,7 +343,6 @@ def generate_report(self, stats: Dict): f" Last Success: Run #{last_success['run_number']} ({success_date.strftime('%Y-%m-%d %H:%M')}){pr_text}: {last_success['url']}" ) - # Show recent failure links if ( job in stats["job_failure_links"] and stats["job_failure_links"][job] @@ -367,7 +353,6 @@ def generate_report(self, stats: Dict): link_info["created_at"].replace("Z", "+00:00") ) - # Format PR info for failures pr_info = link_info.get("pr_info", {}) pr_text = "" if pr_info.get("pr_number"): @@ -379,7 +364,6 @@ def generate_report(self, stats: Dict): f" - Run #{link_info['run_number']} ({created_at.strftime('%Y-%m-%d %H:%M')}){pr_text}: {link_info['url']}" ) - # Failure pattern analysis if stats["failure_patterns"]: print(f"\nFailure Pattern Analysis:") for pattern, count in sorted( @@ -390,13 +374,11 @@ def generate_report(self, stats: Dict): print("\n" + "=" * 60) def save_detailed_report(self, stats: Dict, output_file: str = "ci_analysis.json"): - """Save detailed report to file""" with open(output_file, "w", encoding="utf-8") as f: json.dump(stats, f, ensure_ascii=False, indent=2) print(f"\nDetailed report saved to: {output_file}") def generate_github_summary(self, stats: Dict): - """Generate GitHub Actions summary""" try: github_step_summary = os.environ.get("GITHUB_STEP_SUMMARY") if not github_step_summary: @@ -406,10 +388,11 @@ def generate_github_summary(self, stats: Dict): print("📊 Generating GitHub Actions summary for CI Analysis...") summary_lines = [] - summary_lines.append("# 🔍 SGLang CI Analysis Report (CUDA Only)") + summary_lines.append( + "# 🔍 SGLang CI Analysis Report (Target Workflows Only)" + ) summary_lines.append("") - # Overall statistics total = stats["total_runs"] failed = stats["failed_runs"] success = stats["successful_runs"] @@ -435,7 +418,6 @@ def generate_github_summary(self, stats: Dict): summary_lines.append(f"| **Success Rate** | **{success_rate:.1f}%** | - |") summary_lines.append("") - # Category failure statistics if stats["category_failures"]: summary_lines.append("## 📁 Category Failure Statistics") summary_lines.append("") @@ -447,7 +429,6 @@ def generate_github_summary(self, stats: Dict): summary_lines.append(f"| {category} | {count} |") summary_lines.append("") - # Most frequently failed jobs (Top 20) if stats["job_failures"]: summary_lines.append("## 🔴 Most Frequently Failed Jobs (Top 20)") summary_lines.append("") @@ -460,7 +441,6 @@ def generate_github_summary(self, stats: Dict): summary_lines.append(f"### {i}. 
`{job}` ({count} failures)") summary_lines.append("") - # Show last successful run if job in stats["job_last_success"]: last_success = stats["job_last_success"][job] success_date = datetime.fromisoformat( @@ -481,7 +461,6 @@ def generate_github_summary(self, stats: Dict): ) summary_lines.append("") - # Show recent failure links if ( job in stats["job_failure_links"] and stats["job_failure_links"][job] @@ -504,7 +483,6 @@ def generate_github_summary(self, stats: Dict): ) summary_lines.append("") - # Failure pattern analysis if stats["failure_patterns"]: summary_lines.append("## 🔬 Failure Pattern Analysis") summary_lines.append("") @@ -516,10 +494,9 @@ def generate_github_summary(self, stats: Dict): summary_lines.append(f"| {pattern} | {count} |") summary_lines.append("") - # Write summary to GitHub Actions with open(github_step_summary, "w", encoding="utf-8") as f: f.write("\n".join(summary_lines)) - f.write("\n\n---\n\n") # Add separator between reports + f.write("\n\n---\n\n") print("✅ GitHub Actions summary generated successfully") @@ -549,12 +526,9 @@ def main(): args = parser.parse_args() - # Create analyzer analyzer = SGLangCIAnalyzer(args.token) try: - # Get CI run data - # Use None for branch if empty string is provided (to scan all branches) branch = args.branch if args.branch else None runs = analyzer.get_recent_runs(args.limit, branch) @@ -562,16 +536,12 @@ def main(): print("No CI run data found") return - # Analyze failures stats = analyzer.analyze_ci_failures(runs) - # Generate report analyzer.generate_report(stats) - # Save detailed report analyzer.save_detailed_report(stats, args.output) - # Generate GitHub summary analyzer.generate_github_summary(stats) except Exception as e: diff --git a/scripts/ci_monitor/ci_analyzer_balance.py b/scripts/ci_monitor/ci_analyzer_balance.py new file mode 100755 index 000000000000..c3a92de09504 --- /dev/null +++ b/scripts/ci_monitor/ci_analyzer_balance.py @@ -0,0 +1,532 @@ +import argparse +import json +import os +import re +import sys +import time +from collections import defaultdict +from datetime import datetime +from typing import Dict, List, Optional, Tuple + +import requests + + +class SGLangTestBalanceAnalyzer: + + def __init__(self, token: str): + self.token = token + self.base_url = "https://api.github.com" + self.repo = "sgl-project/sglang" + self.headers = { + "Authorization": f"token {token}", + "Accept": "application/vnd.github.v3+json", + "User-Agent": "SGLang-Test-Balance-Analyzer/1.0", + } + self.session = requests.Session() + self.session.headers.update(self.headers) + + self.test_time_pattern = re.compile( + r"filename='([^']+)',\s*elapsed=(\d+),\s*estimated_time=(\d+)" + ) + + def get_recent_runs(self, limit: int = 1000) -> List[Dict]: + print(f"Fetching {limit} recent CI runs...") + + all_runs = [] + page = 1 + per_page = 100 + + while len(all_runs) < limit: + url = f"{self.base_url}/repos/{self.repo}/actions/runs" + params = {"per_page": min(per_page, limit - len(all_runs)), "page": page} + + try: + response = self.session.get(url, params=params) + response.raise_for_status() + data = response.json() + + if not data.get("workflow_runs"): + break + + all_runs.extend(data["workflow_runs"]) + print(f"Fetched {len(all_runs)} runs so far...") + + if len(data["workflow_runs"]) < per_page: + break + + page += 1 + time.sleep(0.1) + + except requests.exceptions.RequestException as e: + print(f"Error fetching CI data: {e}") + break + + return all_runs[:limit] + + def get_job_logs(self, run_id: int, job_name: str) -> Optional[str]: 
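+        # Fetch raw log text for a single job, looked up by name within a run.
+        # Returns None when the job is absent or its logs cannot be retrieved.
+        # (The collection loop below works through get_all_jobs_for_run and
+        # get_job_logs_by_id; this name-based variant is a convenience helper.)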
+        try:
+            jobs_url = f"{self.base_url}/repos/{self.repo}/actions/runs/{run_id}/jobs"
+            response = self.session.get(jobs_url)
+            response.raise_for_status()
+            jobs_data = response.json()
+
+            target_job = None
+            for job in jobs_data.get("jobs", []):
+                if job.get("name", "") == job_name:
+                    target_job = job
+                    break
+
+            if not target_job:
+                return None
+
+            logs_url = f"{self.base_url}/repos/{self.repo}/actions/jobs/{target_job['id']}/logs"
+            response = self.session.get(logs_url)
+            response.raise_for_status()
+
+            return response.text
+
+        except Exception as e:
+            if "404" not in str(e):
+                print(f"Failed to get job {job_name} logs: {e}")
+            return None
+
+    def get_all_jobs_for_run(self, run_id: int) -> List[Dict]:
+        # List all jobs recorded for a workflow run.
+        try:
+            jobs_url = f"{self.base_url}/repos/{self.repo}/actions/runs/{run_id}/jobs"
+            response = self.session.get(jobs_url)
+            response.raise_for_status()
+            jobs_data = response.json()
+            return jobs_data.get("jobs", [])
+        except Exception as e:
+            print(f"Failed to get jobs for run {run_id}: {e}")
+            return []
+
+    def get_job_logs_by_id(self, job_id: int) -> Optional[str]:
+        # Download raw log text for a job by its numeric ID; 404s (e.g. expired
+        # logs) are silently ignored.
+        try:
+            logs_url = f"{self.base_url}/repos/{self.repo}/actions/jobs/{job_id}/logs"
+            response = self.session.get(logs_url)
+            response.raise_for_status()
+            return response.text
+        except Exception as e:
+            if "404" not in str(e):
+                print(f"Failed to get job {job_id} logs: {e}")
+            return None
+
+    def parse_test_times(self, log_content: str) -> List[Dict]:
+        # Extract (filename, elapsed, estimated) triples from job logs and
+        # compute the elapsed-minus-estimated gap for each test.
+        if not log_content:
+            return []
+
+        test_times = []
+        matches = self.test_time_pattern.findall(log_content)
+        filtered_count = 0
+
+        for match in matches:
+            filename, elapsed_str, estimated_str = match
+            try:
+                elapsed = int(elapsed_str)
+                estimated = int(estimated_str)
+                gap = elapsed - estimated
+
+                if self._is_abnormal_test_data(
+                    elapsed, estimated, log_content, filename
+                ):
+                    filtered_count += 1
+                    continue
+
+                test_times.append(
+                    {
+                        "filename": filename,
+                        "elapsed": elapsed,
+                        "estimated": estimated,
+                        "gap": gap,
+                    }
+                )
+            except ValueError:
+                continue
+
+        return test_times
+
+    def _is_abnormal_test_data(
+        self, elapsed: int, estimated: int, log_content: str, filename: str
+    ) -> bool:
+
+        # Heuristic retry filter: retried tests report an elapsed time that is
+        # an exact multiple of the estimate, so such entries are dropped. The
+        # estimated > 0 guard avoids a ZeroDivisionError.
+        if estimated > 0 and elapsed % estimated == 0:
+            return True
+
+        return False
+
+    def collect_test_balance_data(self, runs: List[Dict]) -> Dict[str, Dict]:
+        print("Starting test balance data collection...")
+
+        test_gaps = defaultdict(
+            lambda: {
+                "max_gap": 0,
+                "max_elapsed": 0,
+                "max_estimated": 0,
+                "max_gap_run_info": {},
+                "total_runs": 0,
+                "all_gaps": [],
+            }
+        )
+
+        total_tests_parsed = 0
+        abnormal_tests_filtered = 0
+
+        target_job_prefixes = [
+            "unit-test-frontend",
+            "unit-test-backend-1-gpu",
+            "unit-test-backend-2-gpu",
+            "unit-test-backend-4-gpu",
+            "unit-test-backend-8-gpu-h200",
+            "unit-test-backend-8-gpu-h20",
+            "unit-test-backend-4-gpu-b200",
+            "unit-test-deepep-4-gpu",
+            "unit-test-deepep-8-gpu",
+            "unit-test-backend-8-gpu-deepseek-v32",
+            "performance-test-1-gpu-part-1",
+            "performance-test-1-gpu-part-2",
+            "performance-test-1-gpu-part-3",
+            "performance-test-2-gpu",
+            "accuracy-test-1-gpu",
+            "accuracy-test-2-gpu",
+        ]
+
+        total_runs = len(runs)
+        for i, run in enumerate(runs, 1):
+            if i % 10 == 0 or i == total_runs:
+                print(f"Processing run {i}/{total_runs}: #{run.get('run_number')}")
+
+            # Only pr-test GPU jobs are analyzed; skip AMD workflows entirely.
+            workflow_name = run.get("name", "")
+            if "amd" in workflow_name.lower():
+                continue
+
+            run_info = {
+                "run_number": run.get("run_number"),
+                "created_at": run.get("created_at"),
+                "head_sha": run.get("head_sha", "")[:8],
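+                # Commit author from the head commit; defaults to "Unknown".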
"author": run.get("head_commit", {}) + .get("author", {}) + .get("name", "Unknown"), + "url": f"https://github.com/{self.repo}/actions/runs/{run.get('id')}", + } + + pull_requests = run.get("pull_requests", []) + if pull_requests: + run_info["pr_number"] = pull_requests[0].get("number") + + all_jobs = self.get_all_jobs_for_run(run.get("id")) + + for job in all_jobs: + job_name = job.get("name", "") + job_id = job.get("id") + + matches_prefix = False + for prefix in target_job_prefixes: + if job_name.startswith(prefix): + matches_prefix = True + break + + if not matches_prefix: + continue + + logs = self.get_job_logs_by_id(job_id) + if not logs: + continue + + test_times = self.parse_test_times(logs) + total_tests_parsed += len(test_times) + + for test_data in test_times: + filename = test_data["filename"] + elapsed = test_data["elapsed"] + estimated = test_data["estimated"] + gap = test_data["gap"] + + test_stats = test_gaps[filename] + test_stats["total_runs"] += 1 + test_stats["all_gaps"].append(gap) + + if gap > test_stats["max_gap"]: + test_stats["max_gap"] = gap + test_stats["max_elapsed"] = elapsed + test_stats["max_estimated"] = estimated + test_stats["max_gap_run_info"] = { + **run_info, + "job_name": job_name, + "job_url": f"https://github.com/{self.repo}/actions/runs/{run.get('id')}/job/{job_id}", + } + + time.sleep(0.1) + + return dict(test_gaps) + + def generate_balance_report( + self, test_data: Dict[str, Dict], output_file: str = "test_balance_report.json" + ): + print("\n" + "=" * 80) + print("SGLang Test Balance Analysis Report (PR Test GPU Jobs)") + print("=" * 80) + + sorted_tests = sorted( + test_data.items(), key=lambda x: x[1]["max_gap"], reverse=True + ) + + print(f"\nTotal tests analyzed: {len(sorted_tests)}") + print( + f"Tests with significant gaps (>100s): {len([t for t in sorted_tests if t[1]['max_gap'] > 100])}" + ) + print( + f"Tests with large gaps (>300s): {len([t for t in sorted_tests if t[1]['max_gap'] > 300])}" + ) + print( + f"Note: Abnormal test data (due to failures/retries) has been filtered out" + ) + + report_data = { + "summary": { + "total_tests": len(sorted_tests), + "tests_with_gaps_over_100s": len( + [t for t in sorted_tests if t[1]["max_gap"] > 100] + ), + "tests_with_gaps_over_300s": len( + [t for t in sorted_tests if t[1]["max_gap"] > 300] + ), + "analysis_timestamp": datetime.now().isoformat(), + }, + "test_balance_table": [], + } + + print(f"\nTop 50 PR Test GPU Jobs with Largest Time Gaps:") + print("-" * 100) + print( + f"{'Rank':<4} {'Test File':<40} {'Max Gap':<8} {'Max Elapsed':<12} {'Max Estimated':<15} {'Job Name':<25}" + ) + print("-" * 100) + + for i, (filename, stats) in enumerate(sorted_tests[:50], 1): + test_name = filename.split("/")[-1] if "/" in filename else filename + job_name = ( + stats["max_gap_run_info"].get("job_name", "Unknown") + if stats["max_gap_run_info"] + else "Unknown" + ) + + print( + f"{i:<4} {test_name:<40} {stats['max_gap']:<8} {stats['max_elapsed']:<12} {stats['max_estimated']:<15} {job_name:<25}" + ) + + report_data["test_balance_table"].append( + { + "rank": i, + "filename": filename, + "test_name": test_name, + "max_gap": stats["max_gap"], + "max_elapsed": stats["max_elapsed"], + "max_estimated": stats["max_estimated"], + "max_gap_run_info": stats["max_gap_run_info"], + "total_runs": stats["total_runs"], + } + ) + + with open(output_file, "w", encoding="utf-8") as f: + json.dump(report_data, f, ensure_ascii=False, indent=2) + print(f"\nDetailed report saved to: {output_file}") + + return 
report_data

    def generate_github_summary(self, report_data: Dict):
+        try:
+            github_step_summary = os.environ.get("GITHUB_STEP_SUMMARY")
+            if not github_step_summary:
+                print("Not running in GitHub Actions, skipping summary generation")
+                return
+
+            print("Generating GitHub Actions summary for Test Balance Analysis...")
+
+            summary_lines = []
+            summary_lines.append(
+                "# SGLang Test Balance Analysis Report (PR Test GPU Jobs)"
+            )
+            summary_lines.append("")
+            summary_lines.append(
+                f"**Analysis Timestamp:** {report_data['summary']['analysis_timestamp']}"
+            )
+            summary_lines.append("")
+
+            summary_lines.append("## Summary Statistics")
+            summary_lines.append("")
+            summary_lines.append("| Metric | Count |")
+            summary_lines.append("|--------|-------|")
+            summary_lines.append(
+                f"| Total Tests Analyzed | {report_data['summary']['total_tests']} |"
+            )
+            summary_lines.append(
+                f"| Tests with Gaps > 100s | {report_data['summary']['tests_with_gaps_over_100s']} |"
+            )
+            summary_lines.append(
+                f"| Tests with Gaps > 300s | {report_data['summary']['tests_with_gaps_over_300s']} |"
+            )
+            summary_lines.append("")
+
+            summary_lines.append("## Top 30 PR Test GPU Jobs with Largest Time Gaps")
+            summary_lines.append("")
+            summary_lines.append(
+                "| Rank | Test File | Max Gap (s) | Max Elapsed (s) | Max Estimated (s) | Job Name | Job Link | Total Runs |"
+            )
+            summary_lines.append(
+                "|------|-----------|-------------|-----------------|-------------------|----------|----------|------------|"
+            )
+
+            for test in report_data["test_balance_table"][:30]:
+                test_name = test["test_name"]
+                if len(test_name) > 30:
+                    test_name = test_name[:27] + "..."
+
+                job_name = (
+                    test["max_gap_run_info"].get("job_name", "Unknown")
+                    if test["max_gap_run_info"]
+                    else "Unknown"
+                )
+                job_url = (
+                    test["max_gap_run_info"].get("job_url", "")
+                    if test["max_gap_run_info"]
+                    else ""
+                )
+                # Fall back to the plain job name when no URL is available, so
+                # the table never renders an empty markdown link.
+                job_link = f"[{job_name}]({job_url})" if job_url else job_name
+
+                summary_lines.append(
+                    f"| {test['rank']} | `{test_name}` | {test['max_gap']} | {test['max_elapsed']} | {test['max_estimated']} | {job_name} | {job_link} | {test['total_runs']} |"
+                )
+
+            summary_lines.append("")
+            summary_lines.append("## Recommendations")
+            summary_lines.append("")
+            summary_lines.append(
+                "Based on the analysis above, consider adjusting estimated times for tests with large gaps:"
+            )
+            summary_lines.append("")
+
+            top_5_tests = report_data["test_balance_table"][:5]
+            for test in top_5_tests:
+                test_name = test["test_name"]
+                if len(test_name) > 40:
+                    test_name = test_name[:37] + "..."
+                # Heuristic: suggest an estimate 50s above the worst observed
+                # elapsed time as a safety margin.
+                suggested_estimated = test["max_elapsed"] + 50
+                summary_lines.append(
+                    f"- **{test_name}**: Current max elapsed: {test['max_elapsed']}s, suggested estimated: {suggested_estimated}s"
+                )
+
+            summary_lines.append("")
+            summary_lines.append(
+                "Set estimated times to be slightly higher than the maximum observed elapsed time to avoid CI timeouts."
+ ) + + with open(github_step_summary, "w", encoding="utf-8") as f: + f.write("\n".join(summary_lines)) + + print("GitHub Actions summary generated successfully") + + except Exception as e: + print(f"Failed to generate GitHub Actions summary: {e}") + + def save_csv_report( + self, report_data: Dict, output_file: str = "test_balance_report.csv" + ): + import csv + + with open(output_file, "w", encoding="utf-8", newline="") as f: + writer = csv.writer(f) + + writer.writerow( + [ + "Rank", + "Test File", + "Test Name", + "Max Gap (s)", + "Max Elapsed (s)", + "Max Estimated (s)", + "Job Name", + "Max Gap Job URL", + "Total Runs", + ] + ) + + for test in report_data["test_balance_table"]: + max_job_url = ( + test["max_gap_run_info"].get("job_url", "") + if test["max_gap_run_info"] + else "" + ) + job_name = ( + test["max_gap_run_info"].get("job_name", "Unknown") + if test["max_gap_run_info"] + else "Unknown" + ) + + writer.writerow( + [ + test["rank"], + test["filename"], + test["test_name"], + test["max_gap"], + test["max_elapsed"], + test["max_estimated"], + job_name, + max_job_url, + test["total_runs"], + ] + ) + + print(f"CSV report saved to: {output_file}") + + +def main(): + parser = argparse.ArgumentParser(description="SGLang Test Balance Analyzer") + parser.add_argument("--token", required=True, help="GitHub Personal Access Token") + parser.add_argument( + "--limit", + type=int, + default=1000, + help="Number of runs to analyze (default: 1000)", + ) + parser.add_argument( + "--output", + default="test_balance_report.json", + help="Output file (default: test_balance_report.json)", + ) + + args = parser.parse_args() + + analyzer = SGLangTestBalanceAnalyzer(args.token) + + try: + runs = analyzer.get_recent_runs(args.limit) + + if not runs: + print("No CI run data found") + return + + test_data = analyzer.collect_test_balance_data(runs) + + if not test_data: + print("No test balance data found") + return + + report_data = analyzer.generate_balance_report(test_data, args.output) + + csv_output = args.output.replace(".json", ".csv") + analyzer.save_csv_report(report_data, csv_output) + + analyzer.generate_github_summary(report_data) + + except Exception as e: + print(f"Error during analysis: {e}") + import traceback + + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/scripts/ci_monitor/example.sh b/scripts/ci_monitor/example.sh deleted file mode 100755 index abc656fce470..000000000000 --- a/scripts/ci_monitor/example.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -# Example usage of SGLang CI Analyzer - -# IMPORTANT: Get your GitHub token from https://github.com/settings/tokens -# Make sure to select 'repo' and 'workflow' permissions! - -# Basic usage - analyze last 100 runs -python3 ci_analyzer.py --token YOUR_GITHUB_TOKEN - -# Analyze last 1000 runs -python3 ci_analyzer.py --token YOUR_GITHUB_TOKEN --limit 1000 - -# Custom output file -python3 ci_analyzer.py --token YOUR_GITHUB_TOKEN --limit 500 --output my_analysis.json