Skip to content

Commit 4600691

Browse files
authored
fix(supervisor): triage PR #392 review feedback - stderr, sort, deploy errors (t147.2) (#451)
* chore: mark t142 complete in TODO.md (#449) * chore: mark t135.9 complete in TODO.md (#448) * fix(supervisor): triage PR #392 review feedback - stderr, sort, deploy errors (t147.2) - Add post-PR states to ORDER BY CASE in cmd_status and cmd_list - Add 300s timeout to deploy setup.sh with portable fallback - Deploy failures now transition to 'failed' instead of silently marking 'deployed' - Redirect cmd_pr_lifecycle output to post-pr.log instead of /dev/null - Fix missing $SUPERVISOR_DB arg in db() calls for no_pr retry counter - Remove unused no_pr_key variable Dismissed 3 of 6 review threads as already fixed or invalid: - gh pr merge stderr: already uses 2>&1 - Fallback PR lookup: already uses git -C and gh pr list --repo - Missing pr_review:deployed transition: not needed, complete:deployed handles it
1 parent 474abc8 commit 4600691

1 file changed

Lines changed: 70 additions & 23 deletions

File tree

.agents/scripts/supervisor-helper.sh

Lines changed: 70 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -873,10 +873,15 @@ cmd_status() {
873873
WHEN 'evaluating' THEN 3
874874
WHEN 'retrying' THEN 4
875875
WHEN 'queued' THEN 5
876-
WHEN 'blocked' THEN 6
877-
WHEN 'failed' THEN 7
878-
WHEN 'complete' THEN 8
879-
WHEN 'cancelled' THEN 9
876+
WHEN 'pr_review' THEN 6
877+
WHEN 'merging' THEN 7
878+
WHEN 'deploying' THEN 8
879+
WHEN 'blocked' THEN 9
880+
WHEN 'failed' THEN 10
881+
WHEN 'complete' THEN 11
882+
WHEN 'merged' THEN 12
883+
WHEN 'deployed' THEN 13
884+
WHEN 'cancelled' THEN 14
880885
END;
881886
" 2>/dev/null | while IFS=': ' read -r state count; do
882887
local color="$NC"
@@ -1082,10 +1087,15 @@ cmd_list() {
10821087
WHEN 'evaluating' THEN 3
10831088
WHEN 'retrying' THEN 4
10841089
WHEN 'queued' THEN 5
1085-
WHEN 'blocked' THEN 6
1086-
WHEN 'failed' THEN 7
1087-
WHEN 'complete' THEN 8
1088-
WHEN 'cancelled' THEN 9
1090+
WHEN 'pr_review' THEN 6
1091+
WHEN 'merging' THEN 7
1092+
WHEN 'deploying' THEN 8
1093+
WHEN 'blocked' THEN 9
1094+
WHEN 'failed' THEN 10
1095+
WHEN 'complete' THEN 11
1096+
WHEN 'merged' THEN 12
1097+
WHEN 'deployed' THEN 13
1098+
WHEN 'cancelled' THEN 14
10891099
END, t.created_at DESC;
10901100
")
10911101

@@ -2969,12 +2979,45 @@ run_deploy_for_task() {
29692979
return 0
29702980
fi
29712981

2972-
log_info "Running setup.sh for $task_id..."
2973-
local deploy_output
2974-
if ! deploy_output=$(cd "$repo" && ./setup.sh 2>&1); then
2975-
log_warn "Deploy (setup.sh) returned non-zero for $task_id. Output:"
2976-
log_warn "$deploy_output"
2977-
return 1
2982+
log_info "Running setup.sh for $task_id (timeout: 300s)..."
2983+
local deploy_output deploy_log
2984+
deploy_log="$SUPERVISOR_DIR/logs/${task_id}-deploy-$(date +%Y%m%d%H%M%S).log"
2985+
mkdir -p "$SUPERVISOR_DIR/logs"
2986+
2987+
# Portable timeout: prefer timeout/gtimeout, fall back to background+kill
2988+
local timeout_cmd=""
2989+
if command -v timeout &>/dev/null; then
2990+
timeout_cmd="timeout 300"
2991+
elif command -v gtimeout &>/dev/null; then
2992+
timeout_cmd="gtimeout 300"
2993+
fi
2994+
2995+
if [[ -n "$timeout_cmd" ]]; then
2996+
if ! deploy_output=$(cd "$repo" && $timeout_cmd ./setup.sh 2>&1); then
2997+
log_warn "Deploy (setup.sh) returned non-zero for $task_id (see $deploy_log)"
2998+
echo "$deploy_output" > "$deploy_log" 2>/dev/null || true
2999+
return 1
3000+
fi
3001+
else
3002+
# Fallback: background process with manual timeout
3003+
(cd "$repo" && ./setup.sh > "$deploy_log" 2>&1) &
3004+
local deploy_pid=$!
3005+
local waited=0
3006+
while kill -0 "$deploy_pid" 2>/dev/null && [[ "$waited" -lt 300 ]]; do
3007+
sleep 5
3008+
waited=$((waited + 5))
3009+
done
3010+
if kill -0 "$deploy_pid" 2>/dev/null; then
3011+
kill "$deploy_pid" 2>/dev/null || true
3012+
log_warn "Deploy (setup.sh) timed out after 300s for $task_id (see $deploy_log)"
3013+
return 1
3014+
fi
3015+
if ! wait "$deploy_pid"; then
3016+
deploy_output=$(cat "$deploy_log" 2>/dev/null || echo "")
3017+
log_warn "Deploy (setup.sh) returned non-zero for $task_id (see $deploy_log)"
3018+
return 1
3019+
fi
3020+
deploy_output=$(cat "$deploy_log" 2>/dev/null || echo "")
29783021
fi
29793022
log_success "Deploy complete for $task_id"
29803023
return 0
@@ -3163,9 +3206,8 @@ cmd_pr_lifecycle() {
31633206
;;
31643207
no_pr)
31653208
# Track consecutive no_pr failures to avoid infinite retry loop
3166-
local no_pr_key="no_pr_retries_${task_id}"
31673209
local no_pr_count
3168-
no_pr_count=$(db "SELECT COALESCE(
3210+
no_pr_count=$(db "$SUPERVISOR_DB" "SELECT COALESCE(
31693211
(SELECT CAST(json_extract(error, '$.no_pr_retries') AS INTEGER)
31703212
FROM tasks WHERE id='$task_id'), 0);" 2>/dev/null || echo "0")
31713213
no_pr_count=$((no_pr_count + 1))
@@ -3183,7 +3225,7 @@ cmd_pr_lifecycle() {
31833225

31843226
log_warn "No PR found for $task_id (attempt $no_pr_count/5)"
31853227
# Store retry count in error field as JSON
3186-
db "UPDATE tasks SET error = json_set(COALESCE(error, '{}'), '$.no_pr_retries', $no_pr_count), updated_at = strftime('%Y-%m-%dT%H:%M:%SZ','now') WHERE id='$task_id';" 2>/dev/null || true
3228+
db "$SUPERVISOR_DB" "UPDATE tasks SET error = json_set(COALESCE(error, '{}'), '$.no_pr_retries', $no_pr_count), updated_at = strftime('%Y-%m-%dT%H:%M:%SZ','now') WHERE id='$task_id';" 2>/dev/null || true
31873229
return 0
31883230
;;
31893231
esac
@@ -3210,16 +3252,21 @@ cmd_pr_lifecycle() {
32103252
if [[ "$dry_run" == "false" ]]; then
32113253
cmd_transition "$task_id" "deploying" || log_warn "Failed to transition $task_id to deploying"
32123254

3213-
# Pull main and run postflight
3255+
# Pull main and run postflight (non-blocking: verification only)
32143256
run_postflight_for_task "$task_id" "$trepo" || log_warn "Postflight issue for $task_id (non-blocking)"
32153257

3216-
# Deploy (aidevops repos only)
3217-
run_deploy_for_task "$task_id" "$trepo" || log_warn "Deploy issue for $task_id (non-blocking)"
3258+
# Deploy (aidevops repos only) - failure blocks deployed transition
3259+
if ! run_deploy_for_task "$task_id" "$trepo"; then
3260+
log_error "Deploy failed for $task_id - transitioning to failed"
3261+
cmd_transition "$task_id" "failed" --error "Deploy (setup.sh) failed" 2>/dev/null || true
3262+
send_task_notification "$task_id" "failed" "Deploy failed after merge" 2>/dev/null || true
3263+
return 1
3264+
fi
32183265

3219-
# Clean up worktree and branch
3266+
# Clean up worktree and branch (non-blocking: housekeeping)
32203267
cleanup_after_merge "$task_id" || log_warn "Worktree cleanup issue for $task_id (non-blocking)"
32213268

3222-
# Update TODO.md
3269+
# Update TODO.md (non-blocking: housekeeping)
32233270
update_todo_on_complete "$task_id" || log_warn "TODO.md update issue for $task_id (non-blocking)"
32243271

32253272
# Final transition
@@ -3279,7 +3326,7 @@ process_post_pr_lifecycle() {
32793326
fi
32803327

32813328
log_info " $tid: processing post-PR lifecycle (status: $tstatus)"
3282-
if cmd_pr_lifecycle "$tid" 2>/dev/null; then
3329+
if cmd_pr_lifecycle "$tid" >> "$SUPERVISOR_DIR/post-pr.log" 2>&1; then
32833330
local new_status
32843331
new_status=$(db "$SUPERVISOR_DB" "SELECT status FROM tasks WHERE id = '$(sql_escape "$tid")';" 2>/dev/null || echo "")
32853332
case "$new_status" in

0 commit comments

Comments
 (0)