Skip to content

Commit b1dcf5f

Browse files
Merge pull request #336 from cloudflare/ajansson/fix/gateway-double-spawn-289
fix: prevent gateway double-spawn via port probe safety net
2 parents 9bbf4c9 + 388f61a commit b1dcf5f

9 files changed

Lines changed: 126 additions & 39 deletions

File tree

Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ RUN ARCH="$(dpkg --print-architecture)" \
1212
*) echo "Unsupported architecture: ${ARCH}" >&2; exit 1 ;; \
1313
esac \
1414
&& apt-get update && apt-get install -y xz-utils ca-certificates \
15+
&& rm -rf /usr/local/lib/node_modules /usr/local/bin/node /usr/local/bin/npm /usr/local/bin/npx /usr/local/bin/corepack \
1516
&& curl -fsSLk https://nodejs.org/dist/v${NODE_VERSION}/node-v${NODE_VERSION}-linux-${NODE_ARCH}.tar.xz -o /tmp/node.tar.xz \
1617
&& rm -rf /usr/local/lib/node_modules /usr/local/bin/node /usr/local/bin/npm /usr/local/bin/npx /usr/local/bin/corepack \
1718
&& tar -xJf /tmp/node.tar.xz -C /usr/local --strip-components=1 \

src/gateway/process.test.ts

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import { describe, it, expect, vi } from 'vitest';
2-
import { findExistingGatewayProcess } from './process';
2+
import { findExistingGatewayProcess, isGatewayPortOpen } from './process';
33
import type { Sandbox, Process } from '@cloudflare/sandbox';
4-
import { createMockSandbox } from '../test-utils';
4+
import { createMockSandbox, createMockExecResult } from '../test-utils';
55

66
function createFullMockProcess(overrides: Partial<Process> = {}): Process {
77
return {
@@ -143,3 +143,29 @@ describe('findExistingGatewayProcess', () => {
143143
expect(result).toBeNull();
144144
});
145145
});
146+
147+
describe('isGatewayPortOpen', () => {
148+
it('returns true when port is open (nc exits 0)', async () => {
149+
const { sandbox, execMock } = createMockSandbox();
150+
execMock.mockResolvedValue(createMockExecResult('', { exitCode: 0 }));
151+
152+
const result = await isGatewayPortOpen(sandbox);
153+
expect(result).toBe(true);
154+
expect(execMock).toHaveBeenCalledWith('nc -z localhost 18789');
155+
});
156+
157+
it('returns false when port is closed (nc exits non-zero)', async () => {
158+
const { sandbox, execMock } = createMockSandbox();
159+
execMock.mockResolvedValue(createMockExecResult('', { exitCode: 1 }));
160+
161+
const result = await isGatewayPortOpen(sandbox);
162+
expect(result).toBe(false);
163+
});
164+
165+
it('propagates errors from sandbox.exec', async () => {
166+
const { sandbox, execMock } = createMockSandbox();
167+
execMock.mockRejectedValue(new Error('container not ready'));
168+
169+
await expect(isGatewayPortOpen(sandbox)).rejects.toThrow('container not ready');
170+
});
171+
});

src/gateway/process.ts

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,15 @@ import type { OpenClawEnv } from '../types';
33
import { GATEWAY_PORT, STARTUP_TIMEOUT_MS } from '../config';
44
import { buildEnvVars } from './env';
55

6+
/**
7+
* Check if the gateway port is already listening via a TCP probe.
8+
* Used as a safety net when listProcesses() fails to detect the gateway.
9+
*/
10+
export async function isGatewayPortOpen(sandbox: Sandbox): Promise<boolean> {
11+
const result = await sandbox.exec(`nc -z localhost ${GATEWAY_PORT}`);
12+
return result.exitCode === 0;
13+
}
14+
615
/**
716
* Find an existing OpenClaw gateway process
817
*
@@ -50,9 +59,10 @@ export async function findExistingGatewayProcess(sandbox: Sandbox): Promise<Proc
5059
*
5160
* @param sandbox - The sandbox instance
5261
* @param env - Worker environment bindings
53-
* @returns The running gateway process
62+
* @returns The running gateway process, or null if the gateway is up but we
63+
* don't have a process handle (detected via port probe only)
5464
*/
55-
export async function ensureGateway(sandbox: Sandbox, env: OpenClawEnv): Promise<Process> {
65+
export async function ensureGateway(sandbox: Sandbox, env: OpenClawEnv): Promise<Process | null> {
5666
// Check if gateway is already running or starting
5767
const existingProcess = await findExistingGatewayProcess(sandbox);
5868
if (existingProcess) {
@@ -83,6 +93,20 @@ export async function ensureGateway(sandbox: Sandbox, env: OpenClawEnv): Promise
8393
}
8494
}
8595

96+
// Safety net: the process wasn't found by listProcesses() (e.g. the command
97+
// string didn't match any known pattern), but the gateway may still be running.
98+
// Probe the port directly — if it's open, the gateway is up and we're done.
99+
try {
100+
if (await isGatewayPortOpen(sandbox)) {
101+
console.log(
102+
`Port ${GATEWAY_PORT} already open — gateway running but undetected by listProcesses(), skipping spawn`,
103+
);
104+
return null;
105+
}
106+
} catch (e) {
107+
console.log('Port probe failed, proceeding to start gateway:', e);
108+
}
109+
86110
// Start a new OpenClaw gateway
87111
console.log('Starting new OpenClaw gateway...');
88112
const envVars = buildEnvVars(env);

src/index.ts

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -139,13 +139,11 @@ app.use('*', async (c, next) => {
139139
const sandbox = getSandbox(c.env.Sandbox, 'openclaw', options);
140140
c.set('sandbox', sandbox);
141141

142-
// Restore from backup on first access (idempotent, once per Worker isolate)
143-
try {
144-
await restoreIfNeeded(sandbox, c.env.BACKUP_BUCKET);
145-
} catch (err) {
146-
console.error('[middleware] Backup restore failed:', err);
147-
// Continue anyway — fresh container is better than no container
148-
}
142+
// NOTE: restoreIfNeeded is NOT called here in the global middleware.
143+
// It's called only from the catch-all route (gateway proxy) and /api/status.
144+
// Calling it on admin routes (sync, debug/cli) would mount a FUSE overlay
145+
// that interferes with createBackup — the SDK resets the overlay on backup,
146+
// wiping any upper-layer writes made since the last restore.
149147

150148
await next();
151149
});
@@ -243,6 +241,16 @@ app.all('*', async (c) => {
243241

244242
console.log('[PROXY] Handling request:', url.pathname);
245243

244+
// Restore from backup before starting the gateway.
245+
// This is only called here (catch-all) and from /api/status — NOT from admin
246+
// routes like sync or debug/cli, because the SDK resets the FUSE overlay on
247+
// createBackup, wiping upper-layer writes.
248+
try {
249+
await restoreIfNeeded(sandbox, c.env.BACKUP_BUCKET);
250+
} catch (err) {
251+
console.error('[PROXY] Backup restore failed:', err);
252+
}
253+
246254
// Check if gateway is already running
247255
const existingProcess = await findExistingGatewayProcess(sandbox);
248256
const isGatewayReady = existingProcess !== null && existingProcess.status === 'running';

src/persistence.ts

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,12 @@ async function deleteHandle(bucket: R2Bucket): Promise<void> {
2828

2929
/**
3030
* Restore the most recent backup if one exists and hasn't been restored yet.
31-
* Called on every request before proxying to the gateway.
31+
*
32+
* IMPORTANT: This must only be called from the catch-all route (gateway proxy)
33+
* and /api/status — NOT from admin routes like sync or debug/cli. The Sandbox
34+
* SDK's createBackup() resets the FUSE overlay, wiping any upper-layer writes.
35+
* If restoreIfNeeded mounts an overlay before createBackup runs, the backup
36+
* will lose files written to the upper layer.
3237
*
3338
* The backup handle is read from R2 (persisted across Worker isolate restarts).
3439
* An in-memory flag prevents redundant restores within the same isolate.
@@ -43,6 +48,16 @@ export async function restoreIfNeeded(sandbox: Sandbox, bucket: R2Bucket): Promi
4348
return;
4449
}
4550

51+
// Unmount any existing FUSE overlay before restoring. If the Worker isolate
52+
// recycled, a previous restore's overlay may still be mounted with stale
53+
// upper-layer state (e.g. deleted files via whiteout entries). A fresh
54+
// mount from the backup gives us a clean lower layer.
55+
try {
56+
await sandbox.exec(`umount ${BACKUP_DIR} 2>/dev/null; true`);
57+
} catch {
58+
// May not be mounted
59+
}
60+
4661
console.log(`[persistence] Restoring backup ${handle.id}...`);
4762
const t0 = Date.now();
4863
try {

src/routes/api.ts

Lines changed: 29 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -248,28 +248,38 @@ adminApi.post('/gateway/restart', async (c) => {
248248
// Find and kill the existing gateway process
249249
const existingProcess = await findExistingGatewayProcess(sandbox);
250250

251+
// Kill via the Process API first
252+
if (existingProcess) {
253+
console.log('[Restart] Killing via Process API:', existingProcess.id);
254+
try {
255+
await existingProcess.kill();
256+
} catch {
257+
// Ignore
258+
}
259+
}
260+
251261
// Force kill the gateway via exec — more reliable than Process.kill()
252262
// because start-openclaw.sh execs into "openclaw" which forks
253263
// "openclaw-gateway". Process.kill() only kills the tracked shell PID,
254264
// but the forked child keeps port 18789. (Credit: dalexeenko #261)
255265
//
256-
// The actual process names are "openclaw" and "openclaw-gateway" (hyphenated).
266+
// Use multiple strategies since we don't know what tools are available.
257267
try {
258-
await sandbox.exec(
259-
'kill -9 $(pgrep -x "openclaw-gateway" 2>/dev/null) $(pgrep -x "openclaw" 2>/dev/null) 2>/dev/null; true',
268+
const killResult = await sandbox.exec(
269+
[
270+
// Strategy 1: pgrep by exact name (most precise)
271+
'kill -9 $(pgrep -x "openclaw-gateway" 2>/dev/null) $(pgrep -x "openclaw" 2>/dev/null) 2>/dev/null',
272+
// Strategy 2: pkill by pattern (broader match)
273+
'pkill -9 -f "openclaw" 2>/dev/null',
274+
// Strategy 3: find by port (most reliable but needs ss/fuser)
275+
'kill -9 $(ss -tlnp sport = :18789 2>/dev/null | grep -oP "pid=\\K[0-9]+") 2>/dev/null',
276+
// Always succeed
277+
'true',
278+
].join('; '),
260279
);
261-
} catch {
262-
// Process may not exist or pgrep not available
263-
}
264-
265-
// Also kill via the Process API for completeness
266-
if (existingProcess) {
267-
console.log('Also killing via Process API:', existingProcess.id);
268-
try {
269-
await existingProcess.kill();
270-
} catch {
271-
// Ignore
272-
}
280+
console.log('[Restart] Kill result:', killResult.stdout?.trim(), killResult.stderr?.trim());
281+
} catch (e) {
282+
console.error('[Restart] Kill exec failed:', e);
273283
}
274284

275285
// Clean up lock files that prevent restart
@@ -281,13 +291,11 @@ adminApi.post('/gateway/restart', async (c) => {
281291
// Ignore
282292
}
283293

284-
// Wait for process to fully die and verify port is free
285-
await new Promise((r) => setTimeout(r, 2000));
294+
// Wait for process to fully die and verify
295+
await new Promise((r) => setTimeout(r, 3000));
286296
try {
287-
const check = await sandbox.exec(
288-
'pgrep -x "openclaw-gateway" && echo "STILL ALIVE" || echo "dead"',
289-
);
290-
console.log('[Restart] Process check after kill:', check.stdout?.trim());
297+
const check = await sandbox.exec('ps aux | grep -v grep | grep openclaw || echo "ALL DEAD"');
298+
console.log('[Restart] Surviving processes:', check.stdout?.trim());
291299
} catch {
292300
// Ignore
293301
}

src/routes/public.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import { Hono } from 'hono';
22
import type { AppEnv } from '../types';
33
import { GATEWAY_PORT } from '../config';
44
import { findExistingGatewayProcess, ensureGateway } from '../gateway';
5+
import { restoreIfNeeded } from '../persistence';
56

67
/**
78
* Public routes - NO Cloudflare Access authentication required
@@ -34,6 +35,13 @@ publicRoutes.get('/logo-small.png', (c) => {
3435
publicRoutes.get('/api/status', async (c) => {
3536
const sandbox = c.get('sandbox');
3637

38+
// Restore from backup before checking/starting the gateway
39+
try {
40+
await restoreIfNeeded(sandbox, c.env.BACKUP_BUCKET);
41+
} catch (err) {
42+
console.error('[api/status] Backup restore failed:', err);
43+
}
44+
3745
try {
3846
let process = await findExistingGatewayProcess(sandbox);
3947
if (!process) {

start-openclaw.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,9 @@ config.gateway.port = 18789;
8888
config.gateway.mode = 'local';
8989
config.gateway.trustedProxies = ['10.1.0.0'];
9090
91+
config.gateway.controlUi = config.gateway.controlUi || {};
92+
config.gateway.controlUi.allowedOrigins = ['*'];
93+
9194
if (process.env.OPENCLAW_GATEWAY_TOKEN) {
9295
config.gateway.auth = config.gateway.auth || {};
9396
config.gateway.auth.token = process.env.OPENCLAW_GATEWAY_TOKEN;

test/e2e/r2_persistence.txt

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,6 @@ create workspace marker file
6060
%require
6161
===
6262
WORKER_URL=$(cat "$CCTR_FIXTURE_DIR/worker-url.txt")
63-
# Write directly to /home/openclaw (the backup dir) rather than via
64-
# the /root/clawd symlink, to avoid FUSE overlay resolution issues.
6563
./curl-auth -s "$WORKER_URL/debug/cli?cmd=echo+e2e-persistence-test+%3E+/home/openclaw/clawd/e2e-marker.txt+%26%26+cat+/home/openclaw/clawd/e2e-marker.txt" | jq .
6664
---
6765
{{ result: json object }}
@@ -86,7 +84,6 @@ delete marker file locally and confirm gone
8684
%require
8785
===
8886
WORKER_URL=$(cat "$CCTR_FIXTURE_DIR/worker-url.txt")
89-
# Use rm -f and verify absence separately to tolerate FUSE overlay quirks
9087
./curl-auth -s "$WORKER_URL/debug/cli?cmd=rm+-f+/home/openclaw/clawd/e2e-marker.txt+%26%26+test+!+-f+/home/openclaw/clawd/e2e-marker.txt+%26%26+echo+deleted-and-gone" | jq .
9188
---
9289
{{ result: json object }}
@@ -111,9 +108,6 @@ wait for gateway to restart after restore
111108
%require
112109
===
113110
WORKER_URL=$(cat "$CCTR_FIXTURE_DIR/worker-url.txt")
114-
# The restart handler kills the old process (waiting for it to die) and
115-
# clears the persistence cache. Hitting /api/status starts a new gateway
116-
# (after running restoreIfNeeded). Poll until the gateway is up.
117111
for i in $(seq 1 30); do
118112
STATUS=$(./curl-auth -s "$WORKER_URL/api/status" 2>/dev/null || echo "")
119113
OK=$(echo "$STATUS" | jq -r '.ok // false' 2>/dev/null)

0 commit comments

Comments
 (0)