Skip to content

Commit f736d72

Browse files
committed
fix: detect Execve failures after StartSuccess signal
Signed-off-by: Sanchit2662 <sanchit2662@gmail.com>
1 parent 66f01ee commit f736d72

File tree

3 files changed

+70
-10
lines changed

3 files changed

+70
-10
lines changed

cmd/urunc/start.go

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,16 @@ import (
1919
"errors"
2020
"fmt"
2121
"os"
22+
"time"
2223

2324
"github.com/sirupsen/logrus"
2425
"github.com/urfave/cli/v3"
2526
m "github.com/urunc-dev/urunc/internal/metrics"
2627
"github.com/urunc-dev/urunc/pkg/unikontainers"
2728
)
2829

30+
const execveErrorCheckTimeout = 100 * time.Millisecond
31+
2932
var startCommand = &cli.Command{
3033
Name: "start",
3134
Usage: "executes the user defined process in a created container",
@@ -98,10 +101,10 @@ func startUnikontainer(cmd *cli.Command) error {
98101
}
99102
metrics.Capture(m.TS13)
100103

101-
// wait ContainerStarted message on start.sock from reexec process
102-
err = unikontainer.AwaitMsg(unikontainers.StartSuccess)
104+
// Wait for success, then briefly check for error (catches Execve failures)
105+
err = unikontainer.AwaitMsgWithErrorCheck(unikontainers.StartSuccess, unikontainers.StartErr, execveErrorCheckTimeout)
103106
if err != nil {
104-
err = fmt.Errorf("failed to get message from successful start from reexec: %w", err)
107+
err = fmt.Errorf("failed to start container: %w", err)
105108
return err
106109
}
107110

pkg/unikontainers/ipc.go

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,3 +169,58 @@ func AwaitMessage(listener *net.UnixListener, expectedMessage IPCMessage) error
169169
}
170170
return nil
171171
}
172+
173+
// AwaitMessageWithErrorCheck waits for a success message, then briefly waits for
174+
// a potential error message to catch Execve failures after StartSuccess is sent.
175+
func AwaitMessageWithErrorCheck(listener *net.UnixListener, successMsg, errorMsg IPCMessage, errorCheckTimeout time.Duration) error {
176+
conn, err := listener.AcceptUnix()
177+
if err != nil {
178+
return err
179+
}
180+
defer func() {
181+
closeErr := conn.Close()
182+
if closeErr != nil {
183+
logrus.WithError(closeErr).Error("failed to close connection")
184+
}
185+
}()
186+
187+
bufSize := len(successMsg)
188+
if len(errorMsg) > bufSize {
189+
bufSize = len(errorMsg)
190+
}
191+
192+
buf := make([]byte, bufSize)
193+
n, err := conn.Read(buf)
194+
if err != nil {
195+
return fmt.Errorf("failed to read from socket: %w", err)
196+
}
197+
msg := string(buf[0:n])
198+
199+
if msg == string(errorMsg) {
200+
return fmt.Errorf("container execution failed: received error signal from reexec")
201+
}
202+
203+
if msg != string(successMsg) {
204+
return fmt.Errorf("received unexpected message: %s", msg)
205+
}
206+
207+
// Wait briefly for potential error after success (catches Execve failures)
208+
if errorCheckTimeout > 0 {
209+
if err = conn.SetReadDeadline(time.Now().Add(errorCheckTimeout)); err != nil {
210+
logrus.WithError(err).Warn("failed to set read deadline for error check")
211+
return nil
212+
}
213+
214+
errBuf := make([]byte, len(errorMsg))
215+
n, readErr := conn.Read(errBuf)
216+
if readErr == nil && n > 0 {
217+
errMsg := string(errBuf[0:n])
218+
if errMsg == string(errorMsg) {
219+
return fmt.Errorf("VMM execution failed after signaling start")
220+
}
221+
logrus.WithField("message", errMsg).Warn("received unexpected message after success")
222+
}
223+
}
224+
225+
return nil
226+
}

pkg/unikontainers/unikontainers.go

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
"strings"
2929
"sync"
3030
"syscall"
31+
"time"
3132

3233
"github.com/urunc-dev/urunc/pkg/network"
3334
"github.com/urunc-dev/urunc/pkg/unikontainers/hypervisors"
@@ -493,12 +494,8 @@ func (u *Unikontainer) Exec(metrics m.Writer) error {
493494

494495
uniklog.Debug("calling vmm execve")
495496
metrics.Capture(m.TS18)
496-
// metrics.Wait()
497-
// TODO: We set the state to running and notify urunc Start that the monitor
498-
// started, but we might encounter issues with the monitor execution. We need
499-
// to revisit this and check if a failed monitor execution affects this approach.
500-
// If it affects then we need to re-design the whole spawning of the monitor.
501-
// Notify urunc start
497+
// Notify urunc start. If Execve fails after this, urunc start will detect it
498+
// via AwaitMsgWithErrorCheck which waits briefly for a potential StartErr.
502499
err = u.SendMessage(StartSuccess)
503500
if err != nil {
504501
return err
@@ -1102,11 +1099,16 @@ func (u *Unikontainer) DestroyConn(isReexec bool) error {
11021099
return nil
11031100
}
11041101

1105-
// AwaitMessage waits for a specific message in the listener of unikontainer instance
1102+
// AwaitMsg waits for a specific message in the listener of unikontainer instance
11061103
func (u *Unikontainer) AwaitMsg(msg IPCMessage) error {
11071104
return AwaitMessage(u.Listener, msg)
11081105
}
11091106

1107+
// AwaitMsgWithErrorCheck waits for success, then checks for a follow-up error message
1108+
func (u *Unikontainer) AwaitMsgWithErrorCheck(successMsg, errorMsg IPCMessage, errorTimeout time.Duration) error {
1109+
return AwaitMessageWithErrorCheck(u.Listener, successMsg, errorMsg, errorTimeout)
1110+
}
1111+
11101112
// SendMessage sends message over the active connection
11111113
func (u *Unikontainer) SendMessage(message IPCMessage) error {
11121114
conn := u.Conn

0 commit comments

Comments
 (0)