Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ func (a *APIStore) PostAdminTeamsTeamIDSandboxesKill(c *gin.Context, teamID uuid
// Kill each sandbox
for _, sbx := range sandboxes {
wg.Go(func() error {
err := a.orchestrator.RemoveSandbox(ctx, sbx, sandbox.StateActionKill)
err := a.orchestrator.RemoveSandbox(ctx, sbx.TeamID, sbx.SandboxID, sandbox.StateActionKill)
if err != nil {
logger.L().Error(ctx, "Failed to kill sandbox",
logger.WithSandboxID(sbx.SandboxID),
Expand Down
42 changes: 15 additions & 27 deletions packages/api/internal/handlers/sandbox_kill.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,33 +66,21 @@ func (a *APIStore) DeleteSandboxesSandboxID(

killedOrRemoved := false

sbx, err := a.orchestrator.GetSandbox(ctx, teamID, sandboxID)
if err == nil {
if sbx.TeamID != teamID {
logger.L().Debug(ctx, "Sandbox team mismatch on kill", logger.WithSandboxID(sandboxID), logger.WithTeamID(teamID.String()))
a.sendAPIStoreError(c, http.StatusNotFound, sandboxNotFoundMsg(sandboxID))

return
}

err = a.orchestrator.RemoveSandbox(ctx, sbx, sandbox.StateActionKill)
switch {
case err == nil:
killedOrRemoved = true
case errors.Is(err, orchestrator.ErrSandboxNotFound):
logger.L().Debug(ctx, "Sandbox not found", logger.WithSandboxID(sandboxID))
case errors.Is(err, orchestrator.ErrSandboxOperationFailed):
a.sendAPIStoreError(c, http.StatusInternalServerError, fmt.Sprintf("Error killing sandbox: %s", err))

return
default:
telemetry.ReportError(ctx, "error killing sandbox", err)
a.sendAPIStoreError(c, http.StatusInternalServerError, fmt.Sprintf("Error killing sandbox: %s", err))

return
}
} else {
logger.L().Debug(ctx, "Sandbox not found", logger.WithSandboxID(sandboxID))
err = a.orchestrator.RemoveSandbox(ctx, teamID, sandboxID, sandbox.StateActionKill)
switch {
case err == nil:
killedOrRemoved = true
case errors.Is(err, orchestrator.ErrSandboxNotFound):
logger.L().Debug(ctx, "Running sandbox not found", logger.WithSandboxID(sandboxID))
case errors.Is(err, orchestrator.ErrSandboxOperationFailed):
a.sendAPIStoreError(c, http.StatusInternalServerError, fmt.Sprintf("Error killing sandbox: %s", err))

return
default:
telemetry.ReportError(ctx, "error killing sandbox", err)
a.sendAPIStoreError(c, http.StatusInternalServerError, fmt.Sprintf("Error killing sandbox: %s", err))

return
}

// remove any snapshots when the sandbox is not running
Expand Down
18 changes: 1 addition & 17 deletions packages/api/internal/handlers/sandbox_pause.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,23 +40,7 @@ func (a *APIStore) PostSandboxesSandboxIDPause(c *gin.Context, sandboxID api.San
traceID := span.SpanContext().TraceID().String()
c.Set("traceID", traceID)

sbx, err := a.orchestrator.GetSandbox(ctx, teamID, sandboxID)
if err != nil {
apiErr := pauseHandleNotRunningSandbox(ctx, a.sqlcDB, sandboxID, teamID)
a.sendAPIStoreError(c, apiErr.Code, apiErr.ClientMsg)

return
}

if sbx.TeamID != teamID {
logger.L().Debug(ctx, "Sandbox team mismatch on pause", logger.WithSandboxID(sandboxID), logger.WithTeamID(teamID.String()))
a.sendAPIStoreError(c, http.StatusNotFound, sandboxNotFoundMsg(sandboxID))

return
}

err = a.orchestrator.RemoveSandbox(ctx, sbx, sandbox.StateActionPause)

err = a.orchestrator.RemoveSandbox(ctx, teamID, sandboxID, sandbox.StateActionPause)
var transErr *sandbox.InvalidStateTransitionError

switch {
Expand Down
19 changes: 3 additions & 16 deletions packages/api/internal/handlers/snapshot_template_create.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,9 @@ func (a *APIStore) PostSandboxesSandboxIDSnapshots(c *gin.Context, sandboxID api
opts.Namespace = &teamInfo.Slug
}

sbx, err := a.orchestrator.GetSandbox(ctx, teamID, sandboxID)
telemetry.ReportEvent(ctx, "Creating snapshot template")

result, err := a.orchestrator.CreateSnapshotTemplate(ctx, teamID, sandboxID, opts)
if err != nil {
var notFoundErr *sandbox.NotFoundError
if errors.As(err, &notFoundErr) {
Expand All @@ -104,22 +106,7 @@ func (a *APIStore) PostSandboxesSandboxIDSnapshots(c *gin.Context, sandboxID api

return
}
a.sendAPIStoreError(c, http.StatusInternalServerError, "Error getting sandbox")

return
}

if sbx.TeamID != teamID {
logger.L().Debug(ctx, "Sandbox team mismatch on snapshot", logger.WithSandboxID(sandboxID), logger.WithTeamID(teamID.String()))
a.sendAPIStoreError(c, http.StatusNotFound, sandboxNotFoundMsg(sandboxID))

return
}

telemetry.ReportEvent(ctx, "Creating snapshot template")

result, err := a.orchestrator.CreateSnapshotTemplate(ctx, teamID, sandboxID, opts)
if err != nil {
var transErr *sandbox.InvalidStateTransitionError
if errors.As(err, &transErr) {
a.sendAPIStoreError(c, http.StatusConflict, fmt.Sprintf("Sandbox '%s' cannot be snapshotted while in '%s' state", sandboxID, transErr.CurrentState))
Expand Down
5 changes: 3 additions & 2 deletions packages/api/internal/orchestrator/analytics.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"time"

"github.com/google/uuid"
"github.com/posthog/posthog-go"
"go.opentelemetry.io/otel/metric"
"go.uber.org/zap"
Expand Down Expand Up @@ -89,6 +90,6 @@ func (o *Orchestrator) sandboxCounterInsert(ctx context.Context, sandbox sandbox
o.sandboxCounter.Add(ctx, 1, metric.WithAttributes(telemetry.WithTeamID(sandbox.TeamID.String())))
}

func (o *Orchestrator) countersRemove(ctx context.Context, sandbox sandbox.Sandbox, _ sandbox.StateAction) {
o.sandboxCounter.Add(ctx, -1, metric.WithAttributes(telemetry.WithTeamID(sandbox.TeamID.String())))
func (o *Orchestrator) countersRemove(ctx context.Context, teamID uuid.UUID, _ sandbox.StateAction) {
o.sandboxCounter.Add(ctx, -1, metric.WithAttributes(telemetry.WithTeamID(teamID.String())))
}
24 changes: 19 additions & 5 deletions packages/api/internal/orchestrator/delete_instance.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"errors"
"fmt"

"github.com/google/uuid"
"go.uber.org/zap"

"github.com/e2b-dev/infra/packages/api/internal/sandbox"
Expand All @@ -14,15 +15,21 @@ import (
sbxlogger "github.com/e2b-dev/infra/packages/shared/pkg/logger/sandbox"
)

func (o *Orchestrator) RemoveSandbox(ctx context.Context, sbx sandbox.Sandbox, stateAction sandbox.StateAction) error {
func (o *Orchestrator) RemoveSandbox(ctx context.Context, teamID uuid.UUID, sandboxID string, stateAction sandbox.StateAction) error {
ctx, span := tracer.Start(ctx, "remove-sandbox")
defer span.End()

sandboxID := sbx.SandboxID
alreadyDone, finish, err := o.sandboxStore.StartRemoving(ctx, sbx.TeamID, sandboxID, stateAction)
sbx, alreadyDone, finish, err := o.sandboxStore.StartRemoving(ctx, teamID, sandboxID, stateAction)
if err != nil {
switch stateAction {
case sandbox.StateActionKill:
var notFoundErr *sandbox.NotFoundError
if errors.As(err, &notFoundErr) {
logger.L().Info(ctx, "Sandbox not found, already removed", logger.WithSandboxID(sandboxID))

return ErrSandboxNotFound
}

switch sbx.State {
case sandbox.StateKilling:
logger.L().Info(ctx, "Sandbox is already killed", logger.WithSandboxID(sandboxID))
Expand All @@ -34,6 +41,13 @@ func (o *Orchestrator) RemoveSandbox(ctx context.Context, sbx sandbox.Sandbox, s
return ErrSandboxOperationFailed
}
case sandbox.StateActionPause:
var notFoundErrPause *sandbox.NotFoundError
if errors.As(err, &notFoundErrPause) {
logger.L().Info(ctx, "Sandbox not found for pause", logger.WithSandboxID(sandboxID))

return ErrSandboxNotFound
}

var transErr *sandbox.InvalidStateTransitionError
if errors.As(err, &transErr) {
if transErr.CurrentState == sandbox.StateKilling {
Expand Down Expand Up @@ -64,9 +78,9 @@ func (o *Orchestrator) RemoveSandbox(ctx context.Context, sbx sandbox.Sandbox, s
return nil
}

defer func() { go o.countersRemove(context.WithoutCancel(ctx), sbx, stateAction) }()
defer func() { go o.countersRemove(context.WithoutCancel(ctx), teamID, stateAction) }()
defer func() { go o.analyticsRemove(context.WithoutCancel(ctx), sbx, stateAction) }()
defer o.sandboxStore.Remove(ctx, sbx.TeamID, sbx.SandboxID)
defer o.sandboxStore.Remove(ctx, teamID, sandboxID)
err = o.removeSandboxFromNode(ctx, sbx, stateAction)
if err != nil {
logger.L().Error(ctx, "Error pausing sandbox", zap.Error(err), logger.WithSandboxID(sbx.SandboxID))
Expand Down
7 changes: 4 additions & 3 deletions packages/api/internal/orchestrator/evictor/evict.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"time"

"github.com/google/uuid"
"go.uber.org/zap"
"golang.org/x/sync/errgroup"

Expand All @@ -17,12 +18,12 @@ const (

type Evictor struct {
store *sandbox.Store
removeSandbox func(ctx context.Context, sandbox sandbox.Sandbox, stateAction sandbox.StateAction) error
removeSandbox func(ctx context.Context, teamID uuid.UUID, sandboxID string, stateAction sandbox.StateAction) error
}

func New(
store *sandbox.Store,
removeSandbox func(ctx context.Context, sandbox sandbox.Sandbox, stateAction sandbox.StateAction) error,
removeSandbox func(ctx context.Context, teamID uuid.UUID, sandboxID string, stateAction sandbox.StateAction) error,
) *Evictor {
return &Evictor{
store: store,
Expand Down Expand Up @@ -58,7 +59,7 @@ func (e *Evictor) Start(ctx context.Context) {
}

logger.L().Debug(ctx, "Evicting sandbox", logger.WithSandboxID(item.SandboxID), zap.String("state_action", stateAction.Name))
if err := e.removeSandbox(context.WithoutCancel(ctx), item, stateAction); err != nil {
if err := e.removeSandbox(context.WithoutCancel(ctx), item.TeamID, item.SandboxID, stateAction); err != nil {
logger.L().Debug(ctx, "Evicting sandbox failed", zap.Error(err), logger.WithSandboxID(item.SandboxID))
}

Expand Down
9 changes: 2 additions & 7 deletions packages/api/internal/orchestrator/snapshot_template.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,7 @@ func (o *Orchestrator) CreateSnapshotTemplate(ctx context.Context, teamID uuid.U
ctx, span := tracer.Start(ctx, "create-snapshot-template")
defer span.End()

sbx, err := o.sandboxStore.Get(ctx, teamID, sandboxID)
if err != nil {
return SnapshotTemplateResult{}, fmt.Errorf("failed to get sandbox: %w", err)
}

alreadyDone, finishSnapshotting, err := o.sandboxStore.StartRemoving(ctx, teamID, sandboxID, sandbox.StateActionSnapshot)
sbx, alreadyDone, finishSnapshotting, err := o.sandboxStore.StartRemoving(ctx, teamID, sandboxID, sandbox.StateActionSnapshot)
if err != nil {
return SnapshotTemplateResult{}, fmt.Errorf("failed to start snapshotting: %w", err)
}
Expand Down Expand Up @@ -102,7 +97,7 @@ func (o *Orchestrator) CreateSnapshotTemplate(ctx context.Context, teamID uuid.U
// so RemoveSandbox can proceed without deadlock.
finish(err)

if killErr := o.RemoveSandbox(ctx, sbx, sandbox.StateActionKill); killErr != nil {
if killErr := o.RemoveSandbox(ctx, teamID, sandboxID, sandbox.StateActionKill); killErr != nil {
telemetry.ReportError(ctx, "error killing sandbox after failed checkpoint", killErr)
}

Expand Down
13 changes: 10 additions & 3 deletions packages/api/internal/sandbox/storage/memory/operations.go
Original file line number Diff line number Diff line change
Expand Up @@ -117,13 +117,20 @@ func (s *Storage) Update(_ context.Context, _ uuid.UUID, sandboxID string, updat
return sbx, nil
}

func (s *Storage) StartRemoving(ctx context.Context, _ uuid.UUID, sandboxID string, stateAction sandbox.StateAction) (alreadyDone bool, callback func(context.Context, error), err error) {
func (s *Storage) StartRemoving(ctx context.Context, teamID uuid.UUID, sandboxID string, stateAction sandbox.StateAction) (sandbox.Sandbox, bool, func(context.Context, error), error) {
sbx, err := s.get(sandboxID)
if err != nil {
return false, nil, err
return sandbox.Sandbox{}, false, nil, &sandbox.NotFoundError{SandboxID: sandboxID}
}

return startRemoving(ctx, sbx, stateAction)
data := sbx.Data()
if data.TeamID != teamID {
return sandbox.Sandbox{}, false, nil, &sandbox.NotFoundError{SandboxID: sandboxID}
}

alreadyDone, callback, err := startRemoving(ctx, sbx, stateAction)

return data, alreadyDone, callback, err
}

func startRemoving(ctx context.Context, sbx *memorySandbox, stateAction sandbox.StateAction) (alreadyDone bool, callback func(ctx context.Context, err error), err error) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -449,7 +449,7 @@ func TestStartRemoving_DuringSnapshotting(t *testing.T) {
err := storage.Add(ctx, sbx)
require.NoError(t, err)

snapAlreadyDone, finishSnap, err := storage.StartRemoving(ctx, sbx.TeamID, sbx.SandboxID, sandbox.StateActionSnapshot)
_, snapAlreadyDone, finishSnap, err := storage.StartRemoving(ctx, sbx.TeamID, sbx.SandboxID, sandbox.StateActionSnapshot)
require.NoError(t, err)
assert.False(t, snapAlreadyDone)
require.NotNil(t, finishSnap)
Expand All @@ -461,7 +461,7 @@ func TestStartRemoving_DuringSnapshotting(t *testing.T) {

go func() {
defer close(pauseDone)
pauseAlreadyDone, pauseFinish, pauseErr = storage.StartRemoving(ctx, sbx.TeamID, sbx.SandboxID, sandbox.StateActionPause)
_, pauseAlreadyDone, pauseFinish, pauseErr = storage.StartRemoving(ctx, sbx.TeamID, sbx.SandboxID, sandbox.StateActionPause)
}()

time.Sleep(50 * time.Millisecond)
Expand Down Expand Up @@ -505,7 +505,7 @@ func TestStartRemoving_DuringSnapshotting(t *testing.T) {
err := storage.Add(ctx, sbx)
require.NoError(t, err)

snapAlreadyDone, finishSnap, err := storage.StartRemoving(ctx, sbx.TeamID, sbx.SandboxID, sandbox.StateActionSnapshot)
_, snapAlreadyDone, finishSnap, err := storage.StartRemoving(ctx, sbx.TeamID, sbx.SandboxID, sandbox.StateActionSnapshot)
require.NoError(t, err)
assert.False(t, snapAlreadyDone)

Expand All @@ -517,7 +517,7 @@ func TestStartRemoving_DuringSnapshotting(t *testing.T) {

go func() {
defer close(killDone)
killAlreadyDone, killFinish, killErr = storage.StartRemoving(ctx, sbx.TeamID, sbx.SandboxID, sandbox.StateActionKill)
_, killAlreadyDone, killFinish, killErr = storage.StartRemoving(ctx, sbx.TeamID, sbx.SandboxID, sandbox.StateActionKill)
}()

// Give the kill goroutine time to start waiting
Expand Down Expand Up @@ -564,7 +564,7 @@ func TestStartRemoving_DuringSnapshotting(t *testing.T) {
err := storage.Add(ctx, sbx)
require.NoError(t, err)

_, finishSnap, err := storage.StartRemoving(ctx, sbx.TeamID, sbx.SandboxID, sandbox.StateActionSnapshot)
_, _, finishSnap, err := storage.StartRemoving(ctx, sbx.TeamID, sbx.SandboxID, sandbox.StateActionSnapshot)
require.NoError(t, err)

// Finish with error — state stays Snapshotting, transition cleared
Expand All @@ -575,7 +575,7 @@ func TestStartRemoving_DuringSnapshotting(t *testing.T) {
assert.Equal(t, sandbox.StateSnapshotting, got.State)

// Kill proceeds immediately — no active transition, Snapshotting→Killing is allowed
killAlreadyDone, killFinish, killErr := storage.StartRemoving(ctx, sbx.TeamID, sbx.SandboxID, sandbox.StateActionKill)
_, killAlreadyDone, killFinish, killErr := storage.StartRemoving(ctx, sbx.TeamID, sbx.SandboxID, sandbox.StateActionKill)
require.NoError(t, killErr)
assert.False(t, killAlreadyDone)
require.NotNil(t, killFinish)
Expand Down Expand Up @@ -607,7 +607,7 @@ func TestStartRemoving_DuringSnapshotting(t *testing.T) {
err := storage.Add(ctx, sbx)
require.NoError(t, err)

snapAlreadyDone, finishSnap, err := storage.StartRemoving(ctx, sbx.TeamID, sbx.SandboxID, sandbox.StateActionSnapshot)
_, snapAlreadyDone, finishSnap, err := storage.StartRemoving(ctx, sbx.TeamID, sbx.SandboxID, sandbox.StateActionSnapshot)
require.NoError(t, err)
assert.False(t, snapAlreadyDone)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ func (m *PopulateRedisStorage) Update(ctx context.Context, teamID uuid.UUID, san
return sbx, nil
}

func (m *PopulateRedisStorage) StartRemoving(ctx context.Context, teamID uuid.UUID, sandboxID string, stateAction sandbox.StateAction) (alreadyDone bool, callback func(context.Context, error), err error) {
func (m *PopulateRedisStorage) StartRemoving(ctx context.Context, teamID uuid.UUID, sandboxID string, stateAction sandbox.StateAction) (sandbox.Sandbox, bool, func(context.Context, error), error) {
return m.memoryBackend.StartRemoving(ctx, teamID, sandboxID, stateAction)
}

Expand Down
Loading
Loading