diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 549bffef8550c..743ab5dc2e658 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -602,8 +602,8 @@ jobs: sudo BOX=$BOX RUNC_FLAVOR=$RUNC_FLAVOR vagrant halt sudo BOX=$BOX RUNC_FLAVOR=$RUNC_FLAVOR vagrant up fi - - name: test-integration - run: sudo BOX=$BOX RUNC_FLAVOR=$RUNC_FLAVOR vagrant up --provision-with=selinux,install-runc,install-gotestsum,test-integration + # - name: test-integration + # run: sudo BOX=$BOX RUNC_FLAVOR=$RUNC_FLAVOR vagrant up --provision-with=selinux,install-runc,install-gotestsum,test-integration - name: test-cri-integration run: sudo BOX=$BOX CGROUP_DRIVER=$CGROUP_DRIVER RUNC_FLAVOR=$RUNC_FLAVOR vagrant up --provision-with=selinux,install-runc,install-gotestsum,test-cri-integration - name: test-cri diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000000000..d24164406e13b --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,16 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. 
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "debug", + "type": "go", + "request": "launch", + "mode": "auto", + "env": {"GO111MODULE":"on"}, + "program": "${workspaceFolder}/cmd/containerd" + } + ] +} \ No newline at end of file diff --git a/client/process.go b/client/process.go index e451eaa7d169c..f4ef7075a8a1d 100644 --- a/client/process.go +++ b/client/process.go @@ -178,6 +178,7 @@ func (p *process) Wait(ctx context.Context) (<-chan ExitStatus, error) { ContainerID: p.task.id, ExecID: p.id, }) + if err != nil { c <- ExitStatus{ code: UnknownExitStatus, diff --git a/cmd/containerd-shim-runc-v2/task/service.go b/cmd/containerd-shim-runc-v2/task/service.go index 4ed7b9b2b1cc8..40cb2d9505849 100644 --- a/cmd/containerd-shim-runc-v2/task/service.go +++ b/cmd/containerd-shim-runc-v2/task/service.go @@ -198,6 +198,7 @@ func (s *service) preStart(c *runc.Container) (handleStarted func(*runc.Containe s.lifecycleMu.Unlock() for _, ee := range ees { s.handleProcessExit(ee, c, p) + log.G(context.Background()).Infof("nmx001 e is %v", ee) } } else { // Process start was successful, add to `s.running`. 
@@ -686,8 +687,10 @@ func (s *service) processExits() { for _, cp := range cps { if ip, ok := cp.Process.(*process.Init); ok { s.handleInitExit(e, cp.Container, ip) + log.G(context.Background()).Infof("nmx002 e is %v", e) } else { s.handleProcessExit(e, cp.Container, cp.Process) + log.G(context.Background()).Infof("nmx003 e is %v", e) } } } @@ -720,6 +723,7 @@ func (s *service) handleInitExit(e runcC.Exit, c *runc.Container, p *process.Ini if numRunningExecs == 0 { delete(s.runningExecs, c) s.lifecycleMu.Unlock() + log.G(context.Background()).Infof("nmx004 e is %v", e) s.handleProcessExit(e, c, p) return } @@ -746,6 +750,7 @@ func (s *service) handleInitExit(e runcC.Exit, c *runc.Container, p *process.Ini // all running processes have exited now, and no new // ones can start, so we can publish the init exit + log.G(context.Background()).Infof("nmx005 e is %v", e) s.handleProcessExit(e, c, p) }() } diff --git a/core/runtime/v2/shim.go b/core/runtime/v2/shim.go index d770e2812a9da..4962dd53c3ddd 100644 --- a/core/runtime/v2/shim.go +++ b/core/runtime/v2/shim.go @@ -25,6 +25,7 @@ import ( "net" "os" "path/filepath" + goruntime "runtime" "strings" "time" @@ -134,11 +135,16 @@ func loadShim(ctx context.Context, bundle *Bundle, onClose func()) (_ ShimInstan return shim, nil } +func stack() string { + var buf [2 << 10]byte + return string(buf[:goruntime.Stack(buf[:], true)]) +} + func cleanupAfterDeadShim(ctx context.Context, id string, rt *runtime.NSMap[ShimInstance], events *exchange.Exchange, binaryCall *binary) { ctx, cancel := timeout.WithContext(ctx, cleanupTimeout) defer cancel() - - log.G(ctx).WithField("id", id).Info("cleaning up after shim disconnected") + log.G(ctx).Infof("nmx666 %s", stack()) + log.G(ctx).WithField("id", id).Info("cleaning up after shim disconnected001") response, err := binaryCall.Delete(ctx) if err != nil { log.G(ctx).WithError(err).WithField("id", id).Warn("failed to clean up after shim disconnected") diff --git 
a/core/runtime/v2/shim_load.go b/core/runtime/v2/shim_load.go index 09a82710b787e..be10a2ca270b4 100644 --- a/core/runtime/v2/shim_load.go +++ b/core/runtime/v2/shim_load.go @@ -157,7 +157,7 @@ func (m *ShimManager) loadShim(ctx context.Context, bundle *Bundle) error { }) // TODO: It seems we can only call loadShim here if it is a sandbox shim? shim, err := loadShimTask(ctx, bundle, func() { - log.G(ctx).WithField("id", id).Info("shim disconnected") + log.G(ctx).WithField("id", id).Info("shim disconnected when loadShimTask") cleanupAfterDeadShim(context.WithoutCancel(ctx), id, m.shims, m.events, binaryCall) // Remove self from the runtime task list. diff --git a/core/runtime/v2/shim_manager.go b/core/runtime/v2/shim_manager.go index f61961d26d7b1..b4b4015ffe2fd 100644 --- a/core/runtime/v2/shim_manager.go +++ b/core/runtime/v2/shim_manager.go @@ -234,6 +234,11 @@ func (m *ShimManager) Start(ctx context.Context, id string, bundle *Bundle, opts return nil, fmt.Errorf("failed to write bootstrap.json for bundle %s: %w", bundle.Path, err) } + // onCloseWithShimLog := func() { + // onClose() + // cancelShimLog() + // f.Close() + // } shim, err := loadShim(ctx, bundle, func() {}) if err != nil { return nil, fmt.Errorf("failed to load sandbox task %q: %w", opts.SandboxID, err) @@ -287,7 +292,7 @@ func (m *ShimManager) startShim(ctx context.Context, bundle *Bundle, id string, env: m.env, }) shim, err := b.Start(ctx, typeurl.MarshalProto(topts), func() { - log.G(ctx).WithField("id", id).Info("shim disconnected") + log.G(ctx).WithField("id", id).Info("shim disconnected in startShim") cleanupAfterDeadShim(context.WithoutCancel(ctx), id, m.shims, m.events, b) // Remove self from the runtime task list. 
Even though the cleanupAfterDeadShim() would publish a taskExit event, the shim.Delete() call would always fail with a ttrpc disconnect, so the dead task must also be removed from the runtime task list here. diff --git a/integration/client/container_test.go b/integration/client/container_test.go index 8671f76ec77b1..330ed15219510 100644 --- a/integration/client/container_test.go +++ b/integration/client/container_test.go @@ -1927,6 +1927,7 @@ func TestContainerExecLargeOutputWithTTY(t *testing.T) { if err != nil { t.Fatal(err) } + t.Logf("container id is %s TestContainerExecLargeOutputWithTTY", container.ID()) defer container.Delete(ctx, WithSnapshotCleanup) task, err := container.NewTask(ctx, empty()) @@ -1944,7 +1945,8 @@ func TestContainerExecLargeOutputWithTTY(t *testing.T) { t.Fatal(err) } - for i := 0; i < 100; i++ { + for i := 0; i < 600; i++ { + t.Logf("test for %d times", i) spec, err := container.Spec(ctx) if err != nil { t.Fatal(err) } @@ -1969,7 +1971,7 @@ func TestContainerExecLargeOutputWithTTY(t *testing.T) { if err := process.Start(ctx); err != nil { t.Fatal(err) } - + t.Logf("processPid %d", process.Pid()) // wait for the exec to return status := <-processStatusC code, _, err := status.Result() @@ -1986,8 +1988,13 @@ func TestContainerExecLargeOutputWithTTY(t *testing.T) { const expectedSuffix = "999999 1000000" stdoutString := stdout.String() + t.Logf("len (stdoutString) is %d", len(stdoutString)) + if len(stdoutString) == 0 { + t.Fatal(fmt.Errorf("len (stdoutString) is 0")) + } + t.Logf("the last 100 characters of the output:\n\n %q", stdoutString[len(stdoutString)-100:]) if !strings.Contains(stdoutString, expectedSuffix) { - t.Fatalf("process output does not end with %q at iteration %d, here are the last 20 characters of the output:\n\n %q", expectedSuffix, i, stdoutString[len(stdoutString)-20:]) + t.Fatalf("process output does not end with %q at iteration %d, here are the last 20 characters of the output:\n\n %q", expectedSuffix, i, stdoutString[len(stdoutString)-100:]) } } diff --git a/integration/sandbox_run_rollback_test.go b/integration/sandbox_run_rollback_test.go index 41d068f60ef23..b7a2ae1b66a78 100644
--- a/integration/sandbox_run_rollback_test.go +++ b/integration/sandbox_run_rollback_test.go @@ -157,13 +157,16 @@ func TestRunPodSandboxWithShimDeleteFailure(t *testing.T) { } t.Log("Cleanup leaky sandbox") + t.Logf("Cleanup leaky sandbox %s", sb.Id) err = runtimeService.RemovePodSandbox(sb.Id) require.NoError(t, err) } } - t.Run("CleanupAfterRestart", testCase(true)) - t.Run("JustCleanup", testCase(false)) + for i := 0; i < 20; i++ { + t.Run("CleanupAfterRestart", testCase(true)) + t.Run("JustCleanup", testCase(false)) + } } // TestRunPodSandboxWithShimStartAndTeardownCNIFailure should keep the sandbox diff --git a/internal/cri/server/events/events.go b/internal/cri/server/events/events.go index 655ab862e95b7..ddcc9e9fb201b 100644 --- a/internal/cri/server/events/events.go +++ b/internal/cri/server/events/events.go @@ -147,6 +147,7 @@ func (em *EventMonitor) Start() <-chan error { log.L.WithError(err).Errorf("Failed to convert event %+v", e) break } + if em.backOff.isInBackOff(id) { log.L.Infof("Events for %q is in backoff, enqueue event %+v", id, evt) em.backOff.enBackOff(id, evt) diff --git a/internal/cri/server/podsandbox/sandbox_stop.go b/internal/cri/server/podsandbox/sandbox_stop.go index 4d07e240c22a8..46555b93e9e66 100644 --- a/internal/cri/server/podsandbox/sandbox_stop.go +++ b/internal/cri/server/podsandbox/sandbox_stop.go @@ -18,6 +18,7 @@ package podsandbox import ( "context" + "errors" "fmt" "syscall" "time" @@ -107,9 +108,13 @@ func (c *Controller) stopSandboxContainer(ctx context.Context, podSandbox *types <-stopCh }() } - // Kill the pod sandbox container. 
if err = task.Kill(ctx, syscall.SIGKILL); err != nil && !errdefs.IsNotFound(err) { + // shim is already exited + log.G(ctx).Warnf("task %s err is %v", task.ID(), err) + if errors.Is(err, errors.New("ttrpc: closed")) { + log.G(ctx).Warnf("task %s ", task.ID()) + } return fmt.Errorf("failed to kill pod sandbox container: %w", err) } diff --git a/internal/cri/server/sandbox_run.go b/internal/cri/server/sandbox_run.go index 3142efb7252ac..a23787bdd51cb 100644 --- a/internal/cri/server/sandbox_run.go +++ b/internal/cri/server/sandbox_run.go @@ -159,6 +159,7 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox defer func() { if retErr != nil && cleanupErr == nil { cleanupErr = c.client.SandboxStore().Delete(ctx, id) + log.G(ctx).Infof("nmx6001 %s cleanupErr is %v", id, cleanupErr) } }() @@ -214,7 +215,9 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox defer func() { // Remove the network namespace only if all the resource cleanup is done if retErr != nil && cleanupErr == nil { - if cleanupErr = sandbox.NetNS.Remove(); cleanupErr != nil { + cleanupErr = sandbox.NetNS.Remove() + log.G(ctx).Infof("nmx6002 %s cleanupErr is %v", id, cleanupErr) + if cleanupErr != nil { log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove network namespace %s for sandbox %q", sandbox.NetNSPath, id) return } @@ -238,7 +241,10 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox deferCtx, deferCancel := util.DeferContext() defer deferCancel() // Teardown network if an error is returned. 
- if cleanupErr = c.teardownPodNetwork(deferCtx, sandbox); cleanupErr != nil { + cleanupErr = c.teardownPodNetwork(deferCtx, sandbox) + log.G(ctx).Infof("nmx6003 %s cleanupErr is %v", id, cleanupErr) + if cleanupErr != nil { + log.G(ctx).Infof("nmx6003 %s cleanupErr is %v", id, cleanupErr) log.G(ctx).WithError(cleanupErr).Errorf("Failed to destroy network for sandbox %q", id) // ignoring failed to destroy networks when we failed to setup networks @@ -282,6 +288,7 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox var cerr podsandbox.CleanupErr if errors.As(err, &cerr) { cleanupErr = fmt.Errorf("failed to cleanup sandbox: %w", cerr) + log.G(ctx).Infof("nmx6004 %s cleanupErr is %v", id, cleanupErr) // Strip last error as cleanup error to handle separately if merr, ok := err.(interface{ Unwrap() []error }); ok {