ningmingxiao · ningmingxiao · Jun 21, 2025
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -602,8 +602,8 @@ jobs:
             sudo BOX=$BOX RUNC_FLAVOR=$RUNC_FLAVOR vagrant halt
             sudo BOX=$BOX RUNC_FLAVOR=$RUNC_FLAVOR vagrant up
           fi
-      - name: test-integration
-        run: sudo BOX=$BOX RUNC_FLAVOR=$RUNC_FLAVOR vagrant up --provision-with=selinux,install-runc,install-gotestsum,test-integration
+      # - name: test-integration
+      #   run: sudo BOX=$BOX RUNC_FLAVOR=$RUNC_FLAVOR vagrant up --provision-with=selinux,install-runc,install-gotestsum,test-integration
       - name: test-cri-integration
         run: sudo BOX=$BOX CGROUP_DRIVER=$CGROUP_DRIVER RUNC_FLAVOR=$RUNC_FLAVOR vagrant up --provision-with=selinux,install-runc,install-gotestsum,test-cri-integration
       - name: test-cri

diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -0,0 +1,16 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "debug",
+            "type": "go",
+            "request": "launch",
+            "mode": "auto",
+            "env": {"GO111MODULE":"on"},
+             "program": "${workspaceFolder}/cmd/containerd"
+        }
+    ]
+}
diff --git a/client/process.go b/client/process.go
@@ -178,6 +178,7 @@ func (p *process) Wait(ctx context.Context) (<-chan ExitStatus, error) {
 			ContainerID: p.task.id,
 			ExecID:      p.id,
 		})
+
 		if err != nil {
 			c <- ExitStatus{
 				code: UnknownExitStatus,

diff --git a/cmd/containerd-shim-runc-v2/task/service.go b/cmd/containerd-shim-runc-v2/task/service.go
@@ -198,6 +198,7 @@ func (s *service) preStart(c *runc.Container) (handleStarted func(*runc.Containe
 			s.lifecycleMu.Unlock()
 			for _, ee := range ees {
 				s.handleProcessExit(ee, c, p)
+				log.G(context.Background()).Infof("nmx001 e is %v", ee)
 			}
 		} else {
 			// Process start was successful, add to `s.running`.
@@ -686,8 +687,10 @@ func (s *service) processExits() {
 		for _, cp := range cps {
 			if ip, ok := cp.Process.(*process.Init); ok {
 				s.handleInitExit(e, cp.Container, ip)
+				log.G(context.Background()).Infof("nmx002 e is %v", e)
 			} else {
 				s.handleProcessExit(e, cp.Container, cp.Process)
+				log.G(context.Background()).Infof("nmx003 e is %v", e)
 			}
 		}
 	}
@@ -720,6 +723,7 @@ func (s *service) handleInitExit(e runcC.Exit, c *runc.Container, p *process.Ini
 	if numRunningExecs == 0 {
 		delete(s.runningExecs, c)
 		s.lifecycleMu.Unlock()
+		log.G(context.Background()).Infof("nmx004 e is %v", e)
 		s.handleProcessExit(e, c, p)
 		return
 	}
@@ -746,6 +750,7 @@ func (s *service) handleInitExit(e runcC.Exit, c *runc.Container, p *process.Ini
 
 		// all running processes have exited now, and no new
 		// ones can start, so we can publish the init exit
+		log.G(context.Background()).Infof("nmx005 e is %v", e)
 		s.handleProcessExit(e, c, p)
 	}()
 }

diff --git a/core/runtime/v2/shim.go b/core/runtime/v2/shim.go
@@ -25,6 +25,7 @@ import (
 	"net"
 	"os"
 	"path/filepath"
+	goruntime "runtime"
 	"strings"
 	"time"
 
@@ -134,11 +135,16 @@ func loadShim(ctx context.Context, bundle *Bundle, onClose func()) (_ ShimInstan
 	return shim, nil
 }
 
+func stack() string {
+	var buf [2 << 10]byte
+	return string(buf[:goruntime.Stack(buf[:], true)])
+}
+
 func cleanupAfterDeadShim(ctx context.Context, id string, rt *runtime.NSMap[ShimInstance], events *exchange.Exchange, binaryCall *binary) {
 	ctx, cancel := timeout.WithContext(ctx, cleanupTimeout)
 	defer cancel()
-
-	log.G(ctx).WithField("id", id).Info("cleaning up after shim disconnected")
+	log.G(ctx).Infof("nmx666 %s", stack())
+	log.G(ctx).WithField("id", id).Info("cleaning up after shim disconnected001")
 	response, err := binaryCall.Delete(ctx)
 	if err != nil {
 		log.G(ctx).WithError(err).WithField("id", id).Warn("failed to clean up after shim disconnected")

diff --git a/core/runtime/v2/shim_load.go b/core/runtime/v2/shim_load.go
@@ -157,7 +157,7 @@ func (m *ShimManager) loadShim(ctx context.Context, bundle *Bundle) error {
 		})
 	// TODO: It seems we can only call loadShim here if it is a sandbox shim?
 	shim, err := loadShimTask(ctx, bundle, func() {
-		log.G(ctx).WithField("id", id).Info("shim disconnected")
+		log.G(ctx).WithField("id", id).Info("shim disconnected when loadShimTask")
 
 		cleanupAfterDeadShim(context.WithoutCancel(ctx), id, m.shims, m.events, binaryCall)
 		// Remove self from the runtime task list.

diff --git a/core/runtime/v2/shim_manager.go b/core/runtime/v2/shim_manager.go
@@ -234,6 +234,11 @@ func (m *ShimManager) Start(ctx context.Context, id string, bundle *Bundle, opts
 			return nil, fmt.Errorf("failed to write bootstrap.json for bundle %s: %w", bundle.Path, err)
 		}
 
+		// onCloseWithShimLog := func() {
+		// 	onClose()
+		// 	cancelShimLog()
+		// 	f.Close()
+		// }
 		shim, err := loadShim(ctx, bundle, func() {})
 		if err != nil {
 			return nil, fmt.Errorf("failed to load sandbox task %q: %w", opts.SandboxID, err)
@@ -287,7 +292,7 @@ func (m *ShimManager) startShim(ctx context.Context, bundle *Bundle, id string,
 		env:          m.env,
 	})
 	shim, err := b.Start(ctx, typeurl.MarshalProto(topts), func() {
-		log.G(ctx).WithField("id", id).Info("shim disconnected")
+		log.G(ctx).WithField("id", id).Info("shim disconnected in startShim")
 
 		cleanupAfterDeadShim(context.WithoutCancel(ctx), id, m.shims, m.events, b)
 		// Remove self from the runtime task list. Even though the cleanupAfterDeadShim()

diff --git a/integration/client/container_test.go b/integration/client/container_test.go
@@ -1927,6 +1927,7 @@ func TestContainerExecLargeOutputWithTTY(t *testing.T) {
 	if err != nil {
 		t.Fatal(err)
 	}
+	t.Logf("container id is %s TestContainerExecLargeOutputWithTTY", container.ID())
 	defer container.Delete(ctx, WithSnapshotCleanup)
 
 	task, err := container.NewTask(ctx, empty())
@@ -1944,7 +1945,8 @@ func TestContainerExecLargeOutputWithTTY(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	for i := 0; i < 100; i++ {
+	for i := 0; i < 600; i++ {
+		t.Logf("test for %d times", i)
 		spec, err := container.Spec(ctx)
 		if err != nil {
 			t.Fatal(err)
@@ -1969,7 +1971,7 @@ func TestContainerExecLargeOutputWithTTY(t *testing.T) {
 		if err := process.Start(ctx); err != nil {
 			t.Fatal(err)
 		}
-
+		t.Logf("processPid %d", process.Pid())
 		// wait for the exec to return
 		status := <-processStatusC
 		code, _, err := status.Result()
@@ -1986,8 +1988,13 @@ func TestContainerExecLargeOutputWithTTY(t *testing.T) {
 
 		const expectedSuffix = "999999 1000000"
 		stdoutString := stdout.String()
+		t.Logf("len (stdoutString) is %d", len(stdoutString))
+		if len(stdoutString) == 0 {
+			t.Fatal(fmt.Errorf("len (stdoutString) is 0"))
+		}
+		t.Logf("the last 100 characters of the output:\n\n %q", stdoutString[len(stdoutString)-100:])
 		if !strings.Contains(stdoutString, expectedSuffix) {
-			t.Fatalf("process output does not end with %q at iteration %d, here are the last 20 characters of the output:\n\n %q", expectedSuffix, i, stdoutString[len(stdoutString)-20:])
+			t.Fatalf("process output does not end with %q at iteration %d, here are the last 20 characters of the output:\n\n %q", expectedSuffix, i, stdoutString[len(stdoutString)-100:])
 		}
 
 	}

diff --git a/integration/sandbox_run_rollback_test.go b/integration/sandbox_run_rollback_test.go
@@ -157,13 +157,16 @@ func TestRunPodSandboxWithShimDeleteFailure(t *testing.T) {
 			}
 
 			t.Log("Cleanup leaky sandbox")
+			t.Logf("Cleanup leaky sandbox %s", sb.Id)
 			err = runtimeService.RemovePodSandbox(sb.Id)
 			require.NoError(t, err)
 		}
 	}
 
-	t.Run("CleanupAfterRestart", testCase(true))
-	t.Run("JustCleanup", testCase(false))
+	for i := 0; i < 20; i++ {
+		t.Run("CleanupAfterRestart", testCase(true))
+		t.Run("JustCleanup", testCase(false))
+	}
 }
 
 // TestRunPodSandboxWithShimStartAndTeardownCNIFailure should keep the sandbox

diff --git a/internal/cri/server/events/events.go b/internal/cri/server/events/events.go
@@ -147,6 +147,7 @@ func (em *EventMonitor) Start() <-chan error {
 					log.L.WithError(err).Errorf("Failed to convert event %+v", e)
 					break
 				}
+
 				if em.backOff.isInBackOff(id) {
 					log.L.Infof("Events for %q is in backoff, enqueue event %+v", id, evt)
 					em.backOff.enBackOff(id, evt)

diff --git a/internal/cri/server/podsandbox/sandbox_stop.go b/internal/cri/server/podsandbox/sandbox_stop.go
@@ -18,6 +18,7 @@ package podsandbox
 
 import (
 	"context"
+	"errors"
 	"fmt"
 	"syscall"
 	"time"
@@ -107,9 +108,13 @@ func (c *Controller) stopSandboxContainer(ctx context.Context, podSandbox *types
 			<-stopCh
 		}()
 	}
-
 	// Kill the pod sandbox container.
 	if err = task.Kill(ctx, syscall.SIGKILL); err != nil && !errdefs.IsNotFound(err) {
+		// shim is already exited
+		log.G(ctx).Warnf("task %s err is %v", task.ID(), err)
+		if errors.Is(err, errors.New("ttrpc: closed")) {
+			log.G(ctx).Warnf("task %s ", task.ID())
+		}
 		return fmt.Errorf("failed to kill pod sandbox container: %w", err)
 	}
 

diff --git a/internal/cri/server/sandbox_run.go b/internal/cri/server/sandbox_run.go
@@ -159,6 +159,7 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
 	defer func() {
 		if retErr != nil && cleanupErr == nil {
 			cleanupErr = c.client.SandboxStore().Delete(ctx, id)
+			log.G(ctx).Infof("nmx6001 %s cleanupErr is %v", id, cleanupErr)
 		}
 	}()
 
@@ -214,7 +215,9 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
 		defer func() {
 			// Remove the network namespace only if all the resource cleanup is done
 			if retErr != nil && cleanupErr == nil {
-				if cleanupErr = sandbox.NetNS.Remove(); cleanupErr != nil {
+				cleanupErr = sandbox.NetNS.Remove()
+				log.G(ctx).Infof("nmx6002 %s cleanupErr is %v", id, cleanupErr)
+				if cleanupErr != nil {
 					log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove network namespace %s for sandbox %q", sandbox.NetNSPath, id)
 					return
 				}
@@ -238,7 +241,10 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
 				deferCtx, deferCancel := util.DeferContext()
 				defer deferCancel()
 				// Teardown network if an error is returned.
-				if cleanupErr = c.teardownPodNetwork(deferCtx, sandbox); cleanupErr != nil {
+				cleanupErr = c.teardownPodNetwork(deferCtx, sandbox)
+				log.G(ctx).Infof("nmx6003 %s cleanupErr is %v", id, cleanupErr)
+				if cleanupErr != nil {
+					log.G(ctx).Infof("nmx6003 %s cleanupErr is %v", id, cleanupErr)
 					log.G(ctx).WithError(cleanupErr).Errorf("Failed to destroy network for sandbox %q", id)
 
 					// ignoring failed to destroy networks when we failed to setup networks
@@ -282,6 +288,7 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
 		var cerr podsandbox.CleanupErr
 		if errors.As(err, &cerr) {
 			cleanupErr = fmt.Errorf("failed to cleanup sandbox: %w", cerr)
+			log.G(ctx).Infof("nmx6004 %s cleanupErr is %v", id, cleanupErr)
 
 			// Strip last error as cleanup error to handle separately
 			if merr, ok := err.(interface{ Unwrap() []error }); ok {