Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions cmd/scheduler/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"fmt"
"net/http"
"net/http/pprof"
"time"

"github.com/julienschmidt/httprouter"
"github.com/spf13/cobra"
Expand All @@ -32,6 +33,7 @@ import (
"github.com/Project-HAMi/HAMi/pkg/util"
"github.com/Project-HAMi/HAMi/pkg/util/client"
"github.com/Project-HAMi/HAMi/pkg/util/flag"
"github.com/Project-HAMi/HAMi/pkg/util/nodelock"
"github.com/Project-HAMi/HAMi/pkg/version"
)

Expand Down Expand Up @@ -72,6 +74,7 @@ func init() {
rootCmd.Flags().IntVar(&config.Burst, "kube-burst", client.DefaultBurst, "Burst to use while talking with kube-apiserver.")
rootCmd.Flags().IntVar(&config.Timeout, "kube-timeout", client.DefaultTimeout, "Timeout to use while talking with kube-apiserver.")
rootCmd.Flags().BoolVar(&enableProfiling, "profiling", false, "Enable pprof profiling via HTTP server")
rootCmd.Flags().DurationVar(&config.NodeLockTimeout, "node-lock-timeout", time.Minute*5, "timeout for node locks")

rootCmd.PersistentFlags().AddGoFlagSet(device.GlobalFlagSet())
rootCmd.AddCommand(version.VersionCmd)
Expand Down Expand Up @@ -99,6 +102,9 @@ func injectProfilingRoute(router *httprouter.Router) {
}

func start() error {
// Initialize node lock timeout from config
nodelock.NodeLockTimeout = config.NodeLockTimeout
klog.InfoS("Set node lock timeout", "timeout", nodelock.NodeLockTimeout)
client.InitGlobalClient(
client.WithBurst(config.Burst),
client.WithQPS(config.QPS),
Expand Down
9 changes: 8 additions & 1 deletion pkg/scheduler/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,11 @@ limitations under the License.

package config

import "github.com/Project-HAMi/HAMi/pkg/util"
import (
"time"

"github.com/Project-HAMi/HAMi/pkg/util"
)

var (
QPS float32
Expand All @@ -37,4 +41,7 @@ var (

// NodeLabelSelector is scheduler filter node by node label.
NodeLabelSelector map[string]string

// NodeLockTimeout is the timeout for node locks.
NodeLockTimeout time.Duration
)
12 changes: 8 additions & 4 deletions pkg/util/nodelock/nodelock.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,11 @@ const (
NodeLockSep = ","
)

var lock sync.Mutex
var (
lock sync.Mutex
// NodeLockTimeout is the global timeout for node locks.
NodeLockTimeout time.Duration = time.Minute * 5
)

func SetNodeLock(nodeName string, lockname string, pods *corev1.Pod) error {
lock.Lock()
Expand Down Expand Up @@ -122,16 +126,16 @@ func LockNode(nodeName string, lockname string, pods *corev1.Pod) error {
if err != nil {
return err
}
if time.Since(lockTime) > time.Minute*5 {
klog.InfoS("Node lock expired", "node", nodeName, "lockTime", lockTime)
if time.Since(lockTime) > NodeLockTimeout {
klog.InfoS("Node lock expired", "node", nodeName, "lockTime", lockTime, "timeout", NodeLockTimeout)
err = ReleaseNodeLock(nodeName, lockname, pods, true)
if err != nil {
klog.ErrorS(err, "Failed to release node lock", "node", nodeName)
return err
}
return SetNodeLock(nodeName, lockname, pods)
}
return fmt.Errorf("node %s has been locked within 5 minutes", nodeName)
return fmt.Errorf("node %s has been locked within %v", nodeName, NodeLockTimeout)
}

func ParseNodeLock(value string) (lockTime time.Time, ns, name string, err error) {
Expand Down
49 changes: 49 additions & 0 deletions pkg/util/nodelock/nodelock_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ package nodelock

import (
"context"
"strings"
"testing"
"time"

corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -133,6 +135,53 @@ func Test_LockNode(t *testing.T) {
}
}

// TestLockNodeWithTimeout tests the specific case where line 130 is executed
func TestLockNodeWithTimeout(t *testing.T) {
client.KubeClient = fake.NewSimpleClientset()

// Set a custom timeout for testing
originalTimeout := NodeLockTimeout
NodeLockTimeout = time.Minute * 2
defer func() {
NodeLockTimeout = originalTimeout
}()

nodeName := "test-node-timeout"

// Create a node with a fresh lock (should not be expired)
freshLockTime := time.Now().Format(time.RFC3339)
lockValue := freshLockTime + NodeLockSep + "test-ns" + NodeLockSep + "test-pod"

client.KubeClient.CoreV1().Nodes().Create(context.TODO(), &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: nodeName,
Annotations: map[string]string{
NodeLockKey: lockValue,
},
},
}, metav1.CreateOptions{})

pod := &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "new-pod",
Namespace: "new-ns",
},
}

// Try to lock the node again - this should trigger line 130
err := LockNode(nodeName, "", pod)

// Verify the error contains the NodeLockTimeout value
if err == nil {
t.Fatal("Expected error but got nil")
}

expectedError := "has been locked within 2m0s"
if !strings.Contains(err.Error(), expectedError) {
t.Errorf("Expected error to contain '%s', but got: %v", expectedError, err)
}
}

func TestReleaseNodeLock(t *testing.T) {
client.KubeClient = fake.NewSimpleClientset()
type args struct {
Expand Down