Skip to content

Commit 1ee8850

Browse files
chore: Backport (03/04/2026) (#1264)
2 parents 27694c5 + d4ecd46 commit 1ee8850

File tree

9 files changed

+292
-33
lines changed

9 files changed

+292
-33
lines changed

.github/workflows/codespell.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ jobs:
1212
runs-on: ubuntu-latest
1313
steps:
1414
- name: Harden Runner
15-
uses: step-security/harden-runner@5ef0c079ce82195b2a36a210272d6b661572d83e # v2.14.2
15+
uses: step-security/harden-runner@a90bcbc6539c36a85cdfeb73f7e2f433735f215b # v2.15.0
1616
with:
1717
egress-policy: audit
1818

cmd/hubagent/main.go

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
"k8s.io/apimachinery/pkg/runtime"
2929
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
3030
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
31+
"k8s.io/client-go/rest"
3132
"k8s.io/klog/v2"
3233
clusterinventory "sigs.k8s.io/cluster-inventory-api/apis/v1alpha1"
3334
ctrl "sigs.k8s.io/controller-runtime"
@@ -102,21 +103,46 @@ func main() {
102103
// Set up controller-runtime logger
103104
ctrl.SetLogger(zap.New(zap.UseDevMode(true)))
104105

105-
config := ctrl.GetConfigOrDie()
106-
config.QPS, config.Burst = float32(opts.CtrlMgrOpts.HubQPS), opts.CtrlMgrOpts.HubBurst
106+
// Create separate configs for the general access purpose and the leader election purpose.
107+
//
108+
// This aims to improve the availability of the hub agent; originally all access to the API
109+
// server would share the same rate limiting configuration, and under adverse conditions (e.g.,
110+
// large volume of concurrent placements) controllers would exhaust all tokens in the rate limiter,
111+
// and effectively starve the leader election process (the runtime can no longer renew leases),
112+
// which would trigger the hub agent to restart even though the system remains functional.
113+
defaultCfg := ctrl.GetConfigOrDie()
114+
leaderElectionCfg := rest.CopyConfig(defaultCfg)
115+
116+
defaultCfg.QPS, defaultCfg.Burst = float32(opts.CtrlMgrOpts.HubQPS), opts.CtrlMgrOpts.HubBurst
117+
leaderElectionCfg.QPS, leaderElectionCfg.Burst = float32(opts.LeaderElectionOpts.LeaderElectionQPS), opts.LeaderElectionOpts.LeaderElectionBurst
107118

108119
mgrOpts := ctrl.Options{
109120
Scheme: scheme,
110121
Cache: cache.Options{
111122
SyncPeriod: &opts.CtrlMgrOpts.ResyncPeriod.Duration,
112123
DefaultTransform: cache.TransformStripManagedFields(),
113124
},
114-
LeaderElection: opts.LeaderElectionOpts.LeaderElect,
115-
LeaderElectionID: "136224848560.hub.fleet.azure.com",
116-
LeaderElectionNamespace: opts.LeaderElectionOpts.ResourceNamespace,
125+
LeaderElection: opts.LeaderElectionOpts.LeaderElect,
126+
LeaderElectionConfig: leaderElectionCfg,
127+
// If leader election is enabled, the hub agent by default uses a setup
128+
// with a lease duration of 60 secs, a renew deadline of 45 secs, and a retry period of 5 secs.
129+
// This setup gives the hub agent up to 9 attempts/45 seconds to renew its leadership lease
130+
// before it loses the leadership and restarts.
131+
//
132+
// These values are set significantly higher than the controller-runtime defaults
133+
// (15 seconds, 10 seconds, and 2 seconds respectively), as under heavy loads the hub agent
134+
// might have difficulty renewing its lease in time due to API server side latencies, which
135+
// might further lead to unexpected leadership losses (even when it is the only candidate
136+
// running) and restarts.
137+
//
138+
// Note (chenyu1): a minor side effect with the higher values is that when the agent does restart,
139+
// (or in the future when we do run multiple hub agent replicas), the new leader might have to wait a bit
140+
// longer (up to 60 seconds) to acquire the leadership, which should still be acceptable in most scenarios.
117141
LeaseDuration: &opts.LeaderElectionOpts.LeaseDuration.Duration,
118142
RenewDeadline: &opts.LeaderElectionOpts.RenewDeadline.Duration,
119143
RetryPeriod: &opts.LeaderElectionOpts.RetryPeriod.Duration,
144+
LeaderElectionID: "136224848560.hub.fleet.azure.com",
145+
LeaderElectionNamespace: opts.LeaderElectionOpts.ResourceNamespace,
120146
HealthProbeBindAddress: opts.CtrlMgrOpts.HealthProbeBindAddress,
121147
Metrics: metricsserver.Options{
122148
BindAddress: opts.CtrlMgrOpts.MetricsBindAddress,
@@ -129,7 +155,7 @@ func main() {
129155
if opts.CtrlMgrOpts.EnablePprof {
130156
mgrOpts.PprofBindAddress = fmt.Sprintf(":%d", opts.CtrlMgrOpts.PprofPort)
131157
}
132-
mgr, err := ctrl.NewManager(config, mgrOpts)
158+
mgr, err := ctrl.NewManager(defaultCfg, mgrOpts)
133159
if err != nil {
134160
klog.ErrorS(err, "unable to start controller manager.")
135161
exitWithErrorFunc()
@@ -187,7 +213,7 @@ func main() {
187213
}
188214

189215
ctx := ctrl.SetupSignalHandler()
190-
if err := workload.SetupControllers(ctx, &wg, mgr, config, opts); err != nil {
216+
if err := workload.SetupControllers(ctx, &wg, mgr, defaultCfg, opts); err != nil {
191217
klog.ErrorS(err, "unable to set up controllers")
192218
exitWithErrorFunc()
193219
}

cmd/hubagent/options/leaderelection.go

Lines changed: 87 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ package options
1818

1919
import (
2020
"flag"
21+
"fmt"
22+
"strconv"
2123
"time"
2224

2325
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -53,6 +55,18 @@ type LeaderElectionOptions struct {
5355
// The namespace of the resource object that will be used to lock during leader election cycles.
5456
// This option only applies if leader election is enabled.
5557
ResourceNamespace string
58+
59+
// The QPS limit set to the rate limiter of the Kubernetes client in use by the controller manager
60+
// for leader election purposes. This sets up a separate client-side throttling mechanism specifically
61+
// for lease related operations, mostly to avoid an adverse situation where normal operations
62+
// in the controller manager starve the lease related operations, and thus cause unexpected leadership losses.
63+
LeaderElectionQPS float64
64+
65+
// The burst limit set to the rate limiter of the Kubernetes client in use by the controller manager
66+
// for leader election purposes. This sets up a separate client-side throttling mechanism specifically
67+
// for lease related operations, mostly to avoid an adverse situation where normal operations
68+
// in the controller manager starve the lease related operations, and thus cause unexpected leadership losses.
69+
LeaderElectionBurst int
5670
}
5771

5872
// AddFlags adds flags for LeaderElectionOptions to the specified FlagSet.
@@ -69,23 +83,23 @@ func (o *LeaderElectionOptions) AddFlags(flags *flag.FlagSet) {
6983
flags.DurationVar(
7084
&o.LeaseDuration.Duration,
7185
"leader-lease-duration",
72-
15*time.Second,
86+
60*time.Second,
7387
"The duration of a leader election lease. This is the period where a non-leader candidate will wait after observing a leadership renewal before attempting to acquire leadership of the current leader. And it is also effectively the maximum duration that a leader can be stopped before it is replaced by another candidate. The option only applies if leader election is enabled.",
7488
)
7589

7690
// This input is sent to the controller manager for validation; no further check here.
7791
flags.DurationVar(
7892
&o.RenewDeadline.Duration,
7993
"leader-renew-deadline",
80-
10*time.Second,
94+
45*time.Second,
8195
"The interval between attempts by the acting master to renew a leadership slot before it stops leading. This must be less than or equal to the lease duration. The option only applies if leader election is enabled",
8296
)
8397

8498
// This input is sent to the controller manager for validation; no further check here.
8599
flags.DurationVar(
86100
&o.RetryPeriod.Duration,
87101
"leader-retry-period",
88-
2*time.Second,
102+
5*time.Second,
89103
"The duration the clients should wait between attempting acquisition and renewal of a leadership. The option only applies if leader election is enabled",
90104
)
91105

@@ -96,4 +110,74 @@ func (o *LeaderElectionOptions) AddFlags(flags *flag.FlagSet) {
96110
utils.FleetSystemNamespace,
97111
"The namespace of the resource object that will be used to lock during leader election cycles. The option only applies if leader election is enabled.",
98112
)
113+
114+
flags.Var(
115+
newLeaderElectionQPSValueWithValidation(250, &o.LeaderElectionQPS),
116+
"leader-election-qps",
117+
"The QPS limit set to the rate limiter of the Kubernetes client in use by the controller manager for leader election purposes. This sets up a separate client-side throttling mechanism specifically for lease related operations, mostly to avoid an adverse situation where normal operations in the controller manager starve the lease related operations, and thus cause unexpected leadership losses. Defaults to 250. Use a positive float64 value in the range [10.0, 1000.0], or set a less or equal to zero value to disable client-side throttling.")
118+
119+
flags.Var(
120+
newLeaderElectionBurstValueWithValidation(1000, &o.LeaderElectionBurst),
121+
"leader-election-burst",
122+
"The burst limit set to the rate limiter of the Kubernetes client in use by the controller manager for leader election purposes. This sets up a separate client-side throttling mechanism specifically for lease related operations, mostly to avoid an adverse situation where normal operations in the controller manager starve the lease related operations, and thus cause unexpected leadership losses. Defaults to 1000. Use a positive int value in the range [10, 2000].")
123+
}
124+
125+
// A list of flag variables that allow pluggable validation logic when parsing the input args.
126+
127+
type LeaderElectionQPSValueWithValidation float64
128+
129+
func (v *LeaderElectionQPSValueWithValidation) String() string {
130+
return fmt.Sprintf("%f", *v)
131+
}
132+
133+
func (v *LeaderElectionQPSValueWithValidation) Set(s string) error {
134+
// Some validation is also performed on the controller manager side and the client-go side. Just
135+
// to be on the safer side we also impose some limits here.
136+
qps, err := strconv.ParseFloat(s, 64)
137+
if err != nil {
138+
return fmt.Errorf("failed to parse float64 value: %w", err)
139+
}
140+
141+
if qps <= 0.0 {
142+
// Disable client-side throttling.
143+
*v = -1.0
144+
return nil
145+
}
146+
147+
if qps < 10.0 || qps > 1000.0 {
148+
return fmt.Errorf("QPS limit is set to an invalid value (%f), must be a value in the range [10.0, 1000.0]", qps)
149+
}
150+
*v = LeaderElectionQPSValueWithValidation(qps)
151+
return nil
152+
}
153+
154+
func newLeaderElectionQPSValueWithValidation(defaultVal float64, p *float64) *LeaderElectionQPSValueWithValidation {
155+
*p = defaultVal
156+
return (*LeaderElectionQPSValueWithValidation)(p)
157+
}
158+
159+
type LeaderElectionBurstValueWithValidation int
160+
161+
func (v *LeaderElectionBurstValueWithValidation) String() string {
162+
return fmt.Sprintf("%d", *v)
163+
}
164+
165+
func (v *LeaderElectionBurstValueWithValidation) Set(s string) error {
166+
// Some validation is also performed on the controller manager side and the client-go side. Just
167+
// to be on the safer side we also impose some limits here.
168+
burst, err := strconv.Atoi(s)
169+
if err != nil {
170+
return fmt.Errorf("failed to parse int value: %w", err)
171+
}
172+
173+
if burst < 10 || burst > 2000 {
174+
return fmt.Errorf("burst limit is set to an invalid value (%d), must be a value in the range [10, 2000]", burst)
175+
}
176+
*v = LeaderElectionBurstValueWithValidation(burst)
177+
return nil
178+
}
179+
180+
func newLeaderElectionBurstValueWithValidation(defaultVal int, p *int) *LeaderElectionBurstValueWithValidation {
181+
*p = defaultVal
182+
return (*LeaderElectionBurstValueWithValidation)(p)
99183
}

cmd/hubagent/options/options_test.go

Lines changed: 72 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,13 @@ func TestLeaderElectionOpts(t *testing.T) {
4343
flagSetName: "allDefault",
4444
args: []string{},
4545
wantLeaderElectionOpts: LeaderElectionOptions{
46-
LeaderElect: false,
47-
LeaseDuration: metav1.Duration{Duration: 15 * time.Second},
48-
RenewDeadline: metav1.Duration{Duration: 10 * time.Second},
49-
RetryPeriod: metav1.Duration{Duration: 2 * time.Second},
50-
ResourceNamespace: utils.FleetSystemNamespace,
46+
LeaderElect: false,
47+
LeaseDuration: metav1.Duration{Duration: 60 * time.Second},
48+
RenewDeadline: metav1.Duration{Duration: 45 * time.Second},
49+
RetryPeriod: metav1.Duration{Duration: 5 * time.Second},
50+
ResourceNamespace: utils.FleetSystemNamespace,
51+
LeaderElectionQPS: 250.0,
52+
LeaderElectionBurst: 1000,
5153
},
5254
},
5355
{
@@ -59,15 +61,75 @@ func TestLeaderElectionOpts(t *testing.T) {
5961
"--leader-renew-deadline=20s",
6062
"--leader-retry-period=5s",
6163
"--leader-election-namespace=test-namespace",
64+
"--leader-election-qps=500",
65+
"--leader-election-burst=1500",
6266
},
6367
wantLeaderElectionOpts: LeaderElectionOptions{
64-
LeaderElect: true,
65-
LeaseDuration: metav1.Duration{Duration: 30 * time.Second},
66-
RenewDeadline: metav1.Duration{Duration: 20 * time.Second},
67-
RetryPeriod: metav1.Duration{Duration: 5 * time.Second},
68-
ResourceNamespace: "test-namespace",
68+
LeaderElect: true,
69+
LeaseDuration: metav1.Duration{Duration: 30 * time.Second},
70+
RenewDeadline: metav1.Duration{Duration: 20 * time.Second},
71+
RetryPeriod: metav1.Duration{Duration: 5 * time.Second},
72+
ResourceNamespace: "test-namespace",
73+
LeaderElectionQPS: 500.0,
74+
LeaderElectionBurst: 1500,
6975
},
7076
},
77+
{
78+
name: "negative leader election QPS value",
79+
flagSetName: "qpsNegative",
80+
args: []string{"--leader-election-qps=-5"},
81+
wantLeaderElectionOpts: LeaderElectionOptions{
82+
LeaderElect: false,
83+
LeaseDuration: metav1.Duration{Duration: 60 * time.Second},
84+
RenewDeadline: metav1.Duration{Duration: 45 * time.Second},
85+
RetryPeriod: metav1.Duration{Duration: 5 * time.Second},
86+
ResourceNamespace: utils.FleetSystemNamespace,
87+
LeaderElectionQPS: -1,
88+
LeaderElectionBurst: 1000,
89+
},
90+
},
91+
{
92+
name: "leader election QPS parse error",
93+
flagSetName: "qpsParseError",
94+
args: []string{"--leader-election-qps=abc"},
95+
wantErred: true,
96+
wantErrMsgSubStr: "failed to parse float64 value",
97+
},
98+
{
99+
name: "leader election QPS out of range (too small)",
100+
flagSetName: "qpsOutOfRangeTooSmall",
101+
args: []string{"--leader-election-qps=9.9"},
102+
wantErred: true,
103+
wantErrMsgSubStr: "QPS limit is set to an invalid value",
104+
},
105+
{
106+
name: "leader election QPS out of range (too large)",
107+
flagSetName: "qpsOutOfRangeTooLarge",
108+
args: []string{"--leader-election-qps=1000.1"},
109+
wantErred: true,
110+
wantErrMsgSubStr: "QPS limit is set to an invalid value",
111+
},
112+
{
113+
name: "leader election burst parse error",
114+
flagSetName: "burstParseError",
115+
args: []string{"--leader-election-burst=abc"},
116+
wantErred: true,
117+
wantErrMsgSubStr: "failed to parse int value",
118+
},
119+
{
120+
name: "leader election burst out of range (too small)",
121+
flagSetName: "burstOutOfRangeTooSmall",
122+
args: []string{"--leader-election-burst=9"},
123+
wantErred: true,
124+
wantErrMsgSubStr: "burst limit is set to an invalid value",
125+
},
126+
{
127+
name: "leader election burst out of range (too large)",
128+
flagSetName: "burstOutOfRangeTooLarge",
129+
args: []string{"--leader-election-burst=2001"},
130+
wantErred: true,
131+
wantErrMsgSubStr: "burst limit is set to an invalid value",
132+
},
71133
}
72134

73135
for _, tc := range testCases {

cmd/hubagent/options/validation.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,11 @@ func (o *Options) Validate() field.ErrorList {
3333
errs = append(errs, field.Invalid(newPath.Child("HubBurst"), o.CtrlMgrOpts.HubBurst, "The burst limit for client-side throttling must be greater than or equal to its QPS limit"))
3434
}
3535

36+
// Cross-field validation for leader election options.
37+
if float64(o.LeaderElectionOpts.LeaderElectionBurst) < float64(o.LeaderElectionOpts.LeaderElectionQPS) {
38+
errs = append(errs, field.Invalid(newPath.Child("LeaderElectionBurst"), o.LeaderElectionOpts.LeaderElectionBurst, "The burst limit for client-side throttling of leader election related operations must be greater than or equal to its QPS limit"))
39+
}
40+
3641
// Cross-field validation for webhook options.
3742

3843
// Note: this validation logic is a bit weird in the sense that the system accepts

cmd/hubagent/options/validation_test.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ func newTestOptions(modifyOptions ModifyOptions) Options {
4040
HubQPS: 250,
4141
HubBurst: 1000,
4242
},
43+
LeaderElectionOpts: LeaderElectionOptions{
44+
LeaderElectionQPS: 250.0,
45+
LeaderElectionBurst: 1000,
46+
},
4347
WebhookOpts: WebhookOptions{
4448
ClientConnectionType: "url",
4549
ServiceName: testWebhookServiceName,
@@ -78,6 +82,13 @@ func TestValidateControllerManagerConfiguration(t *testing.T) {
7882
}),
7983
want: field.ErrorList{field.Invalid(newPath.Child("HubBurst"), 50, "The burst limit for client-side throttling must be greater than or equal to its QPS limit")},
8084
},
85+
"invalid: leader election burst value is less than its QPS value": {
86+
opt: newTestOptions(func(option *Options) {
87+
option.LeaderElectionOpts.LeaderElectionQPS = 100
88+
option.LeaderElectionOpts.LeaderElectionBurst = 50
89+
}),
90+
want: field.ErrorList{field.Invalid(newPath.Child("LeaderElectionBurst"), 50, "The burst limit for client-side throttling of leader election related operations must be greater than or equal to its QPS limit")},
91+
},
8192
"WebhookServiceName is empty": {
8293
opt: newTestOptions(func(option *Options) {
8394
option.WebhookOpts.EnableWebhooks = true

pkg/controllers/updaterun/initialization.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -623,9 +623,8 @@ func (r *Reconciler) getResourceSnapshotObjs(ctx context.Context, placement plac
623623
}
624624

625625
if latestResourceSnapshot == nil {
626-
err := fmt.Errorf("no resource snapshot created for placement `%s`", placementKey)
627-
klog.ErrorS(err, "Failed to create resource snapshot", "placement", placementKey, "updateRun", updateRunRef)
628-
return nil, err
626+
err := fmt.Errorf("no resource snapshot created for placement `%s` but there is no error returned by getOrCreate", placementKey)
627+
return nil, controller.NewUnexpectedBehaviorError(err)
629628
}
630629

631630
// Return the master snapshot directly rather than listing from the cache, because

0 commit comments

Comments
 (0)