Skip to content

Commit db077d2

Browse files
author
Eduard Agarici
committed
pr review
1 parent b004129 commit db077d2

File tree

9 files changed

+461
-226
lines changed

9 files changed

+461
-226
lines changed

api/v1beta1/common_types.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,16 @@ type VolumeState struct {
222222
CruiseControlOperationReference *corev1.LocalObjectReference `json:"cruiseControlOperationReference,omitempty"`
223223
}
224224

225+
// CacheResizeState tracks the resize lifecycle of a tiered storage cache PVC for a given mount path.
226+
type CacheResizeState string
227+
228+
const (
229+
// CacheResizePendingDeletion indicates that the old cache PVC at this mount path is waiting
230+
// to be deleted once the broker pod stops. The replacement PVC with the new desired size is
231+
// created at the same mount path right after this state has been recorded.
232+
CacheResizePendingDeletion CacheResizeState = "pending-deletion"
233+
)
234+
225235
// BrokerState holds information about broker state
226236
type BrokerState struct {
227237
// RackAwarenessState holds info about rack awareness status
@@ -240,6 +250,9 @@ type BrokerState struct {
240250
Image string `json:"image,omitempty"`
241251
// Compressed data from broker configuration to restore broker pod in specific cases
242252
ConfigurationBackup string `json:"configurationBackup,omitempty"`
253+
// CacheVolumeStates tracks in-flight tiered storage cache PVC resize operations, keyed by mount path.
254+
// An entry is present only while a resize is in progress; it is cleared once cleanup completes.
255+
CacheVolumeStates map[string]CacheResizeState `json:"cacheVolumeStates,omitempty"`
243256
}
244257

245258
const (

charts/kafka-operator/crds/kafkaclusters.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23825,6 +23825,16 @@ spec:
2382523825
additionalProperties:
2382623826
description: BrokerState holds information about broker state
2382723827
properties:
23828+
cacheVolumeStates:
23829+
additionalProperties:
23830+
description: CacheResizeState tracks the resize lifecycle of
23831+
a tiered storage cache PVC for a given mount path.
23832+
type: string
23833+
description: CacheVolumeStates tracks in-flight tiered storage
23834+
cache PVC resize operations, keyed by mount path. An entry
23835+
is present only while a resize is in progress; it is cleared
23836+
once cleanup completes.
23837+
type: object
2382823838
configurationBackup:
2382923839
description: Compressed data from broker configuration to restore
2383023840
broker pod in specific cases

config/base/crds/kafka.banzaicloud.io_kafkaclusters.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23825,6 +23825,16 @@ spec:
2382523825
additionalProperties:
2382623826
description: BrokerState holds information about broker state
2382723827
properties:
23828+
cacheVolumeStates:
23829+
additionalProperties:
23830+
description: CacheResizeState tracks the resize lifecycle of
23831+
a tiered storage cache PVC for a given mount path.
23832+
type: string
23833+
description: CacheVolumeStates tracks in-flight tiered storage
23834+
cache PVC resize operations, keyed by mount path. An entry
23835+
is present only while a resize is in progress; it is cleared
23836+
once cleanup completes.
23837+
type: object
2382823838
configurationBackup:
2382923839
description: Compressed data from broker configuration to restore
2383023840
broker pod in specific cases

docs/tiered-storage-pvc-resize.md

Lines changed: 56 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -7,70 +7,95 @@ with the rolling upgrade machinery so only one broker is affected at a time.
77

88
---
99

10-
## Annotations
10+
## State tracking
1111

12-
Two annotations are written on PVC objects to carry state across reconcile cycles.
13-
They survive reconciler restarts, making every step re-entrant.
12+
Resize state is stored in the `KafkaCluster` CR status under
13+
`status.brokersState[<brokerId>].cacheVolumeStates`, keyed by mount path.
14+
This keeps the KafkaCluster CR the single source of truth for all in-flight
15+
broker operations and avoids a second, parallel state store on PVC objects.
1416

15-
| Annotation | Value | Written on | Meaning |
16-
|------------|-------|------------|---------|
17-
| `koperator.adobe.com/cache-resize-state` | `pending-deletion` | Old PVC | Being replaced; excluded from pod spec; deleted once broker pod stops |
18-
| `koperator.adobe.com/cache-resize-state` | `replacement` | New PVC | Replacement PVC; rolling upgrade must complete before annotations are stripped |
19-
| `koperator.adobe.com/replaces-pvc` | `<old-pvc-name>` | New PVC | Traceability — records which PVC is being replaced |
17+
| Field | Value | Meaning |
18+
|-------|-------|---------|
19+
| `status.brokersState[N].cacheVolumeStates[<mountPath>]` | `pending-deletion` | A resize is in flight for this mount path. The old PVC (larger size) is waiting to be deleted once the broker pod stops; the replacement PVC (desired smaller size) has already been created. |
20+
21+
The entry is cleared as soon as the old PVC has been deleted, which happens while the
22+
broker pod is absent and before the new pod is created. An empty map means no resize is in progress.
23+
24+
Two PVC annotations that describe what a PVC **is** (not operational state) are
25+
always present on cache PVCs:
26+
27+
| Annotation | Value | Purpose |
28+
|------------|-------|---------|
29+
| `mountPath` | `<path>` | Used throughout reconcile logic to match PVCs to storage configs |
30+
| `tieredStorageCache` | `"true"` | Identifies cache PVCs for special handling: skipped from `log.dirs` and CC capacity config |
2031

2132
---
2233

2334
## Resize flow
2435

2536
### Cycle N — resize detected, pod running
2637

27-
1. The old PVC is annotated `pending-deletion`.
28-
2. A replacement PVC with the new (smaller) size is created and annotated `replacement`. Provisioning starts immediately.
29-
3. The broker's `ConfigurationState` is set to `ConfigOutOfSync` to trigger a rolling restart via `handleRollingUpgrade`.
30-
4. `handleRollingUpgrade` evaluates health gates (replica health, concurrent restart limit, rack awareness). If all pass the broker pod is deleted and the cycle requeues. If any gate fails the state is preserved in PVC annotations and retried next cycle.
38+
1. `status.brokersState[N].cacheVolumeStates[<mountPath>]` is set to `pending-deletion`
39+
in the KafkaCluster CR status. This is the durable record that a resize is in flight.
40+
2. A replacement PVC with the new (smaller) size is created. Provisioning starts immediately.
41+
3. The broker's `ConfigurationState` is set to `ConfigOutOfSync` to trigger a rolling restart
42+
via `handleRollingUpgrade`.
43+
4. `handleRollingUpgrade` evaluates health gates (replica health, concurrent restart limit,
44+
rack awareness). If all pass the broker pod is deleted and the cycle requeues. If any gate
45+
fails the state persists in the CR and is retried next cycle.
3146

3247
### Cycle N+1 — pod is absent
3348

34-
A pod is considered absent when it either does not exist or has a non-nil `DeletionTimestamp` (Terminating). Treating a Terminating pod as absent allows cleanup to start during the pod's Terminating window rather than waiting for it to fully disappear from etcd.
49+
A pod is considered absent when it either does not exist or has a non-nil
50+
`DeletionTimestamp` (Terminating). Treating a Terminating pod as absent allows
51+
cleanup to start during the pod's Terminating window rather than waiting for it
52+
to fully disappear from etcd.
3553

36-
1. The pending-deletion PVC is deleted.
37-
2. A new broker pod is created referencing the replacement PVC. Because provisioning started in cycle N the PVC is likely already `Bound`, minimising startup latency.
54+
1. The old PVC (the one whose size differs from the desired size at that mount path)
55+
is deleted.
56+
2. The `cacheVolumeStates` entry for that mount path is cleared from the CR status.
57+
3. A new broker pod is created referencing the replacement PVC. Because provisioning
58+
started in cycle N the PVC is likely already `Bound`, minimising startup latency.
3859

3960
### Cycle N+2 — pod is present again
4061

41-
The strip fires as soon as a non-Terminating pod exists for the broker and no pending-deletion PVC remains — the pod does not need to be fully Running.
42-
43-
1. No pending-deletion PVC remains and the replacement PVC exists → resize is complete.
44-
2. The `cache-resize-state` and `replaces-pvc` annotations are stripped from the replacement PVC, which becomes an ordinary PVC from this point forward.
62+
1. No `cacheVolumeStates` entry remains for the mount path → resize is complete.
63+
2. The replacement PVC is now an ordinary cache PVC with no special state attached.
4564

4665
---
4766

4867
## Grow vs shrink
4968

50-
A cache PVC **grow** takes the normal Kubernetes in-place expansion path: the PVC spec is updated with the larger size and Kubernetes expands the volume without a pod restart (requires `allowVolumeExpansion: true` on the StorageClass). No annotations are written and no rolling restart is triggered.
69+
A cache PVC **grow** takes the normal Kubernetes in-place expansion path: the PVC
70+
spec is updated with the larger size and Kubernetes expands the volume without a
71+
pod restart (requires `allowVolumeExpansion: true` on the StorageClass). No
72+
`cacheVolumeStates` entry is written and no rolling restart is triggered.
5173

52-
A cache PVC **shrink** uses the delete-and-recreate flow described above. Shrinking is only supported for tiered storage cache volumes — regular Kafka log volumes reject any size decrease.
74+
A cache PVC **shrink** uses the delete-and-recreate flow described above.
75+
Shrinking is supported only for tiered storage cache volumes — for regular Kafka log
76+
volumes, any size decrease is rejected with an error.
5377

5478
---
5579

5680
## Properties of this design
5781

5882
| Property | Value |
5983
|----------|-------|
60-
| State survives reconciler crash | Mostly — PVC annotations are durable in etcd; the one non-re-entrant window is between annotating the old PVC and creating the replacement, but `ConfigOutOfSync` set in that cycle persists in broker status so the rolling upgrade still proceeds |
61-
| Atomicity gap | Eliminated — new PVC is created before old is deleted |
62-
| Provisioning overlaps gate evaluation | Yes — new PVC created in cycle N, not N+1 |
63-
| Observable via kubectl | Yes — `kubectl get pvc -o yaml` shows resize state directly |
64-
| ConfigOutOfSync overloading | Reduced — `ConfigOutOfSync` still used, but the *reason* is legible in PVC annotations |
65-
| CC disk rebalance for cache PVCs | Fixed — tiered cache PVCs are explicitly excluded from `GracefulDiskRebalanceRequired` logic |
84+
| State survives reconciler crash | Yes — `cacheVolumeStates` is written to the KafkaCluster CR (etcd) before the replacement PVC is created; every step is re-entrant |
85+
| Single source of truth | Yes — all broker state (configuration, graceful actions, cache resize) lives in `status.brokersState` |
86+
| Atomicity gap | Eliminated — replacement PVC is created before old is deleted |
87+
| Provisioning overlaps gate evaluation | Yes — replacement PVC created in cycle N, not N+1 |
88+
| Observable via kubectl | Yes — `kubectl get kafkacluster <name> -o jsonpath='{.status.brokersState}'` shows resize state; an empty `cacheVolumeStates` means no resize is in progress |
89+
| CC disk rebalance for cache PVCs | Excluded — tiered cache PVCs are explicitly skipped from `GracefulDiskRebalanceRequired` and CC capacity config |
90+
| `log.dirs` for cache PVCs | Excluded — `generateStorageConfig` skips volumes with `TieredStorageCache: true` |
6691

6792
---
6893

6994
## Sequence diagram
7095

7196
```
7297
Cycle N (pod UP, resize detected)
73-
├─ annotate old PVC: pending-deletion
98+
├─ set cacheVolumeStates[mountPath] = pending-deletion in CR status
7499
├─ create replacement PVC (provisioning starts)
75100
├─ set ConfigOutOfSync
76101
└─ handleRollingUpgrade
@@ -81,9 +106,10 @@ Cycle N+k (pod UP, gates failing — any number of cycles)
81106
└─ ensure ConfigOutOfSync, requeue
82107
83108
Cycle N+k+1 (pod ABSENT — gone or Terminating)
84-
├─ delete pending-deletion PVC
109+
├─ delete old PVC (identified as the PVC at mountPath whose size ≠ desired)
110+
├─ clear cacheVolumeStates[mountPath] from CR status
85111
└─ create new pod bound to replacement PVC
86112
87113
Cycle N+k+2 (pod PRESENT — non-Terminating, not necessarily Running)
88-
└─ strip annotations → replacement PVC becomes ordinary PVC
114+
└─ cacheVolumeStates entry is absent → resize complete, no further action
89115
```

pkg/k8sutil/status.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,17 @@ func generateBrokerState(brokerIDs []string, cluster *banzaicloudv1beta1.KafkaCl
174174
case banzaicloudv1beta1.KafkaVersion:
175175
brokerState.Image = s.Image
176176
brokerState.Version = s.Version
177+
case map[string]banzaicloudv1beta1.CacheResizeState:
178+
if brokerState.CacheVolumeStates == nil {
179+
brokerState.CacheVolumeStates = make(map[string]banzaicloudv1beta1.CacheResizeState)
180+
}
181+
for mountPath, state := range s {
182+
if state == "" {
183+
delete(brokerState.CacheVolumeStates, mountPath)
184+
} else {
185+
brokerState.CacheVolumeStates[mountPath] = state
186+
}
187+
}
177188
}
178189
brokersState[brokerID] = brokerState
179190
}

pkg/resources/kafka/configmap.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -411,6 +411,10 @@ func getMountPathsFromBrokerConfigMap(configMap *corev1.ConfigMap) ([]string, er
411411
func generateStorageConfig(sConfig []v1beta1.StorageConfig) []string {
412412
mountPaths := make([]string, 0, len(sConfig))
413413
for _, storage := range sConfig {
414+
// Tiered storage cache volumes are not Kafka log dirs — exclude them from log.dirs.
415+
if storage.TieredStorageCache {
416+
continue
417+
}
414418
mountPaths = append(mountPaths, util.StorageConfigKafkaMountPath(storage.MountPath))
415419
}
416420
return mountPaths

0 commit comments

Comments
 (0)