Skip to content

Commit 3ec13e6

Browse files
authored
feat: Horizontal scaling support to the Feast operator (#6000)
* feat: Horizontal scaling support to the Feast operator Signed-off-by: ntkathole <nikhilkathole2683@gmail.com> * docs: Added blog post on horizontal scaling Signed-off-by: ntkathole <nikhilkathole2683@gmail.com> * fix: Address comments Signed-off-by: ntkathole <nikhilkathole2683@gmail.com> * fix: Server side apply Signed-off-by: ntkathole <nikhilkathole2683@gmail.com> * fix: Use autoscalingv2.HorizontalPodAutoscaler Signed-off-by: ntkathole <nikhilkathole2683@gmail.com> --------- Signed-off-by: ntkathole <nikhilkathole2683@gmail.com>
1 parent bc1d84f commit 3ec13e6

File tree

18 files changed

+4270
-40
lines changed

18 files changed

+4270
-40
lines changed

.secrets.baseline

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,10 @@
9090
{
9191
"path": "detect_secrets.filters.allowlist.is_line_allowlisted"
9292
},
93+
{
94+
"path": "detect_secrets.filters.common.is_baseline_file",
95+
"filename": ".secrets.baseline"
96+
},
9397
{
9498
"path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies",
9599
"min_level": 2
@@ -930,7 +934,7 @@
930934
"filename": "infra/feast-operator/api/v1/featurestore_types.go",
931935
"hashed_secret": "44e17306b837162269a410204daaa5ecee4ec22c",
932936
"is_verified": false,
933-
"line_number": 657
937+
"line_number": 696
934938
}
935939
],
936940
"infra/feast-operator/api/v1/zz_generated.deepcopy.go": [
@@ -939,21 +943,21 @@
939943
"filename": "infra/feast-operator/api/v1/zz_generated.deepcopy.go",
940944
"hashed_secret": "f914fc9324de1bec1ad13dec94a8ea2ddb41fc87",
941945
"is_verified": false,
942-
"line_number": 615
946+
"line_number": 663
943947
},
944948
{
945949
"type": "Secret Keyword",
946950
"filename": "infra/feast-operator/api/v1/zz_generated.deepcopy.go",
947951
"hashed_secret": "44e17306b837162269a410204daaa5ecee4ec22c",
948952
"is_verified": false,
949-
"line_number": 1123
953+
"line_number": 1206
950954
},
951955
{
952956
"type": "Secret Keyword",
953957
"filename": "infra/feast-operator/api/v1/zz_generated.deepcopy.go",
954958
"hashed_secret": "c2028031c154bbe86fd69bef740855c74b927dcf",
955959
"is_verified": false,
956-
"line_number": 1128
960+
"line_number": 1211
957961
}
958962
],
959963
"infra/feast-operator/api/v1alpha1/featurestore_types.go": [
@@ -1152,7 +1156,7 @@
11521156
"filename": "infra/feast-operator/internal/controller/services/services.go",
11531157
"hashed_secret": "36dc326eb15c7bdd8d91a6b87905bcea20b637d1",
11541158
"is_verified": false,
1155-
"line_number": 164
1159+
"line_number": 173
11561160
}
11571161
],
11581162
"infra/feast-operator/internal/controller/services/tls_test.go": [
@@ -1535,5 +1539,5 @@
15351539
}
15361540
]
15371541
},
1538-
"generated_at": "2026-02-19T06:53:49Z"
1542+
"generated_at": "2026-02-26T14:08:35Z"
15391543
}

docs/how-to-guides/feast-on-kubernetes.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,9 @@ spec:
6565
> _More advanced FeatureStore CR examples can be found in the feast-operator [samples directory](../../infra/feast-operator/config/samples)._
6666
6767
{% hint style="success" %}
68-
Important note: Scaling a Feature Store Deployment should only be done if the configured data store(s) will support it.
68+
**Scaling:** The Feast Operator supports horizontal scaling via static replicas, HPA autoscaling, or external autoscalers like [KEDA](https://keda.sh). Scaling requires DB-backed persistence for all enabled services.
6969
70-
Please check the how-to guide for some specific recommendations on [how to scale Feast](./scaling-feast.md).
70+
See the [Horizontal Scaling with the Feast Operator](./scaling-feast.md#horizontal-scaling-with-the-feast-operator) guide for configuration details, or check the general recommendations on [how to scale Feast](./scaling-feast.md).
7171
{% endhint %}
72+
73+
> _Sample scaling CRs are available at [`v1_featurestore_scaling_static.yaml`](../../infra/feast-operator/config/samples/v1_featurestore_scaling_static.yaml) and [`v1_featurestore_scaling_hpa.yaml`](../../infra/feast-operator/config/samples/v1_featurestore_scaling_hpa.yaml)._

docs/how-to-guides/scaling-feast.md

Lines changed: 155 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,158 @@ However, this process does not scale for large data sets, since it's executed on
2323
Feast supports pluggable [Compute Engines](../getting-started/components/compute-engine.md), that allow the materialization process to be scaled up.
2424
Aside from the local process, Feast supports a [Lambda-based materialization engine](https://rtd.feast.dev/en/master/#alpha-lambda-based-engine), and a [Bytewax-based materialization engine](https://rtd.feast.dev/en/master/#bytewax-engine).
2525

26-
Users may also be able to build an engine to scale up materialization using existing infrastructure in their organizations.
26+
Users may also be able to build an engine to scale up materialization using existing infrastructure in their organizations.
27+
28+
### Horizontal Scaling with the Feast Operator
29+
30+
When running Feast on Kubernetes with the [Feast Operator](./feast-on-kubernetes.md), you can horizontally scale the FeatureStore deployment using `spec.replicas` or HPA autoscaling. The FeatureStore CRD implements the Kubernetes [scale sub-resource](https://kubernetes.io/docs/tasks/extend-kubernetes/custom-resources/custom-resource-definitions/#scale-subresource), so you can also use `kubectl scale`:
31+
32+
```bash
33+
kubectl scale featurestore/my-feast --replicas=3
34+
```
35+
36+
**Prerequisites:** Horizontal scaling requires **DB-backed persistence** for all enabled services (online store, offline store, and registry). File-based persistence (SQLite, DuckDB, `registry.db`) is incompatible with multiple replicas because these backends do not support concurrent access from multiple pods.
37+
38+
#### Static Replicas
39+
40+
Set a fixed number of replicas via `spec.replicas`:
41+
42+
```yaml
43+
apiVersion: feast.dev/v1
44+
kind: FeatureStore
45+
metadata:
46+
name: sample-scaling
47+
spec:
48+
feastProject: my_project
49+
replicas: 3
50+
services:
51+
onlineStore:
52+
persistence:
53+
store:
54+
type: postgres
55+
secretRef:
56+
name: feast-data-stores
57+
registry:
58+
local:
59+
persistence:
60+
store:
61+
type: sql
62+
secretRef:
63+
name: feast-data-stores
64+
```
65+
66+
#### Autoscaling with HPA
67+
68+
Configure a HorizontalPodAutoscaler to dynamically scale based on metrics. HPA autoscaling is configured under `services.scaling.autoscaling` and is mutually exclusive with `spec.replicas > 1`:
69+
70+
```yaml
71+
apiVersion: feast.dev/v1
72+
kind: FeatureStore
73+
metadata:
74+
name: sample-autoscaling
75+
spec:
76+
feastProject: my_project
77+
services:
78+
scaling:
79+
autoscaling:
80+
minReplicas: 2
81+
maxReplicas: 10
82+
metrics:
83+
- type: Resource
84+
resource:
85+
name: cpu
86+
target:
87+
type: Utilization
88+
averageUtilization: 70
89+
onlineStore:
90+
persistence:
91+
store:
92+
type: postgres
93+
secretRef:
94+
name: feast-data-stores
95+
server:
96+
resources:
97+
requests:
98+
cpu: 200m
99+
memory: 256Mi
100+
registry:
101+
local:
102+
persistence:
103+
store:
104+
type: sql
105+
secretRef:
106+
name: feast-data-stores
107+
```
108+
109+
{% hint style="info" %}
110+
When autoscaling is configured, the operator automatically sets the deployment strategy to `RollingUpdate` (instead of the default `Recreate`) to ensure zero-downtime scaling. You can override this by explicitly setting `deploymentStrategy` in the CR.
111+
{% endhint %}
112+
113+
#### Validation Rules
114+
115+
The operator enforces the following rules:
116+
- `spec.replicas > 1` and `services.scaling.autoscaling` are **mutually exclusive** -- you cannot set both.
117+
- Scaling with `replicas > 1` or any `autoscaling` config is **rejected** if any enabled service uses file-based persistence.
118+
- S3 (`s3://`) and GCS (`gs://`) backed registry file persistence is allowed with scaling, since these object stores support concurrent readers.
119+
120+
#### Using KEDA (Kubernetes Event-Driven Autoscaling)
121+
122+
[KEDA](https://keda.sh) is also supported as an external autoscaler. KEDA should target the FeatureStore's scale sub-resource directly (since the FeatureStore CRD implements the Kubernetes scale API). This is the recommended approach because the operator manages the Deployment's replica count from `spec.replicas` — targeting the Deployment directly would conflict with the operator's reconciliation.
123+
124+
When using KEDA, do **not** set `services.scaling.autoscaling` or `spec.replicas > 1` -- KEDA manages the replica count through the scale sub-resource.
125+
126+
1. **Ensure DB-backed persistence** -- Configure DB-backed persistence for all enabled services before scaling. The CRD's CEL validation rules reject any update that scales `spec.replicas` above 1 (including updates made by KEDA through the scale sub-resource) while file-based persistence is in use. The operator also automatically switches the deployment strategy to `RollingUpdate` when `replicas > 1`.
127+
128+
2. **Configure the FeatureStore** with DB-backed persistence:
129+
130+
```yaml
131+
apiVersion: feast.dev/v1
132+
kind: FeatureStore
133+
metadata:
134+
name: sample-keda
135+
spec:
136+
feastProject: my_project
137+
services:
138+
onlineStore:
139+
persistence:
140+
store:
141+
type: postgres
142+
secretRef:
143+
name: feast-data-stores
144+
registry:
145+
local:
146+
persistence:
147+
store:
148+
type: sql
149+
secretRef:
150+
name: feast-data-stores
151+
```
152+
153+
3. **Create a KEDA `ScaledObject`** targeting the FeatureStore resource:
154+
155+
```yaml
156+
apiVersion: keda.sh/v1alpha1
157+
kind: ScaledObject
158+
metadata:
159+
name: feast-scaledobject
160+
spec:
161+
scaleTargetRef:
162+
apiVersion: feast.dev/v1
163+
kind: FeatureStore
164+
name: sample-keda
165+
minReplicaCount: 1
166+
maxReplicaCount: 10
167+
triggers:
168+
- type: prometheus
169+
metadata:
170+
serverAddress: http://prometheus.monitoring.svc:9090
171+
metricName: http_requests_total
172+
query: sum(rate(http_requests_total{service="feast"}[2m]))
173+
threshold: "100"
174+
```
175+
176+
{% hint style="warning" %}
177+
KEDA-created HPAs are not owned by the Feast operator. The operator will not interfere with them, but it also will not clean them up if the FeatureStore CR is deleted. You must manage the KEDA `ScaledObject` lifecycle independently.
178+
{% endhint %}
179+
180+
For the full API reference, see the [FeatureStore CRD reference](../../infra/feast-operator/docs/api/markdown/ref.md).

infra/feast-operator/api/v1/featurestore_types.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package v1
1818

1919
import (
2020
appsv1 "k8s.io/api/apps/v1"
21+
autoscalingv2 "k8s.io/api/autoscaling/v2"
2122
batchv1 "k8s.io/api/batch/v1"
2223
corev1 "k8s.io/api/core/v1"
2324
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -67,6 +68,10 @@ const (
6768
)
6869

6970
// FeatureStoreSpec defines the desired state of FeatureStore
71+
// +kubebuilder:validation:XValidation:rule="self.replicas <= 1 || !has(self.services) || !has(self.services.scaling) || !has(self.services.scaling.autoscaling)",message="replicas > 1 and services.scaling.autoscaling are mutually exclusive."
72+
// +kubebuilder:validation:XValidation:rule="self.replicas <= 1 && (!has(self.services) || !has(self.services.scaling) || !has(self.services.scaling.autoscaling)) || (has(self.services) && has(self.services.onlineStore) && has(self.services.onlineStore.persistence) && has(self.services.onlineStore.persistence.store))",message="Scaling requires DB-backed persistence for the online store. Configure services.onlineStore.persistence.store when using replicas > 1 or autoscaling."
73+
// +kubebuilder:validation:XValidation:rule="self.replicas <= 1 && (!has(self.services) || !has(self.services.scaling) || !has(self.services.scaling.autoscaling)) || (!has(self.services) || !has(self.services.offlineStore) || (has(self.services.offlineStore.persistence) && has(self.services.offlineStore.persistence.store)))",message="Scaling requires DB-backed persistence for the offline store. Configure services.offlineStore.persistence.store when using replicas > 1 or autoscaling."
74+
// +kubebuilder:validation:XValidation:rule="self.replicas <= 1 && (!has(self.services) || !has(self.services.scaling) || !has(self.services.scaling.autoscaling)) || (has(self.services) && has(self.services.registry) && (has(self.services.registry.remote) || (has(self.services.registry.local) && has(self.services.registry.local.persistence) && (has(self.services.registry.local.persistence.store) || (has(self.services.registry.local.persistence.file) && has(self.services.registry.local.persistence.file.path) && (self.services.registry.local.persistence.file.path.startsWith('s3://') || self.services.registry.local.persistence.file.path.startsWith('gs://')))))))",message="Scaling requires DB-backed or remote registry. Configure registry.local.persistence.store or use a remote registry when using replicas > 1 or autoscaling. S3/GCS-backed registry is also allowed."
7075
type FeatureStoreSpec struct {
7176
// +kubebuilder:validation:Pattern="^[A-Za-z0-9][A-Za-z0-9_-]*$"
7277
// FeastProject is the Feast project id. This can be any alphanumeric string with underscores and hyphens, but it cannot start with an underscore or hyphen. Required.
@@ -76,6 +81,11 @@ type FeatureStoreSpec struct {
7681
AuthzConfig *AuthzConfig `json:"authz,omitempty"`
7782
CronJob *FeastCronJob `json:"cronJob,omitempty"`
7883
BatchEngine *BatchEngineConfig `json:"batchEngine,omitempty"`
84+
// Replicas is the desired number of pod replicas. Used by the scale sub-resource.
85+
// Mutually exclusive with services.scaling.autoscaling.
86+
// +kubebuilder:default=1
87+
// +kubebuilder:validation:Minimum=1
88+
Replicas *int32 `json:"replicas"`
7989
}
8090

8191
// FeastProjectDir defines how to create the feast project directory.
@@ -301,6 +311,35 @@ type FeatureStoreServices struct {
301311
DisableInitContainers bool `json:"disableInitContainers,omitempty"`
302312
// Volumes specifies the volumes to mount in the FeatureStore deployment. A corresponding `VolumeMount` should be added to whichever feast service(s) require access to said volume(s).
303313
Volumes []corev1.Volume `json:"volumes,omitempty"`
314+
// Scaling configures horizontal scaling for the FeatureStore deployment (e.g. HPA autoscaling).
315+
// For static replicas, use spec.replicas instead.
316+
Scaling *ScalingConfig `json:"scaling,omitempty"`
317+
}
318+
319+
// ScalingConfig configures horizontal scaling for the FeatureStore deployment.
320+
type ScalingConfig struct {
321+
// Autoscaling configures a HorizontalPodAutoscaler for the FeatureStore deployment.
322+
// Mutually exclusive with spec.replicas.
323+
// +optional
324+
Autoscaling *AutoscalingConfig `json:"autoscaling,omitempty"`
325+
}
326+
327+
// AutoscalingConfig defines HPA settings for the FeatureStore deployment.
328+
type AutoscalingConfig struct {
329+
// MinReplicas is the lower limit for the number of replicas. Defaults to 1.
330+
// +kubebuilder:validation:Minimum=1
331+
// +optional
332+
MinReplicas *int32 `json:"minReplicas,omitempty"`
333+
// MaxReplicas is the upper limit for the number of replicas. Required.
334+
// +kubebuilder:validation:Minimum=1
335+
MaxReplicas int32 `json:"maxReplicas"`
336+
// Metrics contains the specifications for which to use to calculate the desired replica count.
337+
// If not set, defaults to 80% CPU utilization.
338+
// +optional
339+
Metrics []autoscalingv2.MetricSpec `json:"metrics,omitempty"`
340+
// Behavior configures the scaling behavior of the target.
341+
// +optional
342+
Behavior *autoscalingv2.HorizontalPodAutoscalerBehavior `json:"behavior,omitempty"`
304343
}
305344

306345
// OfflineStore configures the offline store service
@@ -690,6 +729,20 @@ type FeatureStoreStatus struct {
690729
FeastVersion string `json:"feastVersion,omitempty"`
691730
Phase string `json:"phase,omitempty"`
692731
ServiceHostnames ServiceHostnames `json:"serviceHostnames,omitempty"`
732+
// Replicas is the current number of ready pod replicas (used by the scale sub-resource).
733+
Replicas int32 `json:"replicas,omitempty"`
734+
// Selector is the label selector for pods managed by the FeatureStore deployment (used by the scale sub-resource).
735+
Selector string `json:"selector,omitempty"`
736+
// ScalingStatus reports the current scaling state of the FeatureStore deployment.
737+
ScalingStatus *ScalingStatus `json:"scalingStatus,omitempty"`
738+
}
739+
740+
// ScalingStatus reports the observed scaling state.
741+
type ScalingStatus struct {
742+
// CurrentReplicas is the current number of pod replicas.
743+
CurrentReplicas int32 `json:"currentReplicas,omitempty"`
744+
// DesiredReplicas is the desired number of pod replicas.
745+
DesiredReplicas int32 `json:"desiredReplicas,omitempty"`
693746
}
694747

695748
// ServiceHostnames defines the service hostnames in the format of <domain>:<port>, e.g. example.svc.cluster.local:80
@@ -706,6 +759,7 @@ type ServiceHostnames struct {
706759
// +kubebuilder:resource:shortName=feast
707760
// +kubebuilder:printcolumn:name="Status",type=string,JSONPath=`.status.phase`
708761
// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp`
762+
// +kubebuilder:subresource:scale:specpath=.spec.replicas,statuspath=.status.replicas,selectorpath=.status.selector
709763
// +kubebuilder:storageversion
710764

711765
// FeatureStore is the Schema for the featurestores API

0 commit comments

Comments
 (0)