Skip to content

Commit 353780c

Browse files
committed
Make deployments HA-ready with configurable replica count
1 parent aaffdeb commit 353780c

13 files changed

+194
-81
lines changed

Makefile

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ MANIFESTS ?= $(STANDARD_MANIFEST) $(STANDARD_E2E_MANIFEST) $(EXPERIMENTAL_MANIFE
161161
$(STANDARD_MANIFEST) ?= helm/cert-manager.yaml
162162
$(STANDARD_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/e2e.yaml
163163
$(EXPERIMENTAL_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml
164-
$(EXPERIMENTAL_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml helm/e2e.yaml
164+
$(EXPERIMENTAL_E2E_MANIFEST) ?= helm/cert-manager.yaml helm/experimental.yaml helm/e2e.yaml helm/high-availability.yaml
165165
HELM_SETTINGS ?=
166166
.PHONY: $(MANIFESTS)
167167
$(MANIFESTS): $(HELM)
@@ -218,7 +218,7 @@ test: manifests generate fmt lint test-unit test-e2e test-regression #HELP Run a
218218

219219
.PHONY: e2e
220220
e2e: #EXHELP Run the e2e tests.
221-
go test -count=1 -v ./test/e2e/features_test.go
221+
go test -count=1 -v -timeout=20m ./test/e2e/...
222222

223223
E2E_REGISTRY_NAME := docker-registry
224224
E2E_REGISTRY_NAMESPACE := operator-controller-e2e
@@ -489,8 +489,8 @@ run-experimental: run-internal #HELP Build the operator-controller then deploy i
489489
CATD_NAMESPACE := olmv1-system
490490
.PHONY: wait
491491
wait:
492-
kubectl wait --for=condition=Available --namespace=$(CATD_NAMESPACE) deployment/catalogd-controller-manager --timeout=60s
493-
kubectl wait --for=condition=Ready --namespace=$(CATD_NAMESPACE) certificate/catalogd-service-cert # Avoid upgrade test flakes when reissuing cert
492+
kubectl wait --for=condition=Available --namespace=$(CATD_NAMESPACE) deployment/catalogd-controller-manager --timeout=3m
493+
kubectl wait --for=condition=Ready --namespace=$(CATD_NAMESPACE) certificate/catalogd-service-cert --timeout=3m # Avoid upgrade test flakes when reissuing cert
494494

495495
.PHONY: docker-build
496496
docker-build: build-linux #EXHELP Build docker image for operator-controller and catalog with GOOS=linux and local GOARCH.

hack/test/pre-upgrade-setup.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -155,5 +155,5 @@ spec:
155155
version: 1.0.0
156156
EOF
157157

158-
kubectl wait --for=condition=Serving --timeout=60s ClusterCatalog $TEST_CLUSTER_CATALOG_NAME
159-
kubectl wait --for=condition=Installed --timeout=60s ClusterExtension $TEST_CLUSTER_EXTENSION_NAME
158+
kubectl wait --for=condition=Serving --timeout=5m ClusterCatalog $TEST_CLUSTER_CATALOG_NAME
159+
kubectl wait --for=condition=Installed --timeout=5m ClusterExtension $TEST_CLUSTER_EXTENSION_NAME

helm/high-availability.yaml

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# High Availability (HA) configuration for OLMv1
2+
# Sets replicas to 2 for both operator-controller and catalogd to enable an HA setup
3+
# This values file is included when generating the experimental-e2e manifest to test multi-replica deployments
4+
#
5+
# Pod anti-affinity is configured as "preferred" (not "required") to ensure:
6+
# - In multi-node clusters: the scheduler prefers placing replicas on different nodes for better availability
7+
# - In single-node clusters (like kind): both replicas can still be scheduled on the same node
8+
options:
9+
operatorController:
10+
deployment:
11+
replicas: 2
12+
catalogd:
13+
deployment:
14+
replicas: 2
15+
16+
# Pod anti-affinity configuration to prefer spreading operator-controller and catalogd pods across different nodes
17+
# Uses preferredDuringSchedulingIgnoredDuringExecution (soft constraint) to allow
18+
# scheduling on the same node when necessary (e.g., single-node kind clusters for e2e tests)
19+
deployments:
20+
templateSpec:
21+
affinity:
22+
podAntiAffinity:
23+
preferredDuringSchedulingIgnoredDuringExecution:
24+
- weight: 100
25+
podAffinityTerm:
26+
labelSelector:
27+
matchExpressions:
28+
- key: control-plane
29+
operator: In
30+
values:
31+
- operator-controller-controller-manager
32+
- catalogd-controller-manager
33+
topologyKey: kubernetes.io/hostname

helm/olmv1/templates/deployment-olmv1-system-catalogd-controller-manager.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,11 @@ metadata:
1212
namespace: {{ .Values.namespaces.olmv1.name }}
1313
spec:
1414
minReadySeconds: 5
15-
replicas: 1
15+
replicas: {{ .Values.options.catalogd.deployment.replicas }}
1616
strategy:
1717
type: RollingUpdate
1818
rollingUpdate:
19-
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
19+
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
2020
maxUnavailable: 0 # Never allow pods to be unavailable during updates
2121
selector:
2222
matchLabels:

helm/olmv1/templates/deployment-olmv1-system-operator-controller-controller-manager.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@ metadata:
1111
name: operator-controller-controller-manager
1212
namespace: {{ .Values.namespaces.olmv1.name }}
1313
spec:
14-
replicas: 1
14+
replicas: {{ .Values.options.operatorController.deployment.replicas }}
1515
strategy:
1616
type: RollingUpdate
1717
rollingUpdate:
18-
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
18+
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
1919
maxUnavailable: 0 # Never allow pods to be unavailable during updates
2020
selector:
2121
matchLabels:

helm/olmv1/values.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ options:
88
enabled: true
99
deployment:
1010
image: quay.io/operator-framework/operator-controller:devel
11+
replicas: 1
1112
extraArguments: []
1213
features:
1314
enabled: []
@@ -19,6 +20,7 @@ options:
1920
enabled: true
2021
deployment:
2122
image: quay.io/operator-framework/catalogd:devel
23+
replicas: 1
2224
extraArguments: []
2325
features:
2426
enabled: []

manifests/experimental-e2e.yaml

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2198,11 +2198,11 @@ metadata:
21982198
namespace: olmv1-system
21992199
spec:
22002200
minReadySeconds: 5
2201-
replicas: 1
2201+
replicas: 2
22022202
strategy:
22032203
type: RollingUpdate
22042204
rollingUpdate:
2205-
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
2205+
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
22062206
maxUnavailable: 0 # Never allow pods to be unavailable during updates
22072207
selector:
22082208
matchLabels:
@@ -2315,6 +2315,18 @@ spec:
23152315
operator: In
23162316
values:
23172317
- linux
2318+
podAntiAffinity:
2319+
preferredDuringSchedulingIgnoredDuringExecution:
2320+
- podAffinityTerm:
2321+
labelSelector:
2322+
matchExpressions:
2323+
- key: control-plane
2324+
operator: In
2325+
values:
2326+
- operator-controller-controller-manager
2327+
- catalogd-controller-manager
2328+
topologyKey: kubernetes.io/hostname
2329+
weight: 100
23182330
nodeSelector:
23192331
kubernetes.io/os: linux
23202332
node-role.kubernetes.io/control-plane: ""
@@ -2349,11 +2361,11 @@ metadata:
23492361
name: operator-controller-controller-manager
23502362
namespace: olmv1-system
23512363
spec:
2352-
replicas: 1
2364+
replicas: 2
23532365
strategy:
23542366
type: RollingUpdate
23552367
rollingUpdate:
2356-
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
2368+
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
23572369
maxUnavailable: 0 # Never allow pods to be unavailable during updates
23582370
selector:
23592371
matchLabels:
@@ -2474,6 +2486,18 @@ spec:
24742486
operator: In
24752487
values:
24762488
- linux
2489+
podAntiAffinity:
2490+
preferredDuringSchedulingIgnoredDuringExecution:
2491+
- podAffinityTerm:
2492+
labelSelector:
2493+
matchExpressions:
2494+
- key: control-plane
2495+
operator: In
2496+
values:
2497+
- operator-controller-controller-manager
2498+
- catalogd-controller-manager
2499+
topologyKey: kubernetes.io/hostname
2500+
weight: 100
24772501
nodeSelector:
24782502
kubernetes.io/os: linux
24792503
node-role.kubernetes.io/control-plane: ""

manifests/experimental.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2123,7 +2123,7 @@ spec:
21232123
strategy:
21242124
type: RollingUpdate
21252125
rollingUpdate:
2126-
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
2126+
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
21272127
maxUnavailable: 0 # Never allow pods to be unavailable during updates
21282128
selector:
21292129
matchLabels:
@@ -2261,7 +2261,7 @@ spec:
22612261
strategy:
22622262
type: RollingUpdate
22632263
rollingUpdate:
2264-
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
2264+
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
22652265
maxUnavailable: 0 # Never allow pods to be unavailable during updates
22662266
selector:
22672267
matchLabels:

manifests/standard-e2e.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1803,7 +1803,7 @@ spec:
18031803
strategy:
18041804
type: RollingUpdate
18051805
rollingUpdate:
1806-
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
1806+
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
18071807
maxUnavailable: 0 # Never allow pods to be unavailable during updates
18081808
selector:
18091809
matchLabels:
@@ -1953,7 +1953,7 @@ spec:
19531953
strategy:
19541954
type: RollingUpdate
19551955
rollingUpdate:
1956-
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
1956+
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
19571957
maxUnavailable: 0 # Never allow pods to be unavailable during updates
19581958
selector:
19591959
matchLabels:

manifests/standard.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1724,7 +1724,7 @@ spec:
17241724
strategy:
17251725
type: RollingUpdate
17261726
rollingUpdate:
1727-
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
1727+
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
17281728
maxUnavailable: 0 # Never allow pods to be unavailable during updates
17291729
selector:
17301730
matchLabels:
@@ -1861,7 +1861,7 @@ spec:
18611861
strategy:
18621862
type: RollingUpdate
18631863
rollingUpdate:
1864-
maxSurge: 1 # Allow temporary 2 pods (1 + 1) for zero-downtime updates
1864+
maxSurge: 1 # Allow temporary extra pod for zero-downtime updates
18651865
maxUnavailable: 0 # Never allow pods to be unavailable during updates
18661866
selector:
18671867
matchLabels:

0 commit comments

Comments
 (0)