Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 100 additions & 8 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,20 @@ KIND_CLUSTER_NAME := carbide-rest-local
KUSTOMIZE_OVERLAY := deploy/kustomize/overlays/local
LOCAL_DOCKERFILE_DIR := docker/local

# Set LOCAL_CORE=true to connect the site-agent to a locally-running carbide-api
# (started via dev/mac-local-dev/run-carbide-api.sh in bare-metal-manager-core) instead of mock-core.
# LOCAL_CORE_HOST must be reachable from inside the kind cluster nodes.
# On Docker Desktop and colima, host.docker.internal resolves to both IPv6 and
# IPv4. Go's dialer prefers IPv6, which is unreachable from Kind pods, so the
# Makefile recipe resolves it to IPv4 before patching the ConfigMap.
LOCAL_CORE ?=
LOCAL_CORE_HOST ?= host.docker.internal
LOCAL_CORE_PORT ?= 1079
# Certs from the local carbide-api TLS setup; mounted into the site-agent at
# /etc/carbide/{ca.crt, tls.crt, tls.key}. The site-agent's GetInitialCertMD5()
# reads all three regardless of CARBIDE_SEC_OPT, so they must all be present.
LOCAL_CORE_CERTS_DIR ?= ../ncx-infra-controller-core/dev/certs/localhost

# Recommended colima configuration for full stack with Temporal:
# colima start --cpu 8 --memory 8 --disk 100

Expand All @@ -284,7 +298,9 @@ docker-build-local:
docker build -t $(IMAGE_REGISTRY)/carbide-rest-workflow:$(IMAGE_TAG) -f $(LOCAL_DOCKERFILE_DIR)/Dockerfile.carbide-rest-workflow .
docker build -t $(IMAGE_REGISTRY)/carbide-rest-site-manager:$(IMAGE_TAG) -f $(LOCAL_DOCKERFILE_DIR)/Dockerfile.carbide-rest-site-manager .
docker build -t $(IMAGE_REGISTRY)/carbide-rest-site-agent:$(IMAGE_TAG) -f $(LOCAL_DOCKERFILE_DIR)/Dockerfile.carbide-rest-site-agent .
docker build -t $(IMAGE_REGISTRY)/carbide-rest-mock-core:$(IMAGE_TAG) -f $(LOCAL_DOCKERFILE_DIR)/Dockerfile.carbide-rest-mock-core .
@if [ "$(LOCAL_CORE)" != "true" ]; then \
docker build -t $(IMAGE_REGISTRY)/carbide-rest-mock-core:$(IMAGE_TAG) -f $(LOCAL_DOCKERFILE_DIR)/Dockerfile.carbide-rest-mock-core . ; \
fi
Comment thread
coderabbitai[bot] marked this conversation as resolved.
docker build -t $(IMAGE_REGISTRY)/carbide-rest-db:$(IMAGE_TAG) -f $(LOCAL_DOCKERFILE_DIR)/Dockerfile.carbide-rest-db .
docker build -t $(IMAGE_REGISTRY)/carbide-rest-cert-manager:$(IMAGE_TAG) -f $(LOCAL_DOCKERFILE_DIR)/Dockerfile.carbide-rest-cert-manager .

Expand All @@ -307,7 +323,9 @@ kind-load:
kind load docker-image $(IMAGE_REGISTRY)/carbide-rest-workflow:$(IMAGE_TAG) --name $(KIND_CLUSTER_NAME)
kind load docker-image $(IMAGE_REGISTRY)/carbide-rest-site-manager:$(IMAGE_TAG) --name $(KIND_CLUSTER_NAME)
kind load docker-image $(IMAGE_REGISTRY)/carbide-rest-site-agent:$(IMAGE_TAG) --name $(KIND_CLUSTER_NAME)
kind load docker-image $(IMAGE_REGISTRY)/carbide-rest-mock-core:$(IMAGE_TAG) --name $(KIND_CLUSTER_NAME)
@if [ "$(LOCAL_CORE)" != "true" ]; then \
kind load docker-image $(IMAGE_REGISTRY)/carbide-rest-mock-core:$(IMAGE_TAG) --name $(KIND_CLUSTER_NAME) ; \
fi
kind load docker-image $(IMAGE_REGISTRY)/carbide-rest-db:$(IMAGE_TAG) --name $(KIND_CLUSTER_NAME)
kind load docker-image $(IMAGE_REGISTRY)/carbide-rest-cert-manager:$(IMAGE_TAG) --name $(KIND_CLUSTER_NAME)

Expand Down Expand Up @@ -338,15 +356,19 @@ kind-redeploy: docker-build-local kind-load
kubectl -n carbide-rest rollout restart deployment/carbide-rest-api
kubectl -n carbide-rest rollout restart deployment/carbide-rest-cloud-worker
kubectl -n carbide-rest rollout restart deployment/carbide-rest-site-worker
kubectl -n carbide-rest rollout restart deployment/carbide-rest-mock-core
@if [ "$(LOCAL_CORE)" != "true" ]; then \
kubectl -n carbide-rest rollout restart deployment/carbide-rest-mock-core ; \
fi
kubectl -n carbide-rest rollout restart deployment/carbide-rest-cert-manager
kubectl -n carbide-rest rollout restart deployment/carbide-rest-site-manager
kubectl -n carbide-rest rollout restart statefulset/carbide-rest-site-agent
@echo "Waiting for rollouts..."
kubectl -n carbide-rest rollout status deployment/carbide-rest-api --timeout=120s
kubectl -n carbide-rest rollout status deployment/carbide-rest-cloud-worker --timeout=120s
kubectl -n carbide-rest rollout status deployment/carbide-rest-site-worker --timeout=120s
kubectl -n carbide-rest rollout status deployment/carbide-rest-mock-core --timeout=120s
@if [ "$(LOCAL_CORE)" != "true" ]; then \
kubectl -n carbide-rest rollout status deployment/carbide-rest-mock-core --timeout=120s ; \
fi
kubectl -n carbide-rest rollout status deployment/carbide-rest-cert-manager --timeout=120s
kubectl -n carbide-rest rollout status deployment/carbide-rest-site-manager --timeout=120s
kubectl -n carbide-rest rollout status statefulset/carbide-rest-site-agent --timeout=120s
Expand Down Expand Up @@ -443,9 +465,11 @@ kind-reset-infra: docker-build-local
kubectl apply -k deploy/kustomize/base/keycloak
kubectl -n carbide-rest rollout status deployment/keycloak --timeout=240s

@echo "Setting up Carbide Mock Core (dev only, not in Helm chart)..."
kubectl apply -k deploy/kustomize/overlays/mock-core
kubectl -n carbide-rest rollout status deployment/carbide-rest-mock-core --timeout=240s
@if [ "$(LOCAL_CORE)" != "true" ]; then \
echo "Setting up Carbide Mock Core (dev only, not in Helm chart)..." ; \
kubectl apply -k deploy/kustomize/overlays/mock-core ; \
kubectl -n carbide-rest rollout status deployment/carbide-rest-mock-core --timeout=240s ; \
fi

@echo ""
@echo "================================================================================"
Expand Down Expand Up @@ -489,20 +513,57 @@ kind-reset-kustomize: kind-reset-infra
kubectl apply -k deploy/kustomize/overlays/api
kubectl -n carbide-rest rollout status deployment/carbide-rest-api --timeout=240s

@if [ "$(LOCAL_CORE)" != "true" ]; then \
echo "Setting up Carbide Mock Core..." ; \
kubectl apply -k deploy/kustomize/overlays/mock-core ; \
kubectl -n carbide-rest rollout status deployment/carbide-rest-mock-core --timeout=240s ; \
fi

@echo "Setting up Carbide REST Site Agent..."
kubectl apply -k deploy/kustomize/overlays/site-agent
kubectl -n carbide-rest rollout status statefulset/carbide-rest-site-agent --timeout=240s

@echo "Setting up Site Agent secrets..."
./scripts/setup-local.sh site-agent

@if [ "$(LOCAL_CORE)" = "true" ]; then \
echo "Creating core-grpc-client-site-agent-certs secret from $(LOCAL_CORE_CERTS_DIR)..." ; \
for f in ca.crt client.crt client.key; do \
[ -f "$(LOCAL_CORE_CERTS_DIR)/$$f" ] || { echo "ERROR: $$f not found in $(LOCAL_CORE_CERTS_DIR). Run gen-certs.sh in bare-metal-manager-core first."; exit 1; } ; \
done ; \
kubectl -n carbide-rest create secret generic core-grpc-client-site-agent-certs \
--from-file=ca.crt="$(LOCAL_CORE_CERTS_DIR)/ca.crt" \
--from-file=tls.crt="$(LOCAL_CORE_CERTS_DIR)/client.crt" \
--from-file=tls.key="$(LOCAL_CORE_CERTS_DIR)/client.key" \
--dry-run=client -o yaml | kubectl apply -f - ; \
Comment on lines +529 to +538
Copy link

Copilot AI Mar 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When LOCAL_CORE=true, this creates/updates the core-grpc-client-site-agent-certs Secret, but deploy/kustomize/base/site-agent/certificate.yaml also defines a cert-manager Certificate that continuously reconciles the same secret name. That controller will likely overwrite the manually provided local-core certs, causing flapping/connection failures. Consider deleting/omitting the Certificate resource when LOCAL_CORE=true (e.g., kubectl delete certificate core-grpc-client-site-agent-certs) or using a distinct secret name for local-core certs and patching the site-agent deployment to mount that secret instead.

Copilot uses AI. Check for mistakes.
CORE_HOST="$(LOCAL_CORE_HOST)" ; \
IPV4=$$(docker exec $$(docker ps -qf name=carbide-rest-local-control-plane) \
getent ahosts "$$CORE_HOST" 2>/dev/null \
| awk '/STREAM/{print $$1}' | grep -v ':' | head -1) ; \
if [ -n "$$IPV4" ]; then \
echo "Resolved $$CORE_HOST → $$IPV4 (using IPv4 to avoid Go IPv6-first dialer issue)" ; \
CORE_HOST="$$IPV4" ; \
fi ; \
Comment on lines +540 to +546
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Derive the control-plane container from KIND_CLUSTER_NAME.

Both LOCAL_CORE flows hardcode carbide-rest-local-control-plane instead of using the existing $(KIND_CLUSTER_NAME) variable. make KIND_CLUSTER_NAME=... ... LOCAL_CORE=true will resolve the wrong node and silently skip the IPv4 fallback.

Suggested fix
-		IPV4=$$(docker exec $$(docker ps -qf name=carbide-rest-local-control-plane) \
+		IPV4=$$(docker exec $$(docker ps -qf name=$(KIND_CLUSTER_NAME)-control-plane) \
 			getent ahosts "$$CORE_HOST" 2>/dev/null \
 			| awk '/STREAM/{print $$1}' | grep -v ':' | head -1) ; \

Also applies to: 626-632

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@Makefile` around lines 523-529, the Makefile logic that computes IPV4 is
hardcoded to the container name "carbide-rest-local-control-plane" instead of
deriving it from the existing KIND_CLUSTER_NAME, so running with make
KIND_CLUSTER_NAME=... LOCAL_CORE=true can target the wrong node; update the
docker exec invocation and any other places that reference
"carbide-rest-local-control-plane" (also at the later block around the same
logic) to compute the container name from $(KIND_CLUSTER_NAME) (e.g., use the
same templated container name construction you use elsewhere) so CORE_HOST
resolution uses the correct container, and ensure the variable CORE_HOST
assignment and echo message still reflect the substituted value.

echo "Patching site-agent ConfigMap: CARBIDE_ADDRESS=$$CORE_HOST:$(LOCAL_CORE_PORT), CARBIDE_SEC_OPT=1, SKIP_GRPC_SERVER_AUTH=true..." ; \
kubectl -n carbide-rest patch configmap carbide-rest-site-agent-config --type merge \
-p "{\"data\":{\"CARBIDE_ADDRESS\":\"$$CORE_HOST:$(LOCAL_CORE_PORT)\",\"CARBIDE_SEC_OPT\":\"1\",\"SKIP_GRPC_SERVER_AUTH\":\"true\"}}" ; \
echo "Restarting site-agent to pick up new CARBIDE_ADDRESS and certs..." ; \
kubectl -n carbide-rest rollout restart statefulset/carbide-rest-site-agent ; \
kubectl -n carbide-rest rollout status statefulset/carbide-rest-site-agent --timeout=120s ; \
fi

@echo ""
@echo "================================================================================"
@echo "Deployment complete! (Kustomize)"
@echo ""
@echo "Temporal UI: http://localhost:8233"
@echo "API: http://localhost:8388"
@echo "Keycloak: http://localhost:8082"
@if [ "$(LOCAL_CORE)" = "true" ]; then \
echo "Carbide Core: local carbide-api at $(LOCAL_CORE_HOST):$(LOCAL_CORE_PORT) (LOCAL_CORE=true)" ; \
else \
echo "Carbide Core: mock-core (in-cluster)" ; \
fi
@echo "================================================================================"

# =============================================================================
Expand All @@ -521,6 +582,11 @@ kind-reset-helm: kind-reset-infra
@echo "Temporal UI: http://localhost:8233"
@echo "API: http://localhost:8388"
@echo "Keycloak: http://localhost:8082"
@if [ "$(LOCAL_CORE)" = "true" ]; then \
echo "Carbide Core: local carbide-api at $(LOCAL_CORE_HOST):$(LOCAL_CORE_PORT) (LOCAL_CORE=true)" ; \
else \
echo "Carbide Core: mock-core (in-cluster)" ; \
fi
@echo "================================================================================"

# Default: full reset using Helm deployment
Expand Down Expand Up @@ -558,9 +624,35 @@ helm-deploy:
helm-deploy-site-agent:
@echo "Installing site-agent chart (will CrashLoop until bootstrapped)..."
helm upgrade --install carbide-rest-site-agent $(SITE_AGENT_CHART)/ \
--namespace carbide-rest $(HELM_SET) --timeout 1m || true
--namespace carbide-rest $(HELM_SET) \
$(if $(filter true,$(LOCAL_CORE)),--set certificate.enabled=false) \
--timeout 1m || true
Comment on lines 624 to +629
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🌐 Web query:

For Helm CLI, does `helm upgrade --install` fail when a workload enters CrashLoopBackOff if `--wait` is not set, and does `--timeout` affect upgrade/install behavior without `--wait` or `--atomic`?

💡 Result:

No, helm upgrade --install does not fail when a workload enters CrashLoopBackOff if --wait is not set. Yes, --timeout affects upgrade/install behavior without --wait or --atomic for individual Kubernetes operations like hooks.

Citations:


🌐 Web query:

helm upgrade --install failure modes errors without --wait template rendering validation

💡 Result:

Helm upgrade --install fails without --wait primarily due to post-install hook failures, especially when hooks depend on webhooks or resources that aren't immediately ready. Template rendering and validation errors cause failures before hooks. Key failure modes: 1. Post-install hook failures: Without --wait, Helm doesn't wait for main resources (Deployments, etc.) to be ready before running post-install hooks. Hooks like Jobs or webhooks fail if dependencies (e.g., webhook services) aren't available, causing UPGRADE FAILED: failed post-install. Adding --wait makes Helm wait for resources ready, then run hooks successfully. 2. Template rendering errors: Invalid YAML, nil pointer errors (missing values), type mismatches, undefined functions. Use helm template --debug, helm lint --strict to validate before upgrade. 3. Kubernetes validation errors: Rendered manifests fail schema validation, immutable field changes, unknown fields. Validate with helm template | kubeval --strict. 4. Timeout errors without --wait: Default 5m timeout hit if resources slow to ready; command fails even if eventually succeeds. 5. Resource conflicts: Stuck failed releases from prior attempts block upgrades. Without --wait, upgrade proceeds immediately after posting resources but marks failed if hooks fail post-posting. Official docs recommend --wait for production to ensure readiness. Debug: helm upgrade --dry-run --debug; helm template --debug; check release status with helm status. Prevention: Validate charts (lint, template, kubeval), use --wait/--timeout=15m, --atomic for rollback.

Citations:


Remove || true to surface template and validation errors.

helm upgrade --install without --wait returns after submitting resources (so CrashLoop won't cause helm to fail), but it still fails on template rendering errors, Kubernetes validation failures, and post-install hook issues. The || true suppresses these legitimate errors and allows the bootstrap to continue with a broken release.

Suggested fix
 	helm upgrade --install carbide-rest-site-agent $(SITE_AGENT_CHART)/ \
 		--namespace carbide-rest $(HELM_SET) \
 		$(if $(filter true,$(LOCAL_CORE)),--set certificate.enabled=false) \
-		--timeout 1m || true
+		--timeout 1m
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@Makefile` around lines 607-612, in the Makefile target
helm-deploy-site-agent, remove the trailing "|| true" from the helm upgrade
--install carbide-rest-site-agent ... --timeout 1m command so that template
rendering, validation, and hook errors are not suppressed; optionally consider
adding --wait and increasing the --timeout value if you want Helm to wait for
resources to become ready, but do not swallow failures by appending "|| true".

@echo "Running site bootstrap (setup-local.sh site-agent)..."
./scripts/setup-local.sh site-agent
@if [ "$(LOCAL_CORE)" = "true" ]; then \
echo "Creating core-grpc-client-site-agent-certs secret from $(LOCAL_CORE_CERTS_DIR)..." ; \
for f in ca.crt client.crt client.key; do \
[ -f "$(LOCAL_CORE_CERTS_DIR)/$$f" ] || { echo "ERROR: $$f not found in $(LOCAL_CORE_CERTS_DIR). Run gen-certs.sh in bare-metal-manager-core first."; exit 1; } ; \
done ; \
kubectl -n carbide-rest create secret generic core-grpc-client-site-agent-certs \
--from-file=ca.crt="$(LOCAL_CORE_CERTS_DIR)/ca.crt" \
--from-file=tls.crt="$(LOCAL_CORE_CERTS_DIR)/client.crt" \
--from-file=tls.key="$(LOCAL_CORE_CERTS_DIR)/client.key" \
--dry-run=client -o yaml | kubectl apply -f - ; \
CORE_HOST="$(LOCAL_CORE_HOST)" ; \
IPV4=$$(docker exec $$(docker ps -qf name=carbide-rest-local-control-plane) \
getent ahosts "$$CORE_HOST" 2>/dev/null \
| awk '/STREAM/{print $$1}' | grep -v ':' | head -1) ; \
if [ -n "$$IPV4" ]; then \
echo "Resolved $$CORE_HOST → $$IPV4 (using IPv4 to avoid Go IPv6-first dialer issue)" ; \
CORE_HOST="$$IPV4" ; \
fi ; \
echo "Patching site-agent ConfigMap: CARBIDE_ADDRESS=$$CORE_HOST:$(LOCAL_CORE_PORT), CARBIDE_SEC_OPT=1, SKIP_GRPC_SERVER_AUTH=true..." ; \
kubectl -n carbide-rest patch configmap carbide-rest-site-agent-config --type merge \
-p "{\"data\":{\"CARBIDE_ADDRESS\":\"$$CORE_HOST:$(LOCAL_CORE_PORT)\",\"CARBIDE_SEC_OPT\":\"1\",\"SKIP_GRPC_SERVER_AUTH\":\"true\"}}" ; \
echo "Restarting site-agent to pick up new CARBIDE_ADDRESS and certs..." ; \
kubectl -n carbide-rest rollout restart statefulset/carbide-rest-site-agent ; \
fi
@echo "Waiting for site-agent to stabilize..."
kubectl -n carbide-rest rollout status statefulset/carbide-rest-site-agent --timeout=120s

Expand Down
Loading
Loading