Skip to content

Commit 743e29f

Browse files
feat: use HostAliases for api-int DNS resolution in Holmes pod
Replace the dev-mode MakeExternalKubeconfig() workaround with pod-level HostAliases that map api-int.* to the cluster's APIServerPrivateEndpointIP. This bypasses DNS entirely while preserving TLS certificate validation, since the cert is issued for the api-int.* hostname. This mirrors the portal's DialContext pattern and Hive's APIServerIPOverride, and works in all environments (dev, staging, production) without depending on Private DNS zone linking.
1 parent 87777f4 commit 743e29f

7 files changed

Lines changed: 42 additions & 157 deletions

File tree

pkg/frontend/admin_openshiftcluster_investigate.go

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -144,13 +144,15 @@ func (f *frontend) _postAdminOpenShiftClusterInvestigate(ctx context.Context, r
144144
return fmt.Errorf("failed to generate diagnostics kubeconfig: %w", err)
145145
}
146146

147+
apiServerIP := doc.OpenShiftCluster.Properties.NetworkProfile.APIServerPrivateEndpointIP
148+
147149
log.Infof("starting Holmes investigation for cluster %s (question_length=%d)", resourceID, len(req.Question))
148150

149151
// Set Content-Type before streaming begins. Once bytes are written to w,
150152
// the response is committed and errors cannot be reported via adminReply.
151153
w.Header().Set("Content-Type", "text/plain")
152154

153-
err = f.hiveClusterManager.InvestigateCluster(ctx, hiveNamespace, kubeconfig, f.holmesConfig, req.Question, w)
155+
err = f.hiveClusterManager.InvestigateCluster(ctx, hiveNamespace, kubeconfig, f.holmesConfig, apiServerIP, req.Question, w)
154156
if err != nil {
155157
return fmt.Errorf("failed to investigate cluster: %w", err)
156158
}

pkg/frontend/admin_openshiftcluster_investigate_kubeconfig.go

Lines changed: 0 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,6 @@ import (
1515
"github.com/Azure/ARO-RP/pkg/cluster/graph"
1616
"github.com/Azure/ARO-RP/pkg/env"
1717
"github.com/Azure/ARO-RP/pkg/util/encryption"
18-
"github.com/Azure/ARO-RP/pkg/util/holmes"
1918
"github.com/Azure/ARO-RP/pkg/util/storage"
2019
"github.com/Azure/ARO-RP/pkg/util/stringutils"
2120
)
@@ -66,15 +65,5 @@ func (f *frontend) generateDiagnosticsKubeconfig(ctx context.Context, log *logru
6665
return nil, fmt.Errorf("failed to generate diagnostics kubeconfig: %w", err)
6766
}
6867

69-
// In development mode, the Hive cluster cannot resolve api-int.* private DNS
70-
// names, so we rewrite to the external api.* endpoint. In production, the
71-
// Hive cluster has proper network connectivity and should use api-int.* directly.
72-
if f.env.IsLocalDevelopmentMode() {
73-
kubeconfig, err = holmes.MakeExternalKubeconfig(kubeconfig)
74-
if err != nil {
75-
return nil, fmt.Errorf("failed to convert to external kubeconfig: %w", err)
76-
}
77-
}
78-
7968
return kubeconfig, nil
8069
}

pkg/hive/investigate.go

Lines changed: 34 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@ import (
77
"context"
88
"fmt"
99
"io"
10+
"net/url"
1011
"time"
1112

1213
_ "embed"
@@ -17,6 +18,9 @@ import (
1718
"k8s.io/apimachinery/pkg/api/resource"
1819
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1920
"k8s.io/apimachinery/pkg/util/wait"
21+
clientcmdv1 "k8s.io/client-go/tools/clientcmd/api/v1"
22+
23+
"sigs.k8s.io/yaml"
2024

2125
"github.com/Azure/ARO-RP/pkg/util/holmes"
2226
"github.com/Azure/ARO-RP/pkg/util/pointerutils"
@@ -28,7 +32,7 @@ var holmesConfigYAML string
2832
// InvestigateCluster creates an investigation pod on the Hive cluster, streams its logs, and cleans up.
2933
// It accepts kubeconfig bytes, creates a temporary secret to hold them, and removes
3034
// the secret (along with the pod and configmap) when the investigation completes.
31-
func (hr *clusterManager) InvestigateCluster(ctx context.Context, hiveNamespace string, kubeconfig []byte, holmesConfig *holmes.HolmesConfig, question string, w io.Writer) error {
35+
func (hr *clusterManager) InvestigateCluster(ctx context.Context, hiveNamespace string, kubeconfig []byte, holmesConfig *holmes.HolmesConfig, apiServerIP string, question string, w io.Writer) error {
3236
id := uuid.New().String()[:8]
3337
configMapName := "holmes-config-" + id
3438
podName := "holmes-investigate-" + id
@@ -98,6 +102,12 @@ func (hr *clusterManager) InvestigateCluster(ctx context.Context, hiveNamespace
98102
// 2. Create the investigation pod.
99103
activeDeadlineSeconds := int64(holmesConfig.DefaultTimeout)
100104
runAsUser := int64(1000)
105+
106+
apiHostname, err := apiServerHostname(kubeconfig)
107+
if err != nil {
108+
return fmt.Errorf("failed to extract API server hostname from kubeconfig: %w", err)
109+
}
110+
101111
pod := &corev1.Pod{
102112
ObjectMeta: metav1.ObjectMeta{
103113
Name: podName,
@@ -107,6 +117,12 @@ func (hr *clusterManager) InvestigateCluster(ctx context.Context, hiveNamespace
107117
AutomountServiceAccountToken: pointerutils.ToPtr(false),
108118
ActiveDeadlineSeconds: &activeDeadlineSeconds,
109119
RestartPolicy: corev1.RestartPolicyNever,
120+
HostAliases: []corev1.HostAlias{
121+
{
122+
IP: apiServerIP,
123+
Hostnames: []string{apiHostname},
124+
},
125+
},
110126
SecurityContext: &corev1.PodSecurityContext{
111127
RunAsUser: &runAsUser,
112128
RunAsGroup: &runAsUser,
@@ -325,3 +341,20 @@ func (hr *clusterManager) streamPodLogs(ctx context.Context, namespace, name str
325341

326342
return nil
327343
}
344+
345+
func apiServerHostname(kubeconfig []byte) (string, error) {
346+
var cfg clientcmdv1.Config
347+
if err := yaml.Unmarshal(kubeconfig, &cfg); err != nil {
348+
return "", fmt.Errorf("failed to unmarshal kubeconfig: %w", err)
349+
}
350+
if len(cfg.Clusters) == 0 {
351+
return "", fmt.Errorf("kubeconfig has no clusters")
352+
}
353+
354+
u, err := url.Parse(cfg.Clusters[0].Cluster.Server)
355+
if err != nil {
356+
return "", fmt.Errorf("failed to parse server URL: %w", err)
357+
}
358+
359+
return u.Hostname(), nil
360+
}

pkg/hive/manager.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -55,7 +55,7 @@ type ClusterManager interface {
5555
GetClusterSync(ctx context.Context, oc *api.OpenShiftCluster) (*hivev1alpha1.ClusterSync, error)
5656
ListHiveK8sObjects(ctx context.Context, resource, namespace string) ([]byte, error)
5757
GetHiveK8sObject(ctx context.Context, resource, namespace, name string) ([]byte, error)
58-
InvestigateCluster(ctx context.Context, hiveNamespace string, kubeconfig []byte, holmesConfig *holmes.HolmesConfig, question string, w io.Writer) error
58+
InvestigateCluster(ctx context.Context, hiveNamespace string, kubeconfig []byte, holmesConfig *holmes.HolmesConfig, apiServerIP string, question string, w io.Writer) error
5959
}
6060

6161
type clusterManager struct {

pkg/util/holmes/kubeconfig.go

Lines changed: 0 additions & 41 deletions
This file was deleted.

pkg/util/holmes/kubeconfig_test.go

Lines changed: 0 additions & 98 deletions
This file was deleted.

pkg/util/mocks/hive/hive.go

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)