Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 64 additions & 39 deletions backend/app/api/routes/ai.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
from typing import Any, Dict, List, Optional

from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from pydantic import BaseModel, Field
from kubernetes.client.exceptions import ApiException

from app.core.config import settings
from app.core.cache import cache_service
Expand All @@ -14,7 +15,7 @@
from app.services.security_service import get_security_service
from app.services.jenkins_service import jenkins_service
from app.services.helm_service import helm_service
# from app.services.cost_service import cost_service # Excluded from v1.4.0
from app.services.cost_service import cost_service
from app.services.timeline_service import timeline_service
from app.services.optimization_service import optimization_service

Expand All @@ -24,8 +25,17 @@


class ChatRequest(BaseModel):
message: str
context: Optional[str] = None
message: str = Field(
...,
min_length=1,
max_length=10000,
description="User message to the AI assistant"
)
context: Optional[str] = Field(
None,
max_length=50000,
description="Additional context for the AI (e.g., logs, resources)"
)


class ChatResponse(BaseModel):
Expand All @@ -46,7 +56,7 @@ def get_groq_client():
raise ValueError("GROQ_API_KEY not configured")
try:
from groq import Groq
except Exception as e:
except ImportError as e:
raise ValueError("groq package is not available. Install it with: pip install groq") from e

_groq_client = Groq(api_key=settings.GROQ_API_KEY)
Expand All @@ -61,7 +71,7 @@ def get_gemini_model():
raise ValueError("GEMINI_API_KEY not configured")
try:
import google.generativeai as genai # type: ignore
except Exception as e:
except ImportError as e:
raise ValueError("google.generativeai package is not available") from e

genai.configure(api_key=settings.GEMINI_API_KEY)
Expand Down Expand Up @@ -127,14 +137,20 @@ def generate_ai_response(prompt: str, fallback_providers: list = None) -> str:
logger.warning(f"Unknown AI provider: {provider}")
continue

except Exception as e:
except (ValueError, OSError) as e:
# OSError covers the built-in ConnectionError and TimeoutError subclasses; errors that do not derive from OSError are handled by the generic handler below
logger.warning(f"AI provider {provider} failed: {e}")
last_error = e
continue
except Exception as e:
# Catch any other API-specific exceptions (rate limits, auth errors, etc.)
logger.warning(f"AI provider {provider} failed with unexpected error: {e}")
last_error = e
continue

# If we get here, all providers failed
if last_error:
raise Exception(f"All AI providers failed. Last error: {last_error}")
raise RuntimeError(f"All AI providers failed. Last error: {last_error}")
else:
raise ValueError("No AI providers configured. Please set GROQ_API_KEY or GEMINI_API_KEY")

Expand All @@ -161,7 +177,7 @@ def generate_ai_response(prompt: str, fallback_providers: list = None) -> str:
# Helm
"helm": ["helm", "chart", "charts", "release", "releases", "helm install", "helm upgrade"],
# Cost (re-enabled; previously excluded from v1.4.0)
# "cost": ["cost", "costs", "spending", "expensive", "billing", "price", "budget", "savings"],
"cost": ["cost", "costs", "spending", "expensive", "billing", "price", "budget", "savings"],
# Timeline / Events
"timeline": ["timeline", "history", "activity", "recent events", "what happened"],
# Incidents
Expand Down Expand Up @@ -246,7 +262,7 @@ async def fetch_context(query_types: list, specific_pod_name: Optional[str] = No
pod_details += f"- [{event.type}] **{event.reason}**: {event.message}\n"
else:
pod_details += "\n### Recent Events:\nNo events found for this pod.\n"
except Exception as e:
except (ApiException, ValueError, KeyError) as e:
logger.warning(f"Could not fetch events for pod {pod.name}: {e}")
pod_details += f"\n### Recent Events:\nCould not fetch events: {e}\n"

Expand All @@ -264,7 +280,7 @@ async def fetch_context(query_types: list, specific_pod_name: Optional[str] = No
No pods found matching "{specific_pod_name}". Please check the pod name and try again.
You can list all pods with: `kubectl get pods -A`
""")
except Exception as e:
except (ApiException, ValueError, AttributeError) as e:
logger.error(f"Error fetching specific pod details: {e}")
context_parts.append(f"## Error\nCould not fetch details for pod '{specific_pod_name}': {e}\n")

Expand All @@ -285,7 +301,7 @@ async def fetch_context(query_types: list, specific_pod_name: Optional[str] = No
- **Warnings**: {', '.join(health.warnings) if health.warnings else 'None'}
"""
)
except Exception as e:
except (ApiException, AttributeError) as e:
logger.warning(f"Could not fetch cluster health: {e}")

if "pods" in query_types and not specific_pod_name:
Expand Down Expand Up @@ -323,7 +339,7 @@ async def fetch_context(query_types: list, specific_pod_name: Optional[str] = No
]
)
)
except Exception as e:
except (ApiException, AttributeError, ValueError) as e:
logger.warning(f"Could not fetch pods: {e}")

# Fetch failing pods specifically when asked
Expand Down Expand Up @@ -423,7 +439,7 @@ async def fetch_context(query_types: list, specific_pod_name: Optional[str] = No
✅ Great news! No failing pods detected. All pods are either Running or Succeeded.
""")

except Exception as e:
except (ApiException, AttributeError, ValueError) as e:
logger.error(f"Could not fetch failing pods: {e}")
context_parts.append(f"## Error\nCould not fetch failing pods: {e}\n")

Expand Down Expand Up @@ -453,7 +469,7 @@ async def fetch_context(query_types: list, specific_pod_name: Optional[str] = No
else "All deployments are healthy!"
)
)
except Exception as e:
except (ApiException, AttributeError) as e:
logger.warning(f"Could not fetch deployments: {e}")

if "services" in query_types:
Expand All @@ -471,7 +487,7 @@ async def fetch_context(query_types: list, specific_pod_name: Optional[str] = No
- **By Type**: {', '.join([f'{t}: {c}' for t, c in svc_types.items()])}
"""
)
except Exception as e:
except (ApiException, AttributeError) as e:
logger.warning(f"Could not fetch services: {e}")

if "nodes" in query_types:
Expand All @@ -492,7 +508,7 @@ async def fetch_context(query_types: list, specific_pod_name: Optional[str] = No
[f"- **{n.name}**: {n.status}, Roles: {', '.join(n.roles)}, K8s: {n.version}" for n in nodes]
)
)
except Exception as e:
except (ApiException, AttributeError) as e:
logger.warning(f"Could not fetch nodes: {e}")

if "namespaces" in query_types:
Expand All @@ -505,7 +521,7 @@ async def fetch_context(query_types: list, specific_pod_name: Optional[str] = No
- **Namespaces**: {', '.join([ns.name for ns in namespaces])}
"""
)
except Exception as e:
except (ApiException, AttributeError) as e:
logger.warning(f"Could not fetch namespaces: {e}")

if "resources" in query_types:
Expand All @@ -519,10 +535,10 @@ async def fetch_context(query_types: list, specific_pod_name: Optional[str] = No
- **Memory**: {metrics.total_memory_usage} / {metrics.total_memory_capacity} ({metrics.memory_percent}%)
"""
)
except Exception as e:
except (ApiException, AttributeError) as e:
logger.warning(f"Could not fetch metrics: {e}")

except Exception as e:
except (ApiException, AttributeError, ValueError) as e:
logger.error(f"Error fetching Kubernetes context: {e}")

# ===== SECURITY CONTEXT =====
Expand All @@ -549,7 +565,7 @@ async def fetch_context(query_types: list, specific_pod_name: Optional[str] = No
"""
+ "\n".join([f"- [{f.severity.upper()}] {f.title}" for f in dashboard.top_findings[:5]])
)
except Exception as e:
except (ApiException, AttributeError, ValueError) as e:
logger.warning(f"Could not fetch security dashboard: {e}")

if "rbac" in query_types:
Expand All @@ -567,7 +583,7 @@ async def fetch_context(query_types: list, specific_pod_name: Optional[str] = No
"""
+ "\n".join([f"- {r}" for r in rbac.recommendations[:3]])
)
except Exception as e:
except (ApiException, AttributeError) as e:
logger.warning(f"Could not fetch RBAC analysis: {e}")

if "network_policy" in query_types:
Expand All @@ -582,10 +598,10 @@ async def fetch_context(query_types: list, specific_pod_name: Optional[str] = No
- **Pods Covered**: {np.covered_pods}/{np.total_pods}
"""
)
except Exception as e:
except (ApiException, AttributeError) as e:
logger.warning(f"Could not fetch network policies: {e}")

except Exception as e:
except (ApiException, AttributeError, ValueError) as e:
logger.error(f"Error fetching security context: {e}")

# ===== JENKINS CONTEXT =====
Expand Down Expand Up @@ -614,7 +630,7 @@ async def fetch_context(query_types: list, specific_pod_name: Optional[str] = No
else "No failed jobs!"
)
)
except Exception as e:
except (OSError, ValueError, AttributeError) as e:
logger.warning(f"Could not fetch Jenkins data: {e}")

# ===== HELM CONTEXT =====
Expand All @@ -637,17 +653,18 @@ async def fetch_context(query_types: list, specific_pod_name: Optional[str] = No
[f"- **{r.name}** ({r.namespace}): {r.chart} v{r.app_version} - {r.status}" for r in releases[:10]]
)
)
except Exception as e:
except (OSError, ValueError, AttributeError) as e:
logger.warning(f"Could not fetch Helm data: {e}")

# ===== COST CONTEXT ===== (re-enabled; previously excluded from v1.4.0)
# if "cost" in query_types:
# try:
# dashboard = await cost_service.get_cost_dashboard()
# recommendations = await cost_service.get_recommendations()
# context_parts.append(...)
# except Exception as e:
# logger.warning(f"Could not fetch cost data: {e}")
if "cost" in query_types:
try:
dashboard = await cost_service.get_cost_dashboard()
recommendations = await cost_service.get_recommendations()

context_parts.append(f"\n## Cost Analysis\nDashboard Summary: {dashboard}\nOptimizations: {recommendations}\n")
except (OSError, ValueError, AttributeError) as e:
logger.warning(f"Could not fetch cost data: {e}")

# ===== TIMELINE CONTEXT =====
if "timeline" in query_types:
Expand All @@ -667,7 +684,7 @@ async def fetch_context(query_types: list, specific_pod_name: Optional[str] = No
"""
+ "\n".join([f"- [{e.event_type}] {e.title}" for e in events[:5]])
)
except Exception as e:
except (OSError, ValueError, AttributeError) as e:
logger.warning(f"Could not fetch timeline data: {e}")

return "\n".join(context_parts) if context_parts else ""
Expand Down Expand Up @@ -736,9 +753,13 @@ async def chat(request: ChatRequest):
except ValueError as e:
logger.error(f"Configuration error: {e}")
raise HTTPException(status_code=503, detail="AI service not configured. Please set GROQ_API_KEY or GEMINI_API_KEY.")
except Exception as e:
except (RuntimeError, OSError, AttributeError) as e:
logger.error(f"AI chat error: {e}")
raise HTTPException(status_code=500, detail=f"AI service error: {str(e)}")
except Exception as e:
# Catch-all for unexpected errors in the endpoint
logger.exception(f"Unexpected AI chat error: {e}")
raise HTTPException(status_code=500, detail="An unexpected error occurred")


@router.get("/health")
Expand All @@ -754,25 +775,29 @@ async def ai_health():
try:
health = await kubernetes_service.get_cluster_health()
services_status["kubernetes"] = "connected" if health else "disconnected"
except Exception:
except (ApiException, OSError, AttributeError):
services_status["kubernetes"] = "disconnected"

try:
jenkins_health = await jenkins_service.get_health()
services_status["jenkins"] = "connected" if jenkins_health.connected else "disconnected"
except Exception:
except (OSError, AttributeError):
services_status["jenkins"] = "disconnected"

try:
await helm_service.list_releases()
services_status["helm"] = "connected"
except Exception:
except (OSError, AttributeError):
services_status["helm"] = "disconnected"

return {"status": "available", "model": settings.GEMINI_MODEL, "services": services_status}
except Exception as e:
except (ValueError, AttributeError, OSError) as e:
logger.error(f"AI health check error: {type(e).__name__}")
return {"status": "error", "reason": "Internal service error"}
except Exception as e:
# Catch-all for health check endpoint
logger.exception(f"Unexpected health check error: {e}")
return {"status": "error", "reason": "Unexpected error"}


class OptimizationAnalysisRequest(BaseModel):
Expand Down
5 changes: 5 additions & 0 deletions backend/app/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@ class Settings(BaseSettings):
K8S_IN_CLUSTER: bool = False
K8S_HOST_OVERRIDE: Optional[str] = None # Use 'host.docker.internal' for Docker Desktop

# Optimization Metrics Source
OPTIMIZATION_METRICS_SOURCE: str = "metrics-server" # Options: "metrics-server", "prometheus"
PROMETHEUS_NAMESPACE: str = "monitoring" # Namespace where Prometheus is deployed
PROMETHEUS_RELEASE_NAME: str = "prometheus-stack" # Helm release name

# Jenkins
JENKINS_URL: str = "http://localhost:8080"
JENKINS_USERNAME: str = ""
Expand Down
11 changes: 7 additions & 4 deletions backend/app/core/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,16 +268,19 @@ class SecurityHeadersMiddleware(BaseHTTPMiddleware):
"""Add security headers to all responses."""

# Content Security Policy
# Note: Removed 'unsafe-inline' and 'unsafe-eval' for security
# If frontend needs inline scripts, use nonce-based CSP instead
CSP_POLICY = (
"default-src 'self'; "
"script-src 'self' 'unsafe-inline' 'unsafe-eval'; "
"style-src 'self' 'unsafe-inline'; "
"script-src 'self'; "
"style-src 'self'; "
"img-src 'self' data: https:; "
"font-src 'self' data:; "
"connect-src 'self' https:; "
"connect-src 'self' https: wss:; "
"frame-ancestors 'none'; "
"base-uri 'self'; "
"form-action 'self'"
"form-action 'self'; "
"object-src 'none'"
)

async def dispatch(self, request: Request, call_next: Callable) -> Response:
Expand Down
Loading
Loading