[OPIK-5757] [FE][SDK] Link experiment configuration to agent config versions (#6302)

miguelgrc · web-flow · commit d2deabb6eda0 · 2026-04-16T17:17:39.000+02:00
* [OPIK-5757] [FE][SDK] Link experiment configuration to agent config versions

When evaluate() or run_tests() is called with blueprint_id, the agent
configuration reference (_blueprint_id + blueprint_version) is stored in
experiment metadata. The experiment Configuration tab renders the version
as a clickable link to the agent configuration page.

* [OPIK-5757] [FE][SDK] Address PR review feedback

- Fix catch block in TS SDK to capture error details
- Guard against empty project_id in ConfigurationTab link data
- Add unit test for blueprint fetch failure graceful degradation

* [OPIK-5757] [FE][SDK] Address reviewer feedback: tests, types, cleanup

- Add happy-path and failure unit tests for Python _merge_blueprint_into_config
- Add blueprint metadata tests for TS SDK evaluate()
- Narrow agent_config type from Dict[str, Any] to Dict[str, str]
- Extract AgentConfigLinkData type import in ConfigurationTab
diff --git a/apps/opik-frontend/src/v2/pages-shared/experiments/CompareExperimentsConfigCell/CompareExperimentsConfigCell.tsx b/apps/opik-frontend/src/v2/pages-shared/experiments/CompareExperimentsConfigCell/CompareExperimentsConfigCell.tsx
@@ -1,6 +1,7 @@
 import React from "react";
 import isUndefined from "lodash/isUndefined";
 import { CellContext } from "@tanstack/react-table";
+import { Link, useParams } from "@tanstack/react-router";
 import CellWrapper from "@/shared/DataTableCells/CellWrapper";
 import { ROW_HEIGHT } from "@/types/shared";
 import TextDiff from "@/shared/CodeDiff/TextDiff";
@@ -16,26 +17,57 @@ export type CompareConfig = {
   different: boolean;
 };
 
+export type AgentConfigLinkData = { // data a cell needs to link to an agent configuration
+  projectId: string; // fills the $projectId route param of the link
+  blueprintId: string; // sent as the `configId` search param of the link
+};
+
 type CustomMeta = { // shape this cell reads from columnDef.meta.custom
   onlyDiff: boolean;
+  agentConfigLinkData?: Record<string, AgentConfigLinkData>; // looked up by column (experiment) id
 };
 
 const CompareExperimentsConfigCell: React.FC<
   CellContext<CompareConfig, unknown>
 > = (context) => {
   const { custom } = context.column.columnDef.meta ?? {};
-  const { onlyDiff } = (custom ?? {}) as CustomMeta;
+  const { onlyDiff, agentConfigLinkData } = (custom ?? {}) as CustomMeta;
   const experimentId = context.column?.id;
   const compareConfig = context.row.original;
+  const { workspaceName } = useParams({ strict: false }) as {
+    workspaceName?: string;
+  };
 
   const data = compareConfig.data[experimentId];
   const baseData = compareConfig.data[compareConfig.base];
+  const linkData =
+    compareConfig.name === "agent_configuration"
+      ? agentConfigLinkData?.[experimentId]
+      : undefined;
 
   const renderContent = () => {
     if (isUndefined(data)) {
       return <span className="px-1.5 py-2.5 text-light-slate">No value</span>;
     }
 
+    if (linkData && workspaceName) {
+      return (
+        <div className="comet-code size-full max-w-full overflow-hidden whitespace-pre-wrap break-words rounded-md border bg-code-block px-2 py-[11px]">
+          <Link
+            to="/$workspaceName/projects/$projectId/agent-configuration"
+            params={{
+              workspaceName,
+              projectId: linkData.projectId,
+            }}
+            search={{ configId: linkData.blueprintId }}
+            className="text-foreground underline hover:no-underline"
+          >
+            {data}
+          </Link>
+        </div>
+      );
+    }
+
     return (
       <div className="comet-code size-full max-w-full overflow-hidden whitespace-pre-wrap break-words rounded-md border bg-code-block px-2 py-[11px]">
         {showDiffView ? (
diff --git a/apps/opik-frontend/src/v2/pages/CompareExperimentsPage/ConfigurationTab/ConfigurationTab.tsx b/apps/opik-frontend/src/v2/pages/CompareExperimentsPage/ConfigurationTab/ConfigurationTab.tsx
@@ -6,6 +6,7 @@ import isObject from "lodash/isObject";
 import uniq from "lodash/uniq";
 import toLower from "lodash/toLower";
 import find from "lodash/find";
+import omit from "lodash/omit";
 import { flattie } from "flattie";
 
 import { COLUMN_TYPE, ColumnData } from "@/types/shared";
@@ -16,6 +17,7 @@ import CompareExperimentsActionsPanel from "@/v2/pages/CompareExperimentsPage/Co
 import CompareExperimentsConfigCell, {
   CompareConfig,
   CompareFiledValue,
+  AgentConfigLinkData,
 } from "@/v2/pages-shared/experiments/CompareExperimentsConfigCell/CompareExperimentsConfigCell";
 import PageBodyStickyContainer from "@/shared/PageBodyStickyContainer/PageBodyStickyContainer";
 import PageBodyStickyTableWrapper from "@/v2/layout/PageBodyStickyTableWrapper/PageBodyStickyTableWrapper";
@@ -28,6 +30,8 @@ import { Switch } from "@/ui/switch";
 import { Label } from "@/ui/label";
 import { Separator } from "@/ui/separator";
 import { EXPLAINER_ID, EXPLAINERS_MAP } from "@/constants/explainers";
+import { AGENT_CONFIGURATION_METADATA_KEY } from "@/utils/agent-configurations";
+import { isAgentConfigurationMetadata } from "@/v2/pages-shared/traces/TraceDetailsPanel/TraceDataViewer/AgentConfigurationTab";
 
 const COLUMNS_WIDTH_KEY = "compare-experiments-config-columns-width";
 
@@ -71,6 +75,21 @@ const ConfigurationTab: React.FunctionComponent<ConfigurationTabProps> = ({
     defaultValue: {},
   });
 
+  const agentConfigLinkData = useMemo(() => { // link targets for experiments carrying agent configuration metadata
+    const result: Record<string, AgentConfigLinkData> = {}; // experiment id -> link data
+    for (const exp of experiments) {
+      const meta = exp.metadata as Record<string, unknown> | undefined;
+      const config = meta?.[AGENT_CONFIGURATION_METADATA_KEY];
+      if (isAgentConfigurationMetadata(config) && exp.project_id) { // no entry when project_id is missing or empty
+        result[exp.id] = {
+          projectId: exp.project_id,
+          blueprintId: config._blueprint_id,
+        };
+      }
+    }
+    return result;
+  }, [experiments]);
+
   const columns = useMemo(() => {
     const retVal = convertColumnDataToColumn<CompareConfig, CompareConfig>(
       DEFAULT_COLUMNS,
@@ -86,6 +105,7 @@ const ConfigurationTab: React.FunctionComponent<ConfigurationTabProps> = ({
           custom: {
             onlyDiff,
             experiment: find(experiments, (e) => e.id === id),
+            agentConfigLinkData,
           },
         },
         size: 400,
@@ -94,16 +114,30 @@ const ConfigurationTab: React.FunctionComponent<ConfigurationTabProps> = ({
     });
 
     return retVal;
-  }, [experimentsIds, onlyDiff, experiments]);
+  }, [experimentsIds, onlyDiff, experiments, agentConfigLinkData]);
 
   const flattenExperimentMetadataMap = useMemo(() => {
     return experiments.reduce<
       Record<string, Record<string, CompareFiledValue>>
     >((acc, experiment) => {
-      acc[experiment.id] = isObject(experiment.metadata)
-        ? flattie(experiment.metadata, ".", true)
+      const rawMetadata = experiment.metadata as
+        | Record<string, unknown>
+        | undefined;
+
+      const agentConfig = rawMetadata?.[AGENT_CONFIGURATION_METADATA_KEY]; // handled separately below, not flattened
+      const metadata = isObject(rawMetadata)
+        ? omit(rawMetadata, AGENT_CONFIGURATION_METADATA_KEY) // keep the agent config out of the flattened keys
         : {};
 
+      acc[experiment.id] = isObject(metadata) // NOTE(review): metadata is always an object here, so this guard looks redundant
+        ? flattie(metadata, ".", true)
+        : {};
+
+      if (isAgentConfigurationMetadata(agentConfig)) { // NOTE(review): a value under this key that fails the guard is omitted above and never shown — confirm intended
+        acc[experiment.id]["agent_configuration"] =
+          agentConfig.blueprint_version ?? agentConfig._blueprint_id; // prefer the version name; fall back to the blueprint id
+      }
+
       return acc;
     }, {});
   }, [experiments]);
diff --git a/sdks/python/src/opik/api_objects/dataset/test_suite/test_suite.py b/sdks/python/src/opik/api_objects/dataset/test_suite/test_suite.py
@@ -623,6 +623,7 @@ def __internal_api__run_optimization_suite__(
         client: Optional["opik_client_module.Opik"] = None,
         generate_report: bool = True,
         report_output_path: Optional[str] = None,
+        blueprint_id: Optional[str] = None,
     ) -> suite_types.TestSuiteResult:
         """
         Internal entry point used by the optimizer framework.
@@ -648,4 +649,5 @@ def __internal_api__run_optimization_suite__(
             experiment_type=experiment_type,
             generate_report=generate_report,
             report_output_path=report_output_path,
+            blueprint_id=blueprint_id,
         )
diff --git a/sdks/python/src/opik/evaluation/evaluator.py b/sdks/python/src/opik/evaluation/evaluator.py
@@ -50,6 +50,26 @@
 EVALUATION_STREAM_DATASET_BATCH_SIZE = 200
 
 
+def _merge_blueprint_into_config(
+    client: "opik_client.Opik",
+    blueprint_id: str,
+    experiment_config: Optional[Dict[str, Any]],
+) -> Dict[str, Any]:
+    """Add blueprint reference to experiment_config under ``agent_configuration``."""
+    experiment_config = dict(experiment_config) if experiment_config else {}  # shallow copy; the caller's dict is not mutated
+    agent_config: Dict[str, str] = {"_blueprint_id": blueprint_id}  # id is always recorded, even if the lookup below fails
+    try:
+        blueprint = client._rest_client.agent_configs.get_blueprint_by_id(
+            blueprint_id=blueprint_id,
+        )
+        if blueprint.name:  # a falsy name leaves "blueprint_version" unset
+            agent_config["blueprint_version"] = blueprint.name
+    except Exception:  # best-effort: any lookup failure is logged at debug level and not re-raised
+        LOGGER.debug("Failed to fetch blueprint %s", blueprint_id, exc_info=True)
+    experiment_config["agent_configuration"] = agent_config  # replaces any existing value under this key
+    return experiment_config
+
+
 def _calculate_total_items(
     dataset_: Union[dataset.Dataset, dataset.DatasetVersion],
     nb_samples: Optional[int],
@@ -170,6 +190,7 @@ def evaluate(
     experiment_scoring_functions: Optional[List[ExperimentScoreFunction]] = None,
     experiment_tags: Optional[List[str]] = None,
     dataset_filter_string: Optional[str] = None,
+    blueprint_id: Optional[str] = None,
 ) -> evaluation_result.EvaluationResult:
     """
     Performs task evaluation on a given dataset. You can use either `scoring_metrics` or `scorer_functions` to calculate
@@ -271,6 +292,13 @@ def evaluate(
 
     client = opik_client.get_global_client()
 
+    if blueprint_id:
+        experiment_config = _merge_blueprint_into_config(
+            client,
+            blueprint_id,
+            experiment_config,
+        )
+
     experiment_name = _use_or_create_experiment_name(
         experiment_name=experiment_name,
         experiment_name_prefix=experiment_name_prefix,
@@ -333,6 +361,7 @@ def __internal_api__run_test_suite__(
     experiment_type: Optional[str] = None,
     generate_report: bool = True,
     report_output_path: Optional[str] = None,
+    blueprint_id: Optional[str] = None,
 ) -> "suite_types.TestSuiteResult":
     """
     Internal function that runs the full test suite evaluation pipeline:
@@ -352,6 +381,13 @@ def __internal_api__run_test_suite__(
     if client is None:
         client = opik_client.get_global_client()
 
+    if blueprint_id:
+        experiment_config = _merge_blueprint_into_config(
+            client,
+            blueprint_id,
+            experiment_config,
+        )
+
     @functools.wraps(task)
     def _validated_task(data: Dict[str, Any]) -> Any:
         return validate_task_result(task(data), input_data=data)
@@ -445,6 +481,7 @@ def run_tests(
     model: Optional[str] = None,
     generate_report: bool = True,
     report_output_path: Optional[str] = None,
+    blueprint_id: Optional[str] = None,
 ) -> "suite_types.TestSuiteResult":
     """
     Run a test suite against a task function.
@@ -505,6 +542,7 @@ def run_tests(
         evaluator_model=model,
         generate_report=generate_report,
         report_output_path=report_output_path,
+        blueprint_id=blueprint_id,
     )
 
 
diff --git a/sdks/python/tests/unit/evaluation/test_evaluate.py b/sdks/python/tests/unit/evaluation/test_evaluate.py
@@ -3538,3 +3538,44 @@ def say_task(item: Dict[str, Any]):
         desc=mock.ANY,
         total=mock.ANY,
     )
+
+
+class TestMergeBlueprintIntoConfig:  # unit tests for evaluator_module._merge_blueprint_into_config
+    @staticmethod
+    def _make_blueprint(id, name):  # mock standing in for the object returned by get_blueprint_by_id
+        bp = mock.MagicMock()
+        bp.id = id
+        bp.name = name
+        return bp
+
+    def test_blueprint_fetched_and_version_stored(self):  # lookup succeeds: id and version name are both stored
+        mock_client = mock.Mock()
+        mock_client._rest_client.agent_configs.get_blueprint_by_id.return_value = (
+            self._make_blueprint("bp-123", "v9")
+        )
+
+        result = evaluator_module._merge_blueprint_into_config(
+            mock_client,
+            "bp-123",
+            {"model": "gpt-4o"},  # pre-existing config entry that must survive the merge
+        )
+
+        assert result["model"] == "gpt-4o"
+        assert result["agent_configuration"] == {
+            "_blueprint_id": "bp-123",
+            "blueprint_version": "v9",
+        }
+
+    def test_blueprint_fetch_fails_still_stores_id(self):  # lookup raises: only the id is stored, no exception escapes
+        mock_client = mock.Mock()
+        mock_client._rest_client.agent_configs.get_blueprint_by_id.side_effect = (
+            Exception("not found")  # simulates a failed blueprint lookup
+        )
+
+        result = evaluator_module._merge_blueprint_into_config(
+            mock_client,
+            "bp-456",
+            None,  # no experiment_config supplied by the caller
+        )
+
+        assert result["agent_configuration"] == {"_blueprint_id": "bp-456"}
diff --git a/sdks/typescript/src/opik/evaluation/evaluate.ts b/sdks/typescript/src/opik/evaluation/evaluate.ts
@@ -55,6 +55,9 @@ export interface EvaluateOptions<T = Record<string, unknown>> {
 
   /** Optional list of tags to associate with the experiment */
   tags?: string[];
+
+  /** Optional agent configuration blueprint ID to link with the experiment */
+  blueprintId?: string;
 }
 
 export async function evaluate<T = Record<string, unknown>>(
@@ -72,14 +75,27 @@ export async function evaluate<T = Record<string, unknown>>(
   // Get Opik client
   const client = options.client ?? OpikSingleton.getInstance();
 
+  // Resolve agent config blueprint if provided
+  let experimentConfig = options.experimentConfig;
+  if (options.blueprintId) {
+    const agentConfig: Record<string, string> = { _blueprint_id: options.blueprintId };
+    try {
+      const blueprint = await client.api.agentConfigs.getBlueprintById(options.blueprintId);
+      if (blueprint.name) agentConfig.blueprint_version = blueprint.name;
+    } catch (error) {
+      logger.debug(`Failed to fetch blueprint ${options.blueprintId}: ${error}`);
+    }
+    experimentConfig = { ...experimentConfig, agent_configuration: agentConfig };
+  }
+
   // Get version info for experiment linking
   const versionInfo = await options.dataset.getVersionInfo();
 
   // Create experiment for this evaluation run
   const experiment = await client.createExperiment({
     name: options.experimentName,
     datasetName: options.dataset.name,
-    experimentConfig: options.experimentConfig,
+    experimentConfig,
     prompts: options.prompts,
     datasetVersionId: versionInfo?.id,
     tags: options.tags,
diff --git a/sdks/typescript/tests/evaluation/evaluate.test.ts b/sdks/typescript/tests/evaluation/evaluate.test.ts