Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -26,22 +26,46 @@
LLMTask = Callable[[Dict[str, Any]], Any]


def validate_task_result(result: Any) -> Dict[str, Any]:
    """Validate that a task function returned a well-formed result dict.

    A valid result is a :class:`dict` containing both an ``"input"`` and an
    ``"output"`` key; such a dict is returned unchanged.

    Args:
        result: Value returned by the task callable.

    Returns:
        The same dict, once both required keys are confirmed present.

    Raises:
        TypeError: If *result* is not a dict at all.
        ValueError: If *result* is a dict but lacks ``"input"`` or ``"output"``.
    """
    if isinstance(result, dict):
        lacking = {"input", "output"} - result.keys()
        if not lacking:
            return result
        raise ValueError(
            f"The task function must return a dict with 'input' and "
            f"'output' keys, but the returned dict is missing: "
            f"{lacking}. Got keys: {set(result.keys())}. "
            f"Example: return {{'input': data, 'output': response}}"
        )
    raise TypeError(
        f"The task function must return a dict with 'input' and "
        f"'output' keys, but it returned {type(result).__name__}. "
        f"Example: return {{'input': data, 'output': response}}"
    )
def validate_task_result(
    result: Any,
    input_data: Any = None,
) -> Dict[str, Any]:
    """Coerce the value returned by a task function into a result dict.

    Dict results are validated — both ``"input"`` and ``"output"`` keys must
    be present — and then passed through unchanged.

    Any non-dict value is wrapped automatically as ``{"output": result}``;
    when *input_data* is provided (not ``None``) the wrapper also carries it
    under the ``"input"`` key::

        {"input": input_data, "output": result}

    Args:
        result: Value returned by the task callable.
        input_data: Optional input that was passed to the task. Included in
            the wrapper dict as ``"input"`` when *result* is not a dict.

    Returns:
        A dict suitable for use as an experiment trace result.

    Raises:
        ValueError: If *result* is a dict missing ``"input"`` or ``"output"``.
    """
    if not isinstance(result, dict):
        # Auto-wrap anything that is not already a result dict.
        auto_wrapped: Dict[str, Any] = {"output": result}
        if input_data is not None:
            auto_wrapped["input"] = input_data
        return auto_wrapped

    absent = {"input", "output"} - result.keys()
    if absent:
        raise ValueError(
            f"The task function must return a dict with 'input' and "
            f"'output' keys, but the returned dict is missing: "
            f"{absent}. Got keys: {set(result.keys())}. "
            f"Example: return {{'input': data, 'output': response}}"
        )
    return result


class EvaluationSuite:
Expand Down Expand Up @@ -408,11 +432,14 @@ def run(
Run the evaluation suite against a task function.

The task function receives each test item's data dict and must return
a dict with "input" and "output" keys.
either a dict (with ``"input"`` and ``"output"`` as the supported keys)
or any other value, which will be automatically wrapped as
``{"input": <item data>, "output": <returned value>}``.

Args:
task: A callable that takes a dict (the item's data) and returns
a dict with "input" and "output" keys.
a dict with ``"input"``/``"output"`` keys, or any other value
to be auto-wrapped.
experiment_name_prefix: Optional prefix for auto-generated experiment name.
experiment_name: Optional explicit name for the experiment.
project_name: Optional project name for tracking.
Expand All @@ -434,7 +461,12 @@ def run(
EvaluationSuiteResult with pass/fail status based on execution policy.

Example:
>>> def my_llm_task(data: dict) -> dict:
>>> # Simplified: return any value; it is auto-wrapped internally
>>> def my_llm_task(data: dict) -> str:
... return call_my_llm(data["user_input"], user_tier=data.get("user_tier"))
>>>
>>> # Explicit: return a dict with "input" and "output" keys directly
>>> def my_llm_task_explicit(data: dict) -> dict:
... response = call_my_llm(data["user_input"], user_tier=data.get("user_tier"))
... return {"input": data, "output": response}
>>>
Expand Down Expand Up @@ -489,7 +521,8 @@ def __internal_api__run_optimization_suite__(

Args:
task: A callable that takes a dict (the item's data) and returns
a dict with "input" and "output" keys.
a dict with ``"input"``/``"output"`` keys, or any other value
to be auto-wrapped.
experiment_name_prefix: Optional prefix for auto-generated experiment name.
experiment_name: Optional explicit name for the experiment.
project_name: Optional project name for tracking.
Expand Down Expand Up @@ -517,7 +550,7 @@ def __internal_api__run_optimization_suite__(

@functools.wraps(task)
def _validated_task(data: Dict[str, Any]) -> Any:
return validate_task_result(task(data))
return validate_task_result(task(data), input_data=data)

suite_result = opik_evaluator.evaluate_suite(
dataset=self._dataset,
Expand Down
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
anthropic>=0.88.0  # pinned to avoid: AttributeError: 'OpenAICompletion' object has no attribute 'client'
crewai[anthropic, bedrock, google-genai]>=1.0.0
Original file line number Diff line number Diff line change
Expand Up @@ -623,25 +623,58 @@ def test_build_suite_result__integer_score_zero__run_fails(self):


class TestValidateTaskResult:
    """Tests for validate_task_result: dict validation and non-dict auto-wrapping.

    NOTE(review): the scraped diff interleaved removed-version lines into this
    class, leaving syntactically invalid Python (two bare positional
    expressions inside single calls). This is the reconstructed post-change
    version of the test class.
    """

    # --- dict results ---

    def test_validate_task_result__valid_dict__returns_as_is(self):
        result = evaluation_suite.validate_task_result(
            {"input": "hello", "output": "world"}
        )
        assert result == {"input": "hello", "output": "world"}

    def test_validate_task_result__dict_missing_output__raises_value_error(self):
        with pytest.raises(ValueError, match="missing.*output"):
            evaluation_suite.validate_task_result({"input": "hello"})

    def test_validate_task_result__dict_missing_both_keys__raises_value_error(self):
        with pytest.raises(ValueError, match="missing"):
            evaluation_suite.validate_task_result({"response": "hello"})

    def test_validate_task_result__dict_missing_input__raises_value_error(self):
        with pytest.raises(ValueError, match="missing.*input"):
            evaluation_suite.validate_task_result({"output": "world"})

    # --- non-dict results: auto-wrapping ---

    def test_validate_task_result__string_result__wrapped_as_output(self):
        result = evaluation_suite.validate_task_result("just a string")
        assert result == {"output": "just a string"}

    def test_validate_task_result__string_result_with_input_data__wrapped_with_input(
        self,
    ):
        result = evaluation_suite.validate_task_result(
            "just a string", input_data={"question": "hi"}
        )
        assert result == {"input": {"question": "hi"}, "output": "just a string"}

    def test_validate_task_result__int_result__wrapped_as_output(self):
        result = evaluation_suite.validate_task_result(42)
        assert result == {"output": 42}

    def test_validate_task_result__none_result__wrapped_as_output(self):
        result = evaluation_suite.validate_task_result(None)
        assert result == {"output": None}

    def test_validate_task_result__none_input_data__not_included_in_wrapper(self):
        result = evaluation_suite.validate_task_result("response", input_data=None)
        assert result == {"output": "response"}
        assert "input" not in result

    def test_validate_task_result__list_result_with_input_data__wrapped_correctly(self):
        result = evaluation_suite.validate_task_result(
            ["a", "b"], input_data={"prompt": "list me things"}
        )
        assert result == {"input": {"prompt": "list me things"}, "output": ["a", "b"]}

class TestInternalRunOptimizationSuite:
Expand Down
Loading