diff --git a/sdks/python/src/opik/api_objects/dataset/evaluation_suite/evaluation_suite.py b/sdks/python/src/opik/api_objects/dataset/evaluation_suite/evaluation_suite.py index 7b9d879f3b8..df0df76ba2a 100644 --- a/sdks/python/src/opik/api_objects/dataset/evaluation_suite/evaluation_suite.py +++ b/sdks/python/src/opik/api_objects/dataset/evaluation_suite/evaluation_suite.py @@ -26,22 +26,46 @@ LLMTask = Callable[[Dict[str, Any]], Any] -def validate_task_result(result: Any) -> Dict[str, Any]: - if not isinstance(result, dict): - raise TypeError( - f"The task function must return a dict with 'input' and " - f"'output' keys, but it returned {type(result).__name__}. " - f"Example: return {{'input': data, 'output': response}}" - ) - missing = {"input", "output"} - result.keys() - if missing: - raise ValueError( - f"The task function must return a dict with 'input' and " - f"'output' keys, but the returned dict is missing: " - f"{missing}. Got keys: {set(result.keys())}. " - f"Example: return {{'input': data, 'output': response}}" - ) - return result +def validate_task_result( + result: Any, + input_data: Any = None, +) -> Dict[str, Any]: + """Normalise the value returned by a task function into a result dict. + + If *result* is already a :class:`dict`, it is returned as-is (the + supported keys are ``"input"`` and ``"output"``). + + For any other type the value is wrapped automatically:: + + {"output": result} + + When *input_data* is also provided the wrapper becomes:: + + {"input": input_data, "output": result} + + Args: + result: Value returned by the task callable. + input_data: Optional input that was passed to the task. Included in + the wrapper dict as ``"input"`` when *result* is not a dict. + + Returns: + A dict suitable for use as an experiment trace result. 
+ """ + if isinstance(result, dict): + missing = {"input", "output"} - result.keys() + if missing: + raise ValueError( + f"The task function must return a dict with 'input' and " + f"'output' keys, but the returned dict is missing: " + f"{missing}. Got keys: {set(result.keys())}. " + f"Example: return {{'input': data, 'output': response}}" + ) + return result + + wrapped: Dict[str, Any] = {"output": result} + if input_data is not None: + wrapped["input"] = input_data + return wrapped class EvaluationSuite: @@ -408,11 +432,14 @@ def run( Run the evaluation suite against a task function. The task function receives each test item's data dict and must return - a dict with "input" and "output" keys. + either a dict (with ``"input"`` and ``"output"`` as the supported keys) + or any other value, which will be automatically wrapped as + ``{"input": <data>, "output": <result>}``. Args: task: A callable that takes a dict (the item's data) and returns - a dict with "input" and "output" keys. + a dict with ``"input"``/``"output"`` keys, or any other value + to be auto-wrapped. experiment_name_prefix: Optional prefix for auto-generated experiment name. experiment_name: Optional explicit name for the experiment. project_name: Optional project name for tracking. @@ -434,7 +461,12 @@ def run( EvaluationSuiteResult with pass/fail status based on execution policy. Example: - >>> def my_llm_task(data: dict) -> dict: + >>> # Simplified: return any value; it is auto-wrapped internally + >>> def my_llm_task(data: dict) -> str: + ... return call_my_llm(data["user_input"], user_tier=data.get("user_tier")) + >>> + >>> # Explicit: return a dict with "input" and "output" keys directly + >>> def my_llm_task_explicit(data: dict) -> dict: + ... response = call_my_llm(data["user_input"], user_tier=data.get("user_tier")) + ... 
return {"input": data, "output": response} >>> @@ -489,7 +521,8 @@ def __internal_api__run_optimization_suite__( Args: task: A callable that takes a dict (the item's data) and returns - a dict with "input" and "output" keys. + a dict with ``"input"``/``"output"`` keys, or any other value + to be auto-wrapped. experiment_name_prefix: Optional prefix for auto-generated experiment name. experiment_name: Optional explicit name for the experiment. project_name: Optional project name for tracking. @@ -517,7 +550,7 @@ def __internal_api__run_optimization_suite__( @functools.wraps(task) def _validated_task(data: Dict[str, Any]) -> Any: - return validate_task_result(task(data)) + return validate_task_result(task(data), input_data=data) suite_result = opik_evaluator.evaluate_suite( dataset=self._dataset, diff --git a/sdks/python/tests/library_integration/crewai/requirements_v1.txt b/sdks/python/tests/library_integration/crewai/requirements_v1.txt index c283e2c58ad..7b5e900d35f 100644 --- a/sdks/python/tests/library_integration/crewai/requirements_v1.txt +++ b/sdks/python/tests/library_integration/crewai/requirements_v1.txt @@ -1 +1,2 @@ +anthropic>=0.88.0 # AttributeError: 'OpenAICompletion' object has no attribute 'client' crewai[anthropic, bedrock, google-genai]>=1.0.0 diff --git a/sdks/python/tests/unit/api_objects/dataset/evaluation_suite/test_evaluation_suite.py b/sdks/python/tests/unit/api_objects/dataset/evaluation_suite/test_evaluation_suite.py index c5f58988b60..a7f903b21d6 100644 --- a/sdks/python/tests/unit/api_objects/dataset/evaluation_suite/test_evaluation_suite.py +++ b/sdks/python/tests/unit/api_objects/dataset/evaluation_suite/test_evaluation_suite.py @@ -623,25 +623,58 @@ def test_build_suite_result__integer_score_zero__run_fails(self): class TestValidateTaskResult: - def test_non_dict__raises_type_error(self): - with pytest.raises( - TypeError, match="must return a dict with 'input' and 'output'" - ): - evaluation_suite.validate_task_result("just a 
string") + # --- dict results --- + + def test_validate_task_result__valid_dict__returns_as_is(self): + result = evaluation_suite.validate_task_result( + {"input": "hello", "output": "world"} + ) + assert result == {"input": "hello", "output": "world"} - def test_missing_keys__raises_value_error(self): + def test_validate_task_result__dict_missing_output__raises_value_error(self): with pytest.raises(ValueError, match="missing.*output"): evaluation_suite.validate_task_result({"input": "hello"}) - def test_missing_both_keys__raises_value_error(self): + def test_validate_task_result__dict_missing_both_keys__raises_value_error(self): with pytest.raises(ValueError, match="missing"): evaluation_suite.validate_task_result({"response": "hello"}) - def test_valid_result__returns_dict(self): + def test_validate_task_result__dict_missing_input__raises_value_error(self): + with pytest.raises(ValueError, match="missing.*input"): + evaluation_suite.validate_task_result({"output": "world"}) + + # --- non-dict results: auto-wrapping --- + + def test_validate_task_result__string_result__wrapped_as_output(self): + result = evaluation_suite.validate_task_result("just a string") + assert result == {"output": "just a string"} + + def test_validate_task_result__string_result_with_input_data__wrapped_with_input( + self, + ): result = evaluation_suite.validate_task_result( - {"input": "hello", "output": "world"} + "just a string", input_data={"question": "hi"} ) - assert result == {"input": "hello", "output": "world"} + assert result == {"input": {"question": "hi"}, "output": "just a string"} + + def test_validate_task_result__int_result__wrapped_as_output(self): + result = evaluation_suite.validate_task_result(42) + assert result == {"output": 42} + + def test_validate_task_result__none_result__wrapped_as_output(self): + result = evaluation_suite.validate_task_result(None) + assert result == {"output": None} + + def 
test_validate_task_result__none_input_data__not_included_in_wrapper(self): + result = evaluation_suite.validate_task_result("response", input_data=None) + assert result == {"output": "response"} + assert "input" not in result + + def test_validate_task_result__list_result_with_input_data__wrapped_correctly(self): + result = evaluation_suite.validate_task_result( + ["a", "b"], input_data={"prompt": "list me things"} + ) + assert result == {"input": {"prompt": "list me things"}, "output": ["a", "b"]} class TestInternalRunOptimizationSuite: