Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -26,22 +26,46 @@
LLMTask = Callable[[Dict[str, Any]], Any]


def validate_task_result(result: Any) -> Dict[str, Any]:
    """Validate that a task function returned a well-formed result dict.

    A valid result is a :class:`dict` containing both an ``"input"`` and an
    ``"output"`` key; such a dict is returned unchanged.

    Args:
        result: Value returned by the task callable.

    Returns:
        The same dict, once both required keys are confirmed present.

    Raises:
        TypeError: If *result* is not a dict at all.
        ValueError: If *result* is a dict but lacks ``"input"`` or ``"output"``.
    """
    if isinstance(result, dict):
        lacking = {"input", "output"} - result.keys()
        if not lacking:
            return result
        raise ValueError(
            f"The task function must return a dict with 'input' and "
            f"'output' keys, but the returned dict is missing: "
            f"{lacking}. Got keys: {set(result.keys())}. "
            f"Example: return {{'input': data, 'output': response}}"
        )
    raise TypeError(
        f"The task function must return a dict with 'input' and "
        f"'output' keys, but it returned {type(result).__name__}. "
        f"Example: return {{'input': data, 'output': response}}"
    )
def validate_task_result(
    result: Any,
    input_data: Any = None,
) -> Dict[str, Any]:
    """Coerce the value returned by a task function into a result dict.

    Dict results are validated — both ``"input"`` and ``"output"`` keys must
    be present — and then passed through unchanged.

    Any non-dict value is wrapped automatically as ``{"output": result}``;
    when *input_data* is provided (not ``None``) the wrapper also carries it
    under the ``"input"`` key::

        {"input": input_data, "output": result}

    Args:
        result: Value returned by the task callable.
        input_data: Optional input that was passed to the task. Included in
            the wrapper dict as ``"input"`` when *result* is not a dict.

    Returns:
        A dict suitable for use as an experiment trace result.

    Raises:
        ValueError: If *result* is a dict missing ``"input"`` or ``"output"``.
    """
    if not isinstance(result, dict):
        # Auto-wrap anything that is not already a result dict.
        auto_wrapped: Dict[str, Any] = {"output": result}
        if input_data is not None:
            auto_wrapped["input"] = input_data
        return auto_wrapped

    absent = {"input", "output"} - result.keys()
    if absent:
        raise ValueError(
            f"The task function must return a dict with 'input' and "
            f"'output' keys, but the returned dict is missing: "
            f"{absent}. Got keys: {set(result.keys())}. "
            f"Example: return {{'input': data, 'output': response}}"
        )
    return result


class EvaluationSuite:
Expand Down Expand Up @@ -408,11 +432,14 @@ def run(
Run the evaluation suite against a task function.

The task function receives each test item's data dict and must return
a dict with "input" and "output" keys.
either a dict (with ``"input"`` and ``"output"`` as the supported keys)
or any other value, which will be automatically wrapped as
``{"input": <item data>, "output": <returned value>}``.

Args:
task: A callable that takes a dict (the item's data) and returns
a dict with "input" and "output" keys.
a dict with ``"input"``/``"output"`` keys, or any other value
to be auto-wrapped.
experiment_name_prefix: Optional prefix for auto-generated experiment name.
experiment_name: Optional explicit name for the experiment.
project_name: Optional project name for tracking.
Expand All @@ -434,7 +461,12 @@ def run(
EvaluationSuiteResult with pass/fail status based on execution policy.

Example:
>>> def my_llm_task(data: dict) -> dict:
>>> # Simplified: return any value; it is auto-wrapped internally
>>> def my_llm_task(data: dict) -> str:
... return call_my_llm(data["user_input"], user_tier=data.get("user_tier"))
>>>
>>> # Explicit: return a dict with "input" and "output" keys directly
>>> def my_llm_task_explicit(data: dict) -> dict:
... response = call_my_llm(data["user_input"], user_tier=data.get("user_tier"))
... return {"input": data, "output": response}
>>>
Expand Down Expand Up @@ -489,7 +521,8 @@ def __internal_api__run_optimization_suite__(

Args:
task: A callable that takes a dict (the item's data) and returns
a dict with "input" and "output" keys.
a dict with ``"input"``/``"output"`` keys, or any other value
to be auto-wrapped.
experiment_name_prefix: Optional prefix for auto-generated experiment name.
experiment_name: Optional explicit name for the experiment.
project_name: Optional project name for tracking.
Expand Down Expand Up @@ -517,7 +550,7 @@ def __internal_api__run_optimization_suite__(

@functools.wraps(task)
def _validated_task(data: Dict[str, Any]) -> Any:
return validate_task_result(task(data))
return validate_task_result(task(data), input_data=data)

suite_result = opik_evaluator.evaluate_suite(
dataset=self._dataset,
Expand Down
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
anthropic>=0.88.0  # pinned to avoid: AttributeError: 'OpenAICompletion' object has no attribute 'client'
crewai[anthropic, bedrock, google-genai]>=1.0.0
Original file line number Diff line number Diff line change
Expand Up @@ -623,25 +623,58 @@ def test_build_suite_result__integer_score_zero__run_fails(self):


class TestValidateTaskResult:
    """Tests for validate_task_result: dict validation and non-dict auto-wrapping.

    NOTE(review): the scraped diff interleaved removed-version lines into this
    class, leaving syntactically invalid Python (two bare positional
    expressions inside single calls). This is the reconstructed post-change
    version of the test class.
    """

    # --- dict results ---

    def test_validate_task_result__valid_dict__returns_as_is(self):
        result = evaluation_suite.validate_task_result(
            {"input": "hello", "output": "world"}
        )
        assert result == {"input": "hello", "output": "world"}

    def test_validate_task_result__dict_missing_output__raises_value_error(self):
        with pytest.raises(ValueError, match="missing.*output"):
            evaluation_suite.validate_task_result({"input": "hello"})

    def test_validate_task_result__dict_missing_both_keys__raises_value_error(self):
        with pytest.raises(ValueError, match="missing"):
            evaluation_suite.validate_task_result({"response": "hello"})

    def test_validate_task_result__dict_missing_input__raises_value_error(self):
        with pytest.raises(ValueError, match="missing.*input"):
            evaluation_suite.validate_task_result({"output": "world"})

    # --- non-dict results: auto-wrapping ---

    def test_validate_task_result__string_result__wrapped_as_output(self):
        result = evaluation_suite.validate_task_result("just a string")
        assert result == {"output": "just a string"}

    def test_validate_task_result__string_result_with_input_data__wrapped_with_input(
        self,
    ):
        result = evaluation_suite.validate_task_result(
            "just a string", input_data={"question": "hi"}
        )
        assert result == {"input": {"question": "hi"}, "output": "just a string"}

    def test_validate_task_result__int_result__wrapped_as_output(self):
        result = evaluation_suite.validate_task_result(42)
        assert result == {"output": 42}

    def test_validate_task_result__none_result__wrapped_as_output(self):
        result = evaluation_suite.validate_task_result(None)
        assert result == {"output": None}

    def test_validate_task_result__none_input_data__not_included_in_wrapper(self):
        result = evaluation_suite.validate_task_result("response", input_data=None)
        assert result == {"output": "response"}
        assert "input" not in result

    def test_validate_task_result__list_result_with_input_data__wrapped_correctly(self):
        result = evaluation_suite.validate_task_result(
            ["a", "b"], input_data={"prompt": "list me things"}
        )
        assert result == {"input": {"prompt": "list me things"}, "output": ["a", "b"]}

class TestInternalRunOptimizationSuite:
Expand Down
Loading