Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 84 additions & 0 deletions experiments/eval/sample_eval_systems.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
from autogen_core.models import ChatCompletionClient
from systems import MagenticUIAutonomousSystem
from systems.magentic_one_system import MagenticOneSystem
from magentic_ui.eval.benchmarks import WebVoyagerBenchmark
import os

def test_magentic_ui_system():
default_client_config = {
"provider": "OpenAIChatCompletionClient",
"config": {
"model": "gpt-4o-2024-08-06",
},
"max_retries": 10,
}

system = MagenticUIAutonomousSystem(
endpoint_config_orch=default_client_config,
endpoint_config_websurfer=default_client_config,
endpoint_config_coder=default_client_config,
endpoint_config_file_surfer=default_client_config,
use_local_browser=True,
web_surfer_only=True,
)

client = ChatCompletionClient.load_component(default_client_config)

benchmark = WebVoyagerBenchmark(
data_dir="WebVoyager",
eval_method="gpt_eval",
model_client=client,
)
benchmark.download_dataset()
benchmark.load_dataset()
test_task = benchmark.tasks["Allrecipes--0"]
print(test_task)
os.makedirs("test_output_magentic_ui", exist_ok=True)
answer = system.get_answer(
task_id="Allrecipes--0",
task=test_task,
output_dir="test_output_magentic_ui",
)
print(answer)
score = benchmark.evaluator(test_task, answer)
print(score)


def test_magentic_one_system():
default_client_config = {
"provider": "OpenAIChatCompletionClient",
"config": {
"model": "gpt-4o-2024-08-06",
},
"max_retries": 10,
}

system = MagenticOneSystem(
model_client_config=default_client_config,
web_surfer_only=True,
)

client = ChatCompletionClient.load_component(default_client_config)

benchmark = WebVoyagerBenchmark(
data_dir="WebVoyager",
eval_method="gpt_eval",
model_client=client,
)
benchmark.download_dataset()
benchmark.load_dataset()
test_task = benchmark.tasks["Allrecipes--0"]
print(test_task)
os.makedirs("test_output_magentic_one", exist_ok=True)
answer = system.get_answer(
task_id="Allrecipes--0",
task=test_task,
output_dir="test_output_magentic_one",
)
print(answer)
score = benchmark.evaluator(test_task, answer)
print(score)


if __name__ == "__main__":
test_magentic_one_system()
3 changes: 2 additions & 1 deletion experiments/eval/systems/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .magentic_ui_sim_user_system import MagenticUISimUserSystem
from .magentic_ui_system import MagenticUIAutonomousSystem
from .magentic_one_system import MagenticOneSystem

__all__ = ["MagenticUISimUserSystem", "MagenticUIAutonomousSystem"]
__all__ = ["MagenticUISimUserSystem", "MagenticUIAutonomousSystem", "MagententicOneSystem"]
240 changes: 240 additions & 0 deletions experiments/eval/systems/magentic_one_system.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
import asyncio
import json
import os
import aiofiles
import logging
import datetime
from PIL import Image
from pydantic import BaseModel
from typing import List, Dict, Any, Tuple
from autogen_core.models import ChatCompletionClient
from autogen_core import Image as AGImage
from autogen_agentchat.base import TaskResult, ChatAgent
from autogen_agentchat.messages import (
MultiModalMessage,
TextMessage,
)

from autogen_ext.agents.file_surfer import FileSurfer
from autogen_ext.agents.web_surfer import MultimodalWebSurfer
from autogen_ext.agents.magentic_one import MagenticOneCoderAgent
from autogen_ext.code_executors.local import LocalCommandLineCodeExecutor
from autogen_agentchat.agents import CodeExecutorAgent
from autogen_agentchat.teams import MagenticOneGroupChat
from magentic_ui.eval.basesystem import BaseSystem
from magentic_ui.eval.models import BaseTask, BaseCandidate, WebVoyagerCandidate
from magentic_ui.types import CheckpointEvent

logger = logging.getLogger(__name__)
logging.getLogger("autogen").setLevel(logging.WARNING)
logging.getLogger("autogen.agentchat").setLevel(logging.WARNING)
logging.getLogger("autogen_agentchat.events").setLevel(logging.WARNING)


class LogEventSystem(BaseModel):
"""
Data model for logging events.

Attributes:
source (str): The source of the event (e.g., agent name).
content (str): The content/message of the event.
timestamp (str): ISO-formatted timestamp of the event.
metadata (Dict[str, str]): Additional metadata for the event.
"""

source: str
content: str
timestamp: str
metadata: Dict[str, str] = {}


class MagenticOneSystem(BaseSystem):
"""
MagenticOneSystem

Args:
name (str): Name of the system instance.
model_client_config (Dict[str, Any]): Model client config.
web_surfer_only (bool): If True, only the web surfer agent is used.
dataset_name (str): Name of the evaluation dataset (e.g., "Gaia").
"""

def __init__(
self,
model_client_config: Dict[str, Any],
web_surfer_only: bool = False,
name: str = "MagenticOneSystem",
dataset_name: str = "Gaia",
):
super().__init__(name)
self.candidate_class = WebVoyagerCandidate
self.model_client_config = model_client_config
self.dataset_name = dataset_name
self.web_surfer_only = web_surfer_only

def get_answer(
self, task_id: str, task: BaseTask, output_dir: str
) -> BaseCandidate:
"""
Runs the agent team to solve a given task and saves the answer and logs to disk.

Args:
task_id (str): Unique identifier for the task.
task (BaseTask): The task object containing the question and metadata.
output_dir (str): Directory to save logs, screenshots, and answer files.

Returns:
BaseCandidate: An object containing the final answer and any screenshots taken during execution.
"""

async def _runner() -> Tuple[str, List[str]]:
"""
Asynchronous runner that executes the agent team and collects the answer and screenshots.

Returns:
Tuple[str, List[str]]: The final answer string and a list of screenshot file paths.
"""
messages_so_far: List[LogEventSystem] = []

task_question: str = task.question
# Adapted from MagenticOne. Minor change is to allow an explanation of the final answer before the final answer.
FINAL_ANSWER_PROMPT = """
output a FINAL ANSWER to the task.
The task is: {task}`

To output the final answer, use the following template: [any explanation for final answer] FINAL ANSWER: [YOUR FINAL ANSWER]
Don't put your answer in brackets or quotes.
Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise.
If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
You must answer the question and provide a smart guess if you are unsure. Provide a guess even if you have no idea about the answer.
"""

model_client = ChatCompletionClient.load_component(self.model_client_config)

# Instantiate agents explicitly
ws = MultimodalWebSurfer(
"WebSurfer",
model_client=model_client,
to_save_screenshots=True,
debug_dir=output_dir,
)

agents: List[ChatAgent] = []
if self.web_surfer_only:
agents = [ws]
else:
coder = MagenticOneCoderAgent("Coder", model_client=model_client)
executor = CodeExecutorAgent(
"ComputerTerminal", code_executor=LocalCommandLineCodeExecutor()
)
fs = FileSurfer("FileSurfer", model_client=model_client)

agents = [fs, ws, coder, executor]
m1_agent = MagenticOneGroupChat(
agents,
model_client=model_client,
final_answer_prompt=FINAL_ANSWER_PROMPT,
)

# Step 3: Prepare the task message
answer: str = ""
# check if file name is an image if it exists
if (
hasattr(task, "file_name")
and task.file_name
and task.file_name.endswith((".png", ".jpg", ".jpeg"))
):
task_message = MultiModalMessage(
content=[
task_question,
AGImage.from_pil(Image.open(task.file_name)),
],
source="user",
)
else:
task_message = TextMessage(content=task_question, source="user")
# Step 4: Run the team on the task
async for message in m1_agent.run_stream(task=task_message):
# Store log events
message_str: str = ""
try:
if isinstance(message, TaskResult) or isinstance(
message, CheckpointEvent
):
continue
message_str = message.to_text()
# Create log event with source, content and timestamp
log_event = LogEventSystem(
source=message.source,
content=message_str,
timestamp=datetime.datetime.now().isoformat(),
metadata=message.metadata,
)
messages_so_far.append(log_event)
except Exception as e:
logger.info(
f"[likely nothing] When creating model_dump of message encountered exception {e}"
)
pass

# save to file
logger.info(f"Run in progress: {task_id}, message: {message_str}")
async with aiofiles.open(
f"{output_dir}/{task_id}_messages.json", "w"
) as f:
# Convert list of logevent objects to list of dicts
messages_json = [msg.model_dump() for msg in messages_so_far]
await f.write(json.dumps(messages_json, indent=2))
# how the final answer is formatted: "Final Answer: FINAL ANSWER: Actual final answer"

# get last message with source MagenticOneOrchestrator, might not be the last message
last_message_with_orchestrator = None
for message in messages_so_far:
if message.source == "MagenticOneOrchestrator":
last_message_with_orchestrator = message
if last_message_with_orchestrator:
answer = last_message_with_orchestrator.content
answer = answer.split("FINAL ANSWER:")[0].strip()
else:
answer = messages_so_far[-1].content

assert isinstance(
answer, str
), f"Expected answer to be a string, got {type(answer)}"

# save the usage of each of the client in a usage json file
def get_usage(model_client: ChatCompletionClient) -> Dict[str, int]:
return {
"prompt_tokens": model_client.total_usage().prompt_tokens,
"completion_tokens": model_client.total_usage().completion_tokens,
}

usage_json = {
"client": get_usage(model_client),
}
with open(f"{output_dir}/model_tokens_usage.json", "w") as f:
json.dump(usage_json, f)

# Step 5: Prepare the screenshots
screenshots_paths = []
# check the directory for screenshots which start with screenshot_raw_
for file in os.listdir(output_dir):
if file.startswith("screenshot_"):
timestamp = file.split("_")[1]
screenshots_paths.append(
[timestamp, os.path.join(output_dir, file)]
)

# restrict to last 15 screenshots by timestamp
screenshots_paths = sorted(screenshots_paths, key=lambda x: x[0])[-15:]
screenshots_paths = [x[1] for x in screenshots_paths]
return answer, screenshots_paths

# Step 6: Return the answer and screenshots
answer, screenshots_paths = asyncio.run(_runner())
answer = WebVoyagerCandidate(answer=answer, screenshots=screenshots_paths)
self.save_answer_to_disk(task_id, answer, output_dir)
return answer
Loading