microsoft · husseinmozannar · Jun 18, 2025 · Jun 17, 2025 · Jun 17, 2025 · Jun 18, 2025
diff --git a/experiments/eval/sample_eval_systems.py b/experiments/eval/sample_eval_systems.py
@@ -0,0 +1,84 @@
+from autogen_core.models import ChatCompletionClient
+from systems import MagenticUIAutonomousSystem
+from systems.magentic_one_system import MagenticOneSystem
+from magentic_ui.eval.benchmarks import WebVoyagerBenchmark
+import os
+
+def test_magentic_ui_system():
+    default_client_config = {
+        "provider": "OpenAIChatCompletionClient",
+        "config": {
+            "model": "gpt-4o-2024-08-06",
+        },
+        "max_retries": 10,
+    }
+
+    system = MagenticUIAutonomousSystem(
+        endpoint_config_orch=default_client_config,
+        endpoint_config_websurfer=default_client_config,
+        endpoint_config_coder=default_client_config,
+        endpoint_config_file_surfer=default_client_config,
+        use_local_browser=True,
+        web_surfer_only=True,
+    )
+
+    client = ChatCompletionClient.load_component(default_client_config)
+
+    benchmark = WebVoyagerBenchmark(
+        data_dir="WebVoyager",
+        eval_method="gpt_eval",
+        model_client=client,
+    )
+    benchmark.download_dataset()
+    benchmark.load_dataset()
+    test_task = benchmark.tasks["Allrecipes--0"]
+    print(test_task)
+    os.makedirs("test_output_magentic_ui", exist_ok=True)
+    answer = system.get_answer(
+        task_id="Allrecipes--0",
+        task=test_task,
+        output_dir="test_output_magentic_ui",
+    )
+    print(answer)
+    score = benchmark.evaluator(test_task, answer)
+    print(score)
+
+
+def test_magentic_one_system():
+    default_client_config = {
+        "provider": "OpenAIChatCompletionClient",
+        "config": {
+            "model": "gpt-4o-2024-08-06",
+        },
+        "max_retries": 10,
+    }
+
+    system = MagenticOneSystem(
+        model_client_config=default_client_config,
+        web_surfer_only=True,
+    )
+
+    client = ChatCompletionClient.load_component(default_client_config)
+
+    benchmark = WebVoyagerBenchmark(
+        data_dir="WebVoyager",
+        eval_method="gpt_eval",
+        model_client=client,
+    )
+    benchmark.download_dataset()
+    benchmark.load_dataset()
+    test_task = benchmark.tasks["Allrecipes--0"]
+    print(test_task)
+    os.makedirs("test_output_magentic_one", exist_ok=True)
+    answer = system.get_answer(
+        task_id="Allrecipes--0",
+        task=test_task,
+        output_dir="test_output_magentic_one",
+    )
+    print(answer)
+    score = benchmark.evaluator(test_task, answer)
+    print(score)
+
+
+if __name__ == "__main__":
+    test_magentic_one_system()
diff --git a/experiments/eval/systems/__init__.py b/experiments/eval/systems/__init__.py
@@ -1,4 +1,5 @@
 from .magentic_ui_sim_user_system import MagenticUISimUserSystem
 from .magentic_ui_system import MagenticUIAutonomousSystem
+from .magentic_one_system import MagenticOneSystem
 
-__all__ = ["MagenticUISimUserSystem", "MagenticUIAutonomousSystem"]
+__all__ = ["MagenticUISimUserSystem", "MagenticUIAutonomousSystem", "MagententicOneSystem"]
diff --git a/experiments/eval/systems/magentic_one_system.py b/experiments/eval/systems/magentic_one_system.py
@@ -0,0 +1,240 @@
+import asyncio
+import json
+import os
+import aiofiles
+import logging
+import datetime
+from PIL import Image
+from pydantic import BaseModel
+from typing import List, Dict, Any, Tuple
+from autogen_core.models import ChatCompletionClient
+from autogen_core import Image as AGImage
+from autogen_agentchat.base import TaskResult, ChatAgent
+from autogen_agentchat.messages import (
+    MultiModalMessage,
+    TextMessage,
+)
+
+from autogen_ext.agents.file_surfer import FileSurfer
+from autogen_ext.agents.web_surfer import MultimodalWebSurfer
+from autogen_ext.agents.magentic_one import MagenticOneCoderAgent
+from autogen_ext.code_executors.local import LocalCommandLineCodeExecutor
+from autogen_agentchat.agents import CodeExecutorAgent
+from autogen_agentchat.teams import MagenticOneGroupChat
+from magentic_ui.eval.basesystem import BaseSystem
+from magentic_ui.eval.models import BaseTask, BaseCandidate, WebVoyagerCandidate
+from magentic_ui.types import CheckpointEvent
+
+logger = logging.getLogger(__name__)
+logging.getLogger("autogen").setLevel(logging.WARNING)
+logging.getLogger("autogen.agentchat").setLevel(logging.WARNING)
+logging.getLogger("autogen_agentchat.events").setLevel(logging.WARNING)
+
+
+class LogEventSystem(BaseModel):
+    """
+    Data model for logging events.
+
+    Attributes:
+        source (str): The source of the event (e.g., agent name).
+        content (str): The content/message of the event.
+        timestamp (str): ISO-formatted timestamp of the event.
+        metadata (Dict[str, str]): Additional metadata for the event.
+    """
+
+    source: str
+    content: str
+    timestamp: str
+    metadata: Dict[str, str] = {}
+
+
+class MagenticOneSystem(BaseSystem):
+    """
+    MagenticOneSystem
+
+    Args:
+        name (str): Name of the system instance.
+        model_client_config (Dict[str, Any]): Model client config.
+        web_surfer_only (bool): If True, only the web surfer agent is used.
+        dataset_name (str): Name of the evaluation dataset (e.g., "Gaia").
+    """
+
+    def __init__(
+        self,
+        model_client_config: Dict[str, Any],
+        web_surfer_only: bool = False,
+        name: str = "MagenticOneSystem",
+        dataset_name: str = "Gaia",
+    ):
+        super().__init__(name)
+        self.candidate_class = WebVoyagerCandidate
+        self.model_client_config = model_client_config
+        self.dataset_name = dataset_name
+        self.web_surfer_only = web_surfer_only
+
+    def get_answer(
+        self, task_id: str, task: BaseTask, output_dir: str
+    ) -> BaseCandidate:
+        """
+        Runs the agent team to solve a given task and saves the answer and logs to disk.
+
+        Args:
+            task_id (str): Unique identifier for the task.
+            task (BaseTask): The task object containing the question and metadata.
+            output_dir (str): Directory to save logs, screenshots, and answer files.
+
+        Returns:
+            BaseCandidate: An object containing the final answer and any screenshots taken during execution.
+        """
+
+        async def _runner() -> Tuple[str, List[str]]:
+            """
+            Asynchronous runner that executes the agent team and collects the answer and screenshots.
+
+            Returns:
+                Tuple[str, List[str]]: The final answer string and a list of screenshot file paths.
+            """
+            messages_so_far: List[LogEventSystem] = []
+
+            task_question: str = task.question
+            # Adapted from MagenticOne. Minor change is to allow an explanation of the final answer before the final answer.
+            FINAL_ANSWER_PROMPT = """
+            output a FINAL ANSWER to the task.
+            The task is: {task}`
+
+            To output the final answer, use the following template: [any explanation for final answer] FINAL ANSWER: [YOUR FINAL ANSWER]
+            Don't put your answer in brackets or quotes. 
+            Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
+            ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
+            If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise.
+            If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
+            If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
+            You must answer the question and provide a smart guess if you are unsure. Provide a guess even if you have no idea about the answer.
+            """
+
+            model_client = ChatCompletionClient.load_component(self.model_client_config)
+
+            # Instantiate agents explicitly
+            ws = MultimodalWebSurfer(
+                "WebSurfer",
+                model_client=model_client,
+                to_save_screenshots=True,
+                debug_dir=output_dir,
+            )
+
+            agents: List[ChatAgent] = []
+            if self.web_surfer_only:
+                agents = [ws]
+            else:
+                coder = MagenticOneCoderAgent("Coder", model_client=model_client)
+                executor = CodeExecutorAgent(
+                    "ComputerTerminal", code_executor=LocalCommandLineCodeExecutor()
+                )
+                fs = FileSurfer("FileSurfer", model_client=model_client)
+
+                agents = [fs, ws, coder, executor]
+            m1_agent = MagenticOneGroupChat(
+                agents,
+                model_client=model_client,
+                final_answer_prompt=FINAL_ANSWER_PROMPT,
+            )
+
+            # Step 3: Prepare the task message
+            answer: str = ""
+            # check if file name is an image if it exists
+            if (
+                hasattr(task, "file_name")
+                and task.file_name
+                and task.file_name.endswith((".png", ".jpg", ".jpeg"))
+            ):
+                task_message = MultiModalMessage(
+                    content=[
+                        task_question,
+                        AGImage.from_pil(Image.open(task.file_name)),
+                    ],
+                    source="user",
+                )
+            else:
+                task_message = TextMessage(content=task_question, source="user")
+            # Step 4: Run the team on the task
+            async for message in m1_agent.run_stream(task=task_message):
+                # Store log events
+                message_str: str = ""
+                try:
+                    if isinstance(message, TaskResult) or isinstance(
+                        message, CheckpointEvent
+                    ):
+                        continue
+                    message_str = message.to_text()
+                    # Create log event with source, content and timestamp
+                    log_event = LogEventSystem(
+                        source=message.source,
+                        content=message_str,
+                        timestamp=datetime.datetime.now().isoformat(),
+                        metadata=message.metadata,
+                    )
+                    messages_so_far.append(log_event)
+                except Exception as e:
+                    logger.info(
+                        f"[likely nothing] When creating model_dump of message encountered exception {e}"
+                    )
+                    pass
+
+                # save to file
+                logger.info(f"Run in progress: {task_id}, message: {message_str}")
+                async with aiofiles.open(
+                    f"{output_dir}/{task_id}_messages.json", "w"
+                ) as f:
+                    # Convert list of logevent objects to list of dicts
+                    messages_json = [msg.model_dump() for msg in messages_so_far]
+                    await f.write(json.dumps(messages_json, indent=2))
+                # how the final answer is formatted:  "Final Answer: FINAL ANSWER: Actual final answer"
+
+            # get last message with source MagenticOneOrchestrator, might not be the last message
+            last_message_with_orchestrator = None
+            for message in messages_so_far:
+                if message.source == "MagenticOneOrchestrator":
+                    last_message_with_orchestrator = message
+            if last_message_with_orchestrator:
+                answer = last_message_with_orchestrator.content
+                answer = answer.split("FINAL ANSWER:")[0].strip()
+            else:
+                answer = messages_so_far[-1].content
+
+            assert isinstance(
+                answer, str
+            ), f"Expected answer to be a string, got {type(answer)}"
+
+            # save the usage of each of the client in a usage json file
+            def get_usage(model_client: ChatCompletionClient) -> Dict[str, int]:
+                return {
+                    "prompt_tokens": model_client.total_usage().prompt_tokens,
+                    "completion_tokens": model_client.total_usage().completion_tokens,
+                }
+
+            usage_json = {
+                "client": get_usage(model_client),
+            }
+            with open(f"{output_dir}/model_tokens_usage.json", "w") as f:
+                json.dump(usage_json, f)
+
+            # Step 5: Prepare the screenshots
+            screenshots_paths = []
+            # check the directory for screenshots which start with screenshot_raw_
+            for file in os.listdir(output_dir):
+                if file.startswith("screenshot_"):
+                    timestamp = file.split("_")[1]
+                    screenshots_paths.append(
+                        [timestamp, os.path.join(output_dir, file)]
+                    )
+
+            # restrict to last 15 screenshots by timestamp
+            screenshots_paths = sorted(screenshots_paths, key=lambda x: x[0])[-15:]
+            screenshots_paths = [x[1] for x in screenshots_paths]
+            return answer, screenshots_paths
+
+        # Step 6: Return the answer and screenshots
+        answer, screenshots_paths = asyncio.run(_runner())
+        answer = WebVoyagerCandidate(answer=answer, screenshots=screenshots_paths)
+        self.save_answer_to_disk(task_id, answer, output_dir)
+        return answer