Add skill marketplace and refresh README

black-yt · black-yt · commit 500d8340179d · 2026-04-14T08:07:50.000+08:00
diff --git a/README.md b/README.md
diff --git a/docs/architecture.md b/docs/architecture.md
@@ -241,6 +241,7 @@ class Operator(Protocol):
 | `state_view` | the state slice visible to this node |
 | `inbox` | incoming A2A messages |
 | `artifacts` | visible artifacts |
+| `skills` | names of the skills attached to this node for this operator execution |
 | `working_dir` | working directory |
 | `session_policy` | whether to create a new session, reuse one, or force resume |
 | `tool_policy` | allowed tools and permission level |
diff --git a/examples/planner_coder_reviewer.py b/examples/planner_coder_reviewer.py
@@ -29,12 +29,16 @@ def planner_script(_request):
 
 
 def coder_script(request):
+    payload = json.loads(request.instruction)
     return [
         ControllerEvent(
             kind="message_completed",
             payload={
                 "kind": "observation",
-                "text": f"Coder received {len(request.metadata)} metadata keys and produced a draft.",
+                "text": (
+                    f"Coder received {len(payload['skills'])} skills "
+                    f"and {len(request.metadata)} metadata keys and produced a draft."
+                ),
             },
         ),
         ControllerEvent(
@@ -84,9 +88,24 @@ def main() -> None:
     graph.add_operator("coder_op", DefaultOperator("coder_op", StaticController(coder_script), role="coder"))
     graph.add_operator("reviewer_op", DefaultOperator("reviewer_op", StaticController(reviewer_script), role="reviewer"))
 
-    graph.add_node("plan", operator="planner_op", objective="Plan the work")
-    graph.add_node("implement", operator="coder_op", objective="Implement the work")
-    graph.add_node("review", operator="reviewer_op", objective="Review the work")
+    graph.add_node(
+        "plan",
+        operator="planner_op",
+        objective="Plan the work",
+        skills=["research-paper-search", "literature-synthesis"],
+    )
+    graph.add_node(
+        "implement",
+        operator="coder_op",
+        objective="Implement the work",
+        skills=["experiment-planning", "dataset-triage"],
+    )
+    graph.add_node(
+        "review",
+        operator="reviewer_op",
+        objective="Review the work",
+        skills=["citation-audit", "result-audit"],
+    )
     graph.add_edge("plan", "implement")
     graph.add_edge("implement", "review")
 
diff --git a/skills/README.md b/skills/README.md
@@ -0,0 +1,52 @@
+# AgentWorld Skills
+
+AgentWorld supports a repository-local skill marketplace under `skills/`.
+
+Each skill is a reusable package of execution guidance that can be attached to an individual graph node, without forcing the same behavior onto every other node in the graph.
+
+## Structure
+
+Each skill usually lives in its own folder:
+
+```text
+skills/
+└── skill-name/
+    ├── SKILL.md
+    ├── references/
+    ├── scripts/
+    └── assets/
+```
+
+Only `SKILL.md` is required. The other folders are optional.
+
+## Included Skills
+
+| Skill | Purpose |
+| --- | --- |
+| `research-paper-search` | paper discovery, identifier collection, source triage |
+| `literature-synthesis` | theme extraction, contradiction mapping, gap analysis |
+| `citation-audit` | bibliography validation, metadata checks, citation hygiene |
+| `experiment-planning` | executable research plans, deliverables, validation gates |
+| `result-audit` | output review, unsupported claim detection, evidence checks |
+
+## Usage
+
+Attach skills directly to a graph node:
+
+```python
+graph.add_node(
+    "plan",
+    operator="planner",
+    objective="Search the literature and frame the study",
+    skills=["research-paper-search", "literature-synthesis"],
+)
+```
+
+The runtime injects the selected skill names into the operator request, so the operator can build prompts and execution context for that specific node.
+
+## Notes
+
+- Skills are public project assets and should be written in English
+- Skills should be specific enough to be reusable, but not so narrow that they only fit one task
+- If a skill needs code or references, keep them inside the skill folder
+- Private working notes belong in ignored local files, not in `skills/`
diff --git a/skills/citation-audit/SKILL.md b/skills/citation-audit/SKILL.md
@@ -0,0 +1,38 @@
+---
+name: citation-audit
+description: Use when an operator must verify citation accuracy, bibliography consistency, metadata quality, and whether claims are actually supported by the cited sources.
+---
+
+# Citation Audit
+
+## When to Use This Skill
+
+Use this skill when the node needs to:
+
+- verify that citations support the written claim
+- check metadata such as title, authors, venue, year, and DOI
+- detect missing or duplicate references
+- clean up a bibliography before release
+- review whether a writeup overstates what a cited paper proves
+
+## Workflow
+
+1. Check whether every major claim has a citation.
+2. For each citation, confirm that the source actually supports the stated claim.
+3. Normalize key metadata fields.
+4. Remove duplicates and weak citations.
+5. Flag unsupported or overstated claims.
+
+## Expected Outputs
+
+- a citation issue list
+- corrected metadata suggestions
+- unsupported claim warnings
+- a cleaned bibliography checklist
+
+## Quality Rules
+
+- citation presence is not enough; support must be real
+- prefer primary papers over derivative references
+- flag ambiguous or second-hand citations
+- keep corrections explicit and actionable
diff --git a/skills/experiment-planning/SKILL.md b/skills/experiment-planning/SKILL.md
@@ -0,0 +1,38 @@
+---
+name: experiment-planning
+description: Use when an operator needs to convert a research objective into an executable plan with deliverables, checkpoints, dependencies, and validation criteria.
+---
+
+# Experiment Planning
+
+## When to Use This Skill
+
+Use this skill when the node needs to:
+
+- turn a vague research goal into a concrete plan
+- define inputs, outputs, metrics, and success criteria
+- break work into stages for planner, coder, and reviewer nodes
+- identify risky steps before execution begins
+- decide what evidence is needed to claim success
+
+## Workflow
+
+1. Rewrite the objective as a measurable question.
+2. List required inputs, tools, datasets, and dependencies.
+3. Break the task into stages with expected deliverables.
+4. Add validation gates for each stage.
+5. Mark high-risk assumptions and fallback paths.
+
+## Expected Outputs
+
+- a staged execution plan
+- a deliverable checklist
+- a risk register
+- validation checkpoints
+
+## Quality Rules
+
+- plans should be executable, not aspirational
+- every stage should produce an inspectable artifact
+- note blockers early instead of hiding them in later steps
+- define what counts as success before running the work
diff --git a/skills/literature-synthesis/SKILL.md b/skills/literature-synthesis/SKILL.md
@@ -0,0 +1,38 @@
+---
+name: literature-synthesis
+description: Use when an operator already has a paper set and needs to extract themes, claims, contradictions, methods, limitations, and open gaps.
+---
+
+# Literature Synthesis
+
+## When to Use This Skill
+
+Use this skill when the node needs to:
+
+- summarize a set of related papers
+- compare methods and assumptions
+- identify agreement and disagreement across studies
+- map limitations and open questions
+- produce a structured literature review instead of isolated summaries
+
+## Workflow
+
+1. Group papers by method, dataset, or research question.
+2. Extract the claim, method, evidence, and limitation for each paper.
+3. Merge repeated findings into themes.
+4. Highlight contradictions explicitly and explain what differs.
+5. End with a gap map: what is still untested, weakly supported, or missing.
+
+## Expected Outputs
+
+- a theme table
+- a method comparison block
+- a contradiction list
+- a research gap summary
+
+## Quality Rules
+
+- avoid turning one paper into a field-wide claim
+- always separate result from interpretation
+- keep limitations close to the claim they weaken
+- if evidence is thin, say so directly
diff --git a/skills/research-paper-search/SKILL.md b/skills/research-paper-search/SKILL.md
@@ -0,0 +1,38 @@
+---
+name: research-paper-search
+description: Use when an operator needs to find relevant papers, databases, identifiers, benchmark references, or primary-source evidence before planning or implementation.
+---
+
+# Research Paper Search
+
+## When to Use This Skill
+
+Use this skill when the node needs to:
+
+- find primary papers for a topic
+- identify canonical baselines or benchmark papers
+- collect DOI, arXiv, PMID, or project URLs
+- narrow a broad problem into a tractable evidence set
+- distinguish primary sources from commentary or summaries
+
+## Workflow
+
+1. Start from the task objective and convert it into 2-4 search queries.
+2. Prefer primary sources: papers, official datasets, benchmark repos, and technical documentation.
+3. Capture identifiers early: title, year, venue, DOI, arXiv id, project URL.
+4. Separate "must-read" papers from "background only" papers.
+5. Return a short evidence map instead of a loose list.
+
+## Expected Outputs
+
+- a ranked paper list
+- a baseline list
+- a source table with identifiers
+- unresolved search gaps that require another pass
+
+## Quality Rules
+
+- prefer primary sources over blog summaries
+- note publication year and venue whenever available
+- call out uncertainty when a result is only weakly relevant
+- do not claim a benchmark or baseline is standard without evidence
diff --git a/skills/result-audit/SKILL.md b/skills/result-audit/SKILL.md
@@ -0,0 +1,38 @@
+---
+name: result-audit
+description: Use when an operator must review outputs for unsupported conclusions, weak evidence, missing ablations, broken assumptions, or incomplete reporting.
+---
+
+# Result Audit
+
+## When to Use This Skill
+
+Use this skill when the node needs to:
+
+- review a report, notebook, or artifact bundle
+- detect unsupported conclusions
+- check whether the evidence matches the claimed contribution
+- flag missing baselines, controls, or ablations
+- judge whether the final output is ready for handoff or publication
+
+## Workflow
+
+1. Read the stated claim before the evidence.
+2. Map each important claim to the supporting artifact or result.
+3. Check for missing comparisons, controls, or caveats.
+4. Identify where the output overstates certainty.
+5. Return findings as prioritized review items.
+
+## Expected Outputs
+
+- a severity-ranked review list
+- unsupported claim flags
+- missing evidence or baseline flags
+- final release recommendation
+
+## Quality Rules
+
+- focus on evidence, not writing style
+- do not accept "looks plausible" as support
+- call out missing baselines explicitly
+- separate hard failures from optional improvements
diff --git a/src/agentworld/graph/builder.py b/src/agentworld/graph/builder.py
@@ -55,10 +55,13 @@ def add_node(
         objective: str | None = None,
         role: str | None = None,
         input_selector: StateSelector | None = None,
+        skills: Sequence[str] | None = None,
         metadata: dict[str, Any] | None = None,
     ) -> "AgentGraph":
         if name in self.nodes:
             raise ValueError(f"Node already exists: {name}")
+        resolved_metadata = dict(metadata or {})
+        resolved_skills = [str(skill) for skill in (skills or resolved_metadata.pop("skills", []))]
         self.nodes[name] = GraphNode(
             name=name,
             kind=kind,
@@ -67,7 +70,8 @@ def add_node(
             objective=objective,
             role=role,
             input_selector=input_selector,
-            metadata=dict(metadata or {}),
+            skills=resolved_skills,
+            metadata=resolved_metadata,
         )
         return self
 
diff --git a/src/agentworld/graph/node.py b/src/agentworld/graph/node.py
@@ -15,4 +15,5 @@ class GraphNode:
     objective: str | None = None
     role: str | None = None
     input_selector: StateSelector | None = None
+    skills: list[str] = field(default_factory=list)
     metadata: dict[str, Any] = field(default_factory=dict)
diff --git a/src/agentworld/operator/base.py b/src/agentworld/operator/base.py
@@ -51,7 +51,11 @@ def invoke(self, request: OperatorRequest, runtime: RuntimeContext) -> OperatorR
             instruction=self._build_instruction(request),
             tool_policy=request.tool_policy.to_dict(),
             timeout_s=request.timeout_s,
-            metadata={"graph_id": runtime.graph_id, "node_name": runtime.node_name},
+            metadata={
+                "graph_id": runtime.graph_id,
+                "node_name": runtime.node_name,
+                "skills": list(request.skills),
+            },
         )
         handle = self.controller.start(start_request)
         return self._collect_result(request, runtime, handle)
@@ -63,7 +67,11 @@ def resume(self, request: OperatorResumeRequest, runtime: RuntimeContext) -> Ope
             instruction=self._build_instruction(request),
             tool_policy=request.tool_policy.to_dict(),
             timeout_s=request.timeout_s,
-            metadata={"graph_id": runtime.graph_id, "node_name": runtime.node_name},
+            metadata={
+                "graph_id": runtime.graph_id,
+                "node_name": runtime.node_name,
+                "skills": list(request.skills),
+            },
         )
         handle = self.controller.resume(resume_request)
         return self._collect_result(request, runtime, handle)
@@ -75,6 +83,7 @@ def _build_instruction(self, request: OperatorRequest) -> str:
             "state_view": request.state_view,
             "inbox": [message.to_dict() for message in request.inbox],
             "artifacts": [artifact.to_dict() for artifact in request.artifacts],
+            "skills": list(request.skills),
             "metadata": request.metadata,
         }
         rendered = json.dumps(payload, indent=2, ensure_ascii=True, default=str)
diff --git a/src/agentworld/operator/models.py b/src/agentworld/operator/models.py
@@ -59,6 +59,7 @@ class OperatorRequest:
     state_view: Mapping[str, Any] = field(default_factory=dict)
     inbox: list[A2AEnvelope] = field(default_factory=list)
     artifacts: list[Artifact] = field(default_factory=list)
+    skills: list[str] = field(default_factory=list)
     working_dir: Path | None = None
     session_policy: SessionPolicy = "new"
     tool_policy: ToolPolicy = field(default_factory=ToolPolicy)
diff --git a/src/agentworld/runtime/executor.py b/src/agentworld/runtime/executor.py
@@ -77,13 +77,18 @@ def invoke(
                 )
             else:
                 tool_policy = ToolPolicy()
+            node_skills = [str(skill) for skill in node.skills]
+            legacy_skills = node.metadata.get("skills")
+            if not node_skills and isinstance(legacy_skills, list):
+                node_skills = [str(skill) for skill in legacy_skills]
             request = OperatorRequest(
                 operator_id=node.operator_id or node_name,
                 role=node.role or node_name,
                 objective=node.objective or f"Execute node {node_name}",
                 state_view=state_view,
                 inbox=list(inboxes.pop(node_name, [])),
                 artifacts=list(artifacts),
+                skills=node_skills,
                 working_dir=cwd,
                 tool_policy=tool_policy,
                 metadata={"graph_id": self.graph.graph_id, "node_name": node_name},
diff --git a/tests/test_graph.py b/tests/test_graph.py