Testing

Agentspan has a first-class testing module that lets you test agent behavior without making LLM API calls or running a server. Tests are deterministic, fast, and runnable in CI.

Import

from agentspan.agents.testing import mock_run, MockEvent, expect, record, replay

mock_run

mock_run runs an agent with a scripted sequence of events instead of calling an LLM:

from agentspan.agents import Agent, tool
from agentspan.agents.testing import mock_run, MockEvent, expect

@tool
def search_web(query: str) -> str:
    """Search the web."""
    return "results"

agent = Agent(
    name="research_bot",
    model="openai/gpt-4o",
    tools=[search_web],
)

result = mock_run(
    agent,
    "What is agentspan?",
    events=[
        MockEvent.thinking("I should search for information about agentspan."),
        MockEvent.tool_call("search_web", {"query": "agentspan Python agent runtime"}),
        MockEvent.tool_result("search_web", "Agentspan is an open source Python runtime for AI agents."),
        MockEvent.done("Agentspan is an open source Python runtime for building AI agents."),
    ]
)

expect(result).completed().output_contains("Agentspan").used_tool("search_web")

mock_run signature:

mock_run(
    agent: Agent,
    prompt: str,
    events: list[MockEvent],
    context_state: dict | None = None,   # initial ToolContext.state
) -> AgentResult

MockEvent factory methods

Method — Description
MockEvent.thinking(content) — Agent internal reasoning step
MockEvent.tool_call(tool_name, args) — LLM calls a tool with given arguments
MockEvent.tool_result(tool_name, result) — Tool returns a result
MockEvent.message(content) — Agent sends an intermediate message
MockEvent.handoff(target) — Control passes to a named agent
MockEvent.guardrail_pass(name) — A guardrail passes
MockEvent.guardrail_fail(name, message) — A guardrail fails (triggers retry/fix/etc.)
MockEvent.waiting(content) — Agent pauses for human approval
MockEvent.error(message) — An error occurs
MockEvent.done(output) — Agent completes with this output

expect() fluent assertions

Chain assertions on the result:

expect(result)
    .completed()                          # status is COMPLETED
    .output_contains("Paris")             # output contains substring
    .output_matches(r"capital.*France")   # output matches regex
    .used_tool("search_web")              # tool was called at least once
    .used_tool("search_web", args={"query": "capital of France"})  # with specific args
    .max_turns(5)                         # used at most 5 turns
    .no_errors()                          # no error events
    .guardrail_passed("no_pii")           # named guardrail passed

All assertions raise AssertionError with descriptive messages on failure. You can also assert failure cases:

expect(result).failed()                          # status is FAILED
expect(result).guardrail_failed("safety_check")  # named guardrail failed

Function-based assertions

For more complex checks, inspect the result directly:

result = mock_run(agent, "prompt", events=[...])

# Check output
assert "expected phrase" in result.output
assert result.status == "COMPLETED"

# Check tool calls
tool_calls = [t for t in result.tool_calls if t['name'] == "search_web"]
assert len(tool_calls) == 1
assert tool_calls[0]['args']["query"] == "expected query"

# Check structured output (mock_run returns a string — parse JSON if needed)
import json
data = json.loads(result.output)
assert data["city"] == "San Francisco"

Testing tool side effects

Test that tools are called with correct arguments and that state is managed properly:

from agentspan.agents import Agent, tool
from agentspan.agents.testing import mock_run, MockEvent, expect

@tool
def send_email(to: str, subject: str, body: str) -> dict:
    """Send an email."""
    return {"sent": True, "to": to}

agent = Agent(
    name="email_bot",
    model="openai/gpt-4o",
    tools=[send_email],
)

result = mock_run(
    agent,
    "Send a welcome email to alice@example.com",
    events=[
        MockEvent.tool_call("send_email", {
            "to": "alice@example.com",
            "subject": "Welcome!",
            "body": "Welcome to our platform...",
        }),
        MockEvent.tool_result("send_email", {"sent": True, "to": "alice@example.com"}),
        MockEvent.done("Email sent successfully to alice@example.com."),
    ]
)

expect(result).completed().used_tool("send_email", args={
    "to": "alice@example.com",
    "subject": "Welcome!",
})

Testing HITL flows

Test human-in-the-loop interactions:

from agentspan.agents import Agent, tool
from agentspan.agents.testing import mock_run, MockEvent, expect

@tool(approval_required=True)
def delete_file(path: str) -> dict:
    """Delete a file. Requires approval."""
    return {"deleted": True, "path": path}

agent = Agent(name="file_manager", model="openai/gpt-4o", tools=[delete_file])

# Test the approval path
result = mock_run(
    agent,
    "Delete /tmp/old-logs.txt",
    events=[
        MockEvent.tool_call("delete_file", {"path": "/tmp/old-logs.txt"}),
        MockEvent.waiting("Agent wants to delete /tmp/old-logs.txt. Approve?"),
        MockEvent.tool_result("delete_file", {"deleted": True, "path": "/tmp/old-logs.txt"}),
        MockEvent.done("File /tmp/old-logs.txt has been deleted."),
    ]
)
expect(result).completed().used_tool("delete_file")

# Test the rejection path
result = mock_run(
    agent,
    "Delete /etc/hosts",
    events=[
        MockEvent.tool_call("delete_file", {"path": "/etc/hosts"}),
        MockEvent.waiting("Agent wants to delete /etc/hosts. Approve?"),
        MockEvent.done("I was not able to delete /etc/hosts — the action was denied."),
    ]
)
expect(result).completed().output_contains("denied")

Testing multi-agent pipelines

Test sequential pipelines:

researcher = Agent(name="researcher", model="openai/gpt-4o",
                   tools=[search_web], instructions="Research the topic.")
writer = Agent(name="writer", model="openai/gpt-4o",
               instructions="Write an article.")

pipeline = researcher >> writer

result = mock_run(
    pipeline,
    "Write about Python asyncio",
    events=[
        # Researcher turn
        MockEvent.tool_call("search_web", {"query": "Python asyncio overview"}),
        MockEvent.tool_result("search_web", "asyncio is Python's async I/O framework..."),
        MockEvent.handoff("writer"),
        # Writer turn
        MockEvent.done("# Python asyncio\nasyncio enables concurrent code using async/await..."),
    ]
)

expect(result).completed().used_tool("search_web")

Record and replay

Record a real execution (with an actual LLM) and replay it deterministically in tests:

from agentspan.agents.testing import record, replay

# Record a real run (calls LLM)
recording = record(agent, "What's the capital of France?")
recording.save("tests/fixtures/capital_query.json")

# Replay it (no LLM, no server)
result = replay("tests/fixtures/capital_query.json")
expect(result).completed().output_contains("Paris")

This is useful for:

  • Capturing known-good behavior as regression tests
  • Running existing test cases against new model versions
  • Debugging: record a failing production execution, replay locally

pytest integration

import pytest
from agentspan.agents.testing import mock_run, MockEvent, expect

# Unit tests — no marker needed: no LLM, no server, fast
class TestWeatherAgent:
    def test_weather_query(self, weather_agent):
        result = mock_run(
            weather_agent,
            "Weather in NYC?",
            events=[
                MockEvent.tool_call("get_weather", {"city": "New York"}),
                MockEvent.tool_result("get_weather", {"temp_f": 65, "condition": "Cloudy"}),
                MockEvent.done("New York is currently 65°F and cloudy."),
            ]
        )
        expect(result).completed().output_contains("65").used_tool("get_weather")

    def test_handles_unknown_city(self, weather_agent):
        result = mock_run(
            weather_agent,
            "Weather in Atlantis?",
            events=[
                MockEvent.tool_call("get_weather", {"city": "Atlantis"}),
                MockEvent.tool_result("get_weather", {"error": "City not found"}),
                MockEvent.done("I couldn't find weather data for Atlantis."),
            ]
        )
        expect(result).completed().output_contains("couldn't find")

# Mark as integration test — calls real LLM, requires server
@pytest.mark.integration
class TestWeatherAgentIntegration:
    def test_real_weather_query(self, weather_agent):
        from agentspan.agents import run
        result = run(weather_agent, "Weather in San Francisco?")
        assert result.status == "COMPLETED"
        assert len(result.output) > 10

Run unit tests only:

pytest tests/ -m "not integration"

Run integration tests:

pytest tests/ -m integration

Evaluating output correctness

For evaluating LLM output quality (not just structure), use CorrectnessEval:

from agentspan.agents import AgentRuntime
from agentspan.agents.testing import CorrectnessEval, EvalCase

eval_runner = CorrectnessEval(runtime=AgentRuntime())

cases = [
    EvalCase(
        name="capital_of_france",
        agent=agent,
        prompt="What is the capital of France?",
        expect_output_contains=["Paris"],
        expect_status="COMPLETED",
    ),
]

suite = eval_runner.run(cases)
suite.print_summary()
assert suite.all_passed