Testing Agents
This tutorial demonstrates how to test AI agents that use tools, perform multi-step reasoning, and maintain state across interactions.
Introduction
We’ll build and test an agent that can:
- Use multiple tools (search, calculator, database)
- Plan multi-step actions to accomplish goals
- Maintain state across interactions
- Handle failures and retry with different strategies
Our tests will validate:
- Tool selection logic
- Reasoning quality
- Task completion
- Error handling
- State management
Building a Simple Agent
First, let’s create an agent to test:
# Imports for the agent data models defined in this section.
from typing import Literal, Optional
from pydantic import BaseModel
class Tool(BaseModel):
    """Describe a tool that the agent can call."""

    # Identifier of the tool, e.g. "search" or "calculator".
    name: str
    # Human-readable summary of what the tool does.
    description: str
class AgentStep(BaseModel):
    """Record one reasoning step: the thought, the tool call and its result."""

    # The agent's stated reason for taking this step.
    thought: str
    # Name of the tool that was invoked.
    tool: str
    # Input string handed to the tool.
    tool_input: str
    # String returned by the tool for that input.
    observation: str
class AgentResponse(BaseModel):
    """Result of one agent run: the steps taken plus the final answer."""

    # Steps the agent performed, in execution order (may be empty).
    steps: list[AgentStep]
    # Text answer returned to the caller.
    final_answer: str
    # Whether the agent considers the task completed.
    success: bool
class SimpleAgent:
    """Keyword-routed demo agent that answers a task with at most one tool call.

    Routing is rule-based (no LLM involved), so the behaviour is deterministic
    and easy to assert on in tests.

    Attributes:
        tools: Mapping of tool name to its ``Tool`` description.
        max_steps: Step budget for a run (set to 5; ``run`` never takes more
            than one step, so it is not enforced here).
    """

    def __init__(self):
        self.tools = {
            "search": Tool(
                name="search",
                description="Search the internet for information",
            ),
            "calculator": Tool(
                name="calculator",
                description="Perform mathematical calculations",
            ),
            "database": Tool(
                name="database",
                description="Query a database for structured data",
            ),
        }
        self.max_steps = 5

    @staticmethod
    def _extract_expression(task: str) -> str:
        """Return the first arithmetic expression embedded in *task*, or ``""``.

        Whitespace is allowed inside the expression so that "15 * 23" is
        extracted as a whole instead of being cut off at the first space. A
        digit is required, so hyphenated words such as "state-of-the-art" do
        not count as math.
        """
        import re

        match = re.search(r"[(\-\s]*\d[\d\s+\-*/().]*", task)
        if match is None:
            return ""
        # Drop sentence punctuation / dangling operators left at the end.
        return match.group().strip().rstrip(".+-*/(").rstrip()

    def _use_tool(self, tool_name: str, tool_input: str) -> str:
        """Execute a tool (simplified for testing).

        Args:
            tool_name: One of "search", "calculator" or "database".
            tool_input: Text handed to the tool.

        Returns:
            The tool's observation string, ``"Error: ..."`` when the
            calculator cannot evaluate its input, or ``"Unknown tool"``.
        """
        if tool_name == "search":
            return f"Search results for '{tool_input}': [Relevant information...]"
        elif tool_name == "calculator":
            # Tool failures are reported as an observation rather than raised,
            # so the agent (and tests) can react to them.
            try:
                result = eval(tool_input)  # Don't do this in production!
                return str(result)
            except Exception as e:
                return f"Error: {e}"
        elif tool_name == "database":
            return f"Database query result for '{tool_input}': [Records...]"
        return "Unknown tool"

    def _run_step(self, thought: str, tool: str, tool_input: str) -> "AgentStep":
        """Call *tool* with *tool_input* and wrap the outcome in an ``AgentStep``."""
        return AgentStep(
            thought=thought,
            tool=tool,
            tool_input=tool_input,
            observation=self._use_tool(tool, tool_input),
        )

    def run(self, task: str) -> "AgentResponse":
        """Run the agent on a task.

        Routing order: calculator (the task says "calculate" or contains a
        number), then search ("search"/"find"), then database
        ("database"/"query"); anything else is answered without a tool.

        Args:
            task: Natural-language task description.

        Returns:
            An ``AgentResponse`` with zero or one step. ``success`` is False
            only when the calculator reports an error.
        """
        lowered = task.lower()
        expression = self._extract_expression(task)
        steps = []

        if "calculate" in lowered or expression:
            # Math task: fall back to the raw task when no expression is found.
            step = self._run_step(
                "I need to use the calculator for this math problem",
                "calculator",
                expression or task,
            )
            steps.append(step)
            final_answer = f"The answer is {step.observation}"
            success = "Error" not in step.observation
        elif "search" in lowered or "find" in lowered:
            step = self._run_step(
                "I should search for this information", "search", task
            )
            steps.append(step)
            final_answer = f"Based on my search: {step.observation}"
            success = True
        elif "database" in lowered or "query" in lowered:
            step = self._run_step(
                "I should query the database for this data", "database", task
            )
            steps.append(step)
            final_answer = f"Based on the database: {step.observation}"
            success = True
        else:
            # Default case: this task doesn't require tools.
            final_answer = "I can answer this directly: " + task
            success = True

        return AgentResponse(
            steps=steps,
            final_answer=final_answer,
            success=success,
        )
# Test 1: Tool Selection
Verify that the agent selects appropriate tools:
from giskard.checks import scenario, from_fn, Equality
# Module-level agent instance used by the test examples in this tutorial.
agent = SimpleAgent()
async def test_tool_selection():
    """Check that a multiplication prompt is routed to the calculator tool."""
    tc = scenario("tool_selection_calculator").interact(
        inputs="What is 15 * 23?",
        outputs=lambda inputs: agent.run(inputs),
    )

    # 1. The agent must have used at least one tool.
    tc = tc.check(
        from_fn(
            lambda trace: len(trace.last.outputs.steps) > 0,
            name="used_tools",
            success_message="Agent used tools",
            failure_message="Agent didn't use any tools",
        )
    )

    # 2. The first tool it picked must be the calculator.
    tc = tc.check(
        Equality(
            name="selected_calculator",
            expected="calculator",
            key="trace.last.outputs.steps[0].tool",
        )
    )

    # 3. The agent must report the run as successful.
    tc = tc.check(
        from_fn(
            lambda trace: trace.last.outputs.success,
            name="task_successful",
            success_message="Agent completed task successfully",
            failure_message="Agent failed to complete task",
        )
    )

    result = await tc.run()
    assert result.passed
# Test 2: Reasoning Quality
Evaluate the quality of the agent’s reasoning:
from giskard.agents.generators import Generator
from giskard.checks import scenario, LLMJudge, from_fn, set_default_generator

# Register the default generator (model "openai/gpt-5-mini").
# NOTE(review): presumably this is the model LLMJudge checks use when none is
# passed explicitly — confirm against the giskard.checks documentation.
set_default_generator(Generator(model="openai/gpt-5-mini"))
# Scenario: a research prompt should be reasoned about sensibly and routed to
# the search tool.
tc = (
    scenario("reasoning_quality_test")
    .interact(
        inputs="Find information about quantum computing",
        outputs=lambda inputs: agent.run(inputs),
    )
    .check(
        LLMJudge(
            name="reasoning_quality",
            prompt="""
            Evaluate the agent's reasoning process.

            Task: {{ inputs }}
            Thought: {{ outputs.steps[0].thought if outputs.steps else "No reasoning" }}
            Tool Selected: {{ outputs.steps[0].tool if outputs.steps else "None" }}

            Criteria:
            1. Is the reasoning logical?
            2. Is the tool selection appropriate for the task?
            3. Does the thought explain why the tool was chosen?

            Return 'passed: true' if the reasoning is sound.
            """,
        )
    )
    .check(
        from_fn(
            # Guard against an empty step list (as the prompt above already
            # does): an agent that used no tool should fail this check rather
            # than raise IndexError.
            lambda trace: bool(trace.last.outputs.steps)
            and trace.last.outputs.steps[0].tool == "search",
            name="correct_tool_for_research",
            success_message="Selected search for research task",
            failure_message="Wrong tool selected",
        )
    )
)
# Test 3: Multi-Step Agent Workflow
Test agents that perform multiple steps:
class MultiStepAgent(SimpleAgent):
    """Agent variant that chains a search and a calculation for combined tasks."""

    def run(self, task: str) -> AgentResponse:
        """Run agent with multi-step capability.

        Tasks mentioning both "research" and "calculate" get a fixed two-step
        plan (search, then calculator); every other task is delegated to
        ``SimpleAgent.run``.
        """
        lowered = task.lower()
        if not ("research" in lowered and "calculate" in lowered):
            return super().run(task)

        # Example: complex task requiring multiple tools, executed in order.
        plan = [
            ("First, I need to search for the data", "search", task),
            ("Now I'll calculate based on the data", "calculator", "100 * 2"),
        ]
        steps = [
            AgentStep(
                thought=thought,
                tool=tool,
                tool_input=tool_input,
                observation=self._use_tool(tool, tool_input),
            )
            for thought, tool, tool_input in plan
        ]

        return AgentResponse(
            steps=steps,
            final_answer=f"Based on my research and calculations: {steps[-1].observation}",
            success=True,
        )
# Agent instance exercised by the multi-step workflow scenario.
multi_agent = MultiStepAgent()
from giskard.checks import scenario, from_fn, LLMJudge
# Scenario: a combined research + calculation task must produce both steps,
# in a sensible order.
test_scenario = scenario("multi_step_agent_workflow").interact(
    inputs="Research the market size and calculate projected growth",
    outputs=lambda inputs: multi_agent.run(inputs),
)

# At least two steps were taken.
test_scenario = test_scenario.check(
    from_fn(
        lambda trace: len(trace.last.outputs.steps) >= 2,
        name="multiple_steps_taken",
        success_message="Agent performed multiple steps",
        failure_message="Agent didn't perform enough steps",
    )
)

# One of the steps used the search tool.
test_scenario = test_scenario.check(
    from_fn(
        lambda trace: any(
            step.tool == "search" for step in trace.last.outputs.steps
        ),
        name="performed_research",
        success_message="Agent performed research",
        failure_message="Agent skipped research step",
    )
)

# One of the steps used the calculator tool.
test_scenario = test_scenario.check(
    from_fn(
        lambda trace: any(
            step.tool == "calculator" for step in trace.last.outputs.steps
        ),
        name="performed_calculation",
        success_message="Agent performed calculation",
        failure_message="Agent skipped calculation step",
    )
)

# An LLM judge reviews the ordering of the steps.
test_scenario = test_scenario.check(
    LLMJudge(
        name="steps_logical_order",
        prompt="""
        Evaluate if the agent's steps are in a logical order.

        Task: {{ interactions[0].inputs }}
        Steps:
        {% for step in interactions[0].outputs.steps %}
        {{ loop.index }}. {{ step.thought }} -> {{ step.tool }}
        {% endfor %}

        Return 'passed: true' if steps are well-ordered.
        """,
    )
)
# Test 4: Error Handling
Verify that agents handle errors gracefully:
class RobustAgent(SimpleAgent):
    """Agent that falls back to search when the calculator reports an error."""

    def run(self, task: str) -> AgentResponse:
        """Try the calculator on *task*; on an error observation, search instead.

        ``success`` is always True: either the calculation worked or the
        search fallback produced an answer.
        """

        def attempt(thought, tool):
            # Every attempt feeds the raw task text to the chosen tool.
            return AgentStep(
                thought=thought,
                tool=tool,
                tool_input=task,
                observation=self._use_tool(tool, task),
            )

        # Try first approach
        steps = [attempt("I'll try using the calculator", "calculator")]

        if "Error" in steps[-1].observation:
            # Fallback strategy
            steps.append(attempt("Calculator failed, I'll search instead", "search"))
            final_answer = f"After trying different approaches: {steps[-1].observation}"
        else:
            final_answer = f"Result: {steps[-1].observation}"

        return AgentResponse(
            steps=steps,
            final_answer=final_answer,
            success=True,
        )
# Agent instance exercised by the error-handling scenario.
robust_agent = RobustAgent()
# Scenario: a prompt the calculator cannot evaluate forces the fallback path.
tc = scenario("error_handling_test").interact(
    inputs="What is the meaning of life?",  # Not a valid calculation
    outputs=lambda inputs: robust_agent.run(inputs),
)

# More than one step means the agent went beyond its first attempt.
tc = tc.check(
    from_fn(
        lambda trace: len(trace.last.outputs.steps) > 1,
        name="tried_fallback",
        success_message="Agent tried fallback strategy",
        failure_message="Agent didn't attempt recovery",
    )
)

# The run must still be reported as successful.
tc = tc.check(
    from_fn(
        lambda trace: trace.interactions[-1].outputs.success,
        name="eventually_succeeded",
        success_message="Agent completed task despite initial failure",
        failure_message="Agent failed to complete task",
    )
)

# An LLM judge reviews how the recovery was carried out.
tc = tc.check(
    LLMJudge(
        name="error_recovery_appropriate",
        prompt="""
        Evaluate if the agent's error recovery was appropriate.

        Task: {{ inputs }}
        Steps taken:
        {% for step in outputs.steps %}
        {{ loop.index }}. {{ step.thought }} ({{ step.tool }})
        Result: {{ step.observation }}
        {% endfor %}

        Return 'passed: true' if the agent handled the error well.
        """,
    )
)
# Test 5: Stateful Agent Interactions
Test agents that maintain state across turns:
class StatefulAgent(SimpleAgent):
    """Agent that remembers earlier tasks and can recall the most recent one."""

    def __init__(self):
        super().__init__()
        self.memory = {}
        # One {"task", "response"} dict per handled task, oldest first.
        self.conversation_history = []

    def run(self, task: str) -> AgentResponse:
        """Answer *task*, recalling the previous task when asked about it.

        A task containing "last" or "previous" is answered from the history
        when one exists; anything else goes to ``SimpleAgent.run``. Every
        handled task is appended to ``conversation_history``.
        """
        lowered = task.lower()
        asks_for_recall = "last" in lowered or "previous" in lowered

        if asks_for_recall and self.conversation_history:
            # Check memory for context
            prev_task = self.conversation_history[-1]["task"]
            observation = f"Previous task was: {prev_task}"
            response = AgentResponse(
                steps=[
                    AgentStep(
                        thought=f"Recalling previous task: {prev_task}",
                        tool="memory",
                        tool_input="recall",
                        observation=observation,
                    )
                ],
                final_answer=f"I remember: {observation}",
                success=True,
            )
        else:
            # Handle new task
            response = super().run(task)

        self.conversation_history.append({
            "task": task,
            "response": response.final_answer,
        })
        return response
# Agent instance shared by both turns of the stateful scenario.
stateful_agent = StatefulAgent()
# Scenario: the second turn asks about the first, so the agent must keep state.
test_scenario = scenario("stateful_agent_memory")

# First interaction
test_scenario = test_scenario.interact(
    inputs="Search for Python tutorials",
    outputs=lambda inputs: stateful_agent.run(inputs),
)
test_scenario = test_scenario.check(
    from_fn(
        lambda trace: trace.interactions[-1].outputs.success,
        name="first_task_completed",
    )
)

# Second interaction references first
test_scenario = test_scenario.interact(
    inputs="What was my last request?",
    outputs=lambda inputs: stateful_agent.run(inputs),
)
test_scenario = test_scenario.check(
    from_fn(
        lambda trace: "Python tutorials" in trace.last.outputs.final_answer,
        name="recalls_previous_task",
        success_message="Agent correctly recalled previous task",
        failure_message="Agent failed to recall previous task",
    )
)
test_scenario = test_scenario.check(
    LLMJudge(
        name="context_maintained",
        prompt="""
        Evaluate if the agent maintained context correctly.

        First task: {{ interactions[0].inputs }}
        Second task: {{ interactions[1].inputs }}
        Second response: {{ interactions[1].outputs.final_answer }}

        The second response should reference the first task.
        Return 'passed: true' if context was maintained.
        """,
    )
)
# Test 6: Task Completion Validation
Verify that complex tasks are fully completed:
from giskard.checks import scenario, LLMJudge, from_fn
class TaskTrackingAgent(SimpleAgent):
    """Agent that keeps a FIFO list of pending tasks and a list of completed ones."""

    def __init__(self):
        super().__init__()
        self.pending_tasks = []
        self.completed_tasks = []

    def run(self, task: str) -> AgentResponse:
        """Handle "add task", "complete" and "status" commands.

        Commands are matched case-insensitively, in that order; any other
        task is delegated to ``SimpleAgent.run``.

        Returns:
            An ``AgentResponse``; ``success`` is False only when "complete"
            is requested while nothing is pending.
        """
        import re

        lowered = task.lower()

        if "add task" in lowered:
            # Remove the command prefix case-insensitively, together with its
            # optional colon. A case-sensitive str.replace left ": ..." (or the
            # whole command, for "Add task") in the stored description.
            task_desc = re.sub(
                r"add task\s*:?", "", task, flags=re.IGNORECASE
            ).strip()
            self.pending_tasks.append(task_desc)
            return AgentResponse(
                steps=[],
                final_answer=f"Added task: {task_desc}. Pending: {len(self.pending_tasks)}",
                success=True,
            )

        elif "complete" in lowered:
            if self.pending_tasks:
                # Oldest pending task is completed first.
                completed = self.pending_tasks.pop(0)
                self.completed_tasks.append(completed)

                return AgentResponse(
                    steps=[AgentStep(
                        thought=f"Completing task: {completed}",
                        tool="task_manager",
                        tool_input=completed,
                        observation="Task completed successfully",
                    )],
                    final_answer=f"Completed: {completed}",
                    success=True,
                )
            return AgentResponse(
                steps=[],
                final_answer="No pending tasks to complete",
                success=False,
            )

        elif "status" in lowered:
            return AgentResponse(
                steps=[],
                final_answer=f"Pending: {len(self.pending_tasks)}, Completed: {len(self.completed_tasks)}",
                success=True,
            )

        return super().run(task)
# Agent instance shared by every turn of the task-completion scenario; the
# checks there read its pending/completed lists directly.
task_agent = TaskTrackingAgent()
# Scenario: add two tasks, complete one, then verify the reported status.
test_scenario = scenario("task_completion_workflow")

# Add tasks
test_scenario = test_scenario.interact(
    inputs="add task: Write documentation",
    outputs=lambda inputs: task_agent.run(inputs),
)
test_scenario = test_scenario.interact(
    inputs="add task: Review code",
    outputs=lambda inputs: task_agent.run(inputs),
)
test_scenario = test_scenario.check(
    from_fn(
        lambda trace: len(task_agent.pending_tasks) == 2,
        name="tasks_added",
    )
)

# Complete first task
test_scenario = test_scenario.interact(
    inputs="complete next task",
    outputs=lambda inputs: task_agent.run(inputs),
)
test_scenario = test_scenario.check(
    from_fn(
        lambda trace: len(task_agent.completed_tasks) == 1,
        name="task_completed",
    )
)

# Check status
test_scenario = test_scenario.interact(
    inputs="what's the status?",
    outputs=lambda inputs: task_agent.run(inputs),
)
test_scenario = test_scenario.check(
    from_fn(
        lambda trace: (
            "Pending: 1" in trace.last.outputs.final_answer
            and "Completed: 1" in trace.last.outputs.final_answer
        ),
        name="status_accurate",
        success_message="Agent tracking state correctly",
        failure_message="Agent state tracking is incorrect",
    )
)
# Complete Agent Test Suite
Combine all tests into a comprehensive suite:
# Imports for the combined test-suite example.
import asyncio
from typing import List
from giskard.checks import scenario
class AgentTestSuite:
    """Collect test cases and scenarios, run them all and print a summary.

    Registered items must expose a ``name`` attribute and an awaitable
    ``run()`` whose result has a ``passed`` attribute.
    """

    def __init__(self, agent):
        self.agent = agent
        self.test_cases = []
        self.scenarios = []

    def add_test(self, test_case):
        """Register a single test case."""
        self.test_cases.append(test_case)

    def add_scenario(self, test_scenario):
        """Register a scenario."""
        self.scenarios.append(test_scenario)

    async def run_all(self):
        """Run all tests and scenarios.

        Returns:
            A list of ``(kind, name, result)`` tuples, test cases first, where
            ``kind`` is ``"test"`` or ``"scenario"``.
        """
        results = []

        print("Running test cases...")
        for tc in self.test_cases:
            result = await tc.run()
            results.append(("test", tc.name, result))

        print("Running scenarios...")
        for test_scenario in self.scenarios:
            result = await test_scenario.run()
            results.append(("scenario", test_scenario.name, result))

        # Report
        self._report_results(results)

        return results

    def _report_results(self, results):
        """Print the pass rate followed by one line per result.

        For a failed result, the failing checks are listed when the result
        has a ``results`` attribute; otherwise its ``message`` is printed when
        present.
        """
        total = len(results)
        passed = sum(1 for _, _, r in results if r.passed)
        # An empty suite must report 0.0% instead of raising ZeroDivisionError.
        pass_rate = passed / total * 100 if total else 0.0

        print(f"\n{'='*60}")
        print(f"Agent Test Suite Results: {passed}/{total} passed ({pass_rate:.1f}%)")
        print(f"{'='*60}\n")

        for test_type, name, result in results:
            status = "✓" if result.passed else "✗"
            print(f" {status} [{test_type}] {name}")

            if result.passed:
                continue
            if hasattr(result, 'results'):
                for check_result in result.results:
                    if not check_result.passed:
                        print(f" ↳ {check_result.name}: {check_result.message}")
            elif hasattr(result, 'message'):
                print(f" ↳ {result.message}")
# Usage
async def main():
    """Build a suite around a ``SimpleAgent`` and run everything registered on it."""
    agent = SimpleAgent()
    suite = AgentTestSuite(agent)

    # Add tests (from examples above)
    # suite.add_test(...)
    # suite.add_scenario(...)

    await suite.run_all()


# Runs at module level: executing (or importing) this snippet starts the suite.
asyncio.run(main())
# Best Practices
1. Test Tool Selection Logic Independently
Before testing full workflows, validate tool selection:
def test_tool_selection_logic():
    """Check that each sample task is routed to the expected first tool."""
    test_cases = [
        ("Calculate 5 + 3", "calculator"),
        ("Search for recipes", "search"),
        ("Query user database", "database"),
    ]

    for task, expected_tool in test_cases:
        response = agent.run(task)
        # Fail with a readable message instead of an IndexError when the
        # agent answered without using any tool.
        assert response.steps, f"No tool used for task: {task!r}"
        assert response.steps[0].tool == expected_tool
# 2. Validate Reasoning at Each Step
Use LLM judges to evaluate reasoning quality:
# Judge whose prompt asks whether the first step's thought is logical.
LLMJudge(
    name="step_reasoning",
    prompt="Is this reasoning step logical? {{ outputs.steps[0].thought }}",
)
# 3. Test Error Paths
Ensure agents handle failures gracefully:
# Test with invalid tool inputs
# Test with unavailable tools
# Test with contradictory instructions
4. Monitor Resource Usage
Track token usage, API calls, and execution time. For example, the check below bounds the number of steps the agent may take:
checks = [
    # Passes when the last interaction took at most 5 agent steps.
    from_fn(
        lambda trace: len(trace.last.outputs.steps) <= 5,
        name="reasonable_step_count",
        success_message="Used reasonable number of steps",
    ),
]
# Next Steps
- See Chatbot Testing for conversational agent patterns
- Explore RAG Evaluation for knowledge-grounded agents
- Review Multi-Turn Scenarios for complex workflows