Chatbot Testing
This tutorial covers testing conversational AI systems, including context handling, tone consistency, and multi-turn dialogue flows.
Introduction
Section titled “Introduction”

We’ll test a chatbot that:
- Maintains conversation context
- Handles different conversation types (casual, support, sales)
- Manages user preferences and information
- Provides appropriate responses based on context
Our tests will validate:
- Context retention across turns
- Response quality and tone
- Handling of conversation flow
- Edge cases and error scenarios
Building a Chatbot
Section titled “Building a Chatbot”

First, let’s create a simple chatbot:
import re
from typing import Literal, Optional

from pydantic import BaseModel
class Message(BaseModel):
    """A single chat turn stored in the conversation history."""

    # Role follows the common chat-completion convention.
    role: Literal["user", "assistant", "system"]
    content: str
class ConversationContext(BaseModel):
    """Mutable per-conversation state extracted from user messages."""

    user_name: Optional[str] = None
    user_email: Optional[str] = None
    conversation_type: str = "casual"  # set to "support" or "sales" by keyword detection
    topic: Optional[str] = None
class ChatResponse(BaseModel):
    """Bot reply plus a snapshot of context and suggested follow-up actions."""

    message: str
    context: ConversationContext
    suggested_actions: list[str] = []
class SimpleChatbot: def __init__(self, personality: str = "friendly"): self.personality = personality self.history: list[Message] = [] self.context = ConversationContext()
def chat(self, user_message: str) -> ChatResponse: """Process user message and generate response.""" self.history.append(Message(role="user", content=user_message))
# Update context based on message self._update_context(user_message)
# Generate response response_text = self._generate_response(user_message)
self.history.append(Message(role="assistant", content=response_text))
return ChatResponse( message=response_text, context=self.context.model_copy(), suggested_actions=self._suggest_actions() )
def _update_context(self, message: str): """Extract and update context information.""" message_lower = message.lower()
# Extract name if "my name is" in message_lower or "i'm" in message_lower: words = message.split() for i, word in enumerate(words): if word.lower() in ["is", "i'm", "im"] and i + 1 < len(words): self.context.user_name = words[i + 1].strip(",.!?") break
# Extract email if "@" in message: import re emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', message) if emails: self.context.user_email = emails[0]
# Detect conversation type if any(word in message_lower for word in ["help", "support", "problem", "issue"]): self.context.conversation_type = "support" elif any(word in message_lower for word in ["buy", "purchase", "price", "cost"]): self.context.conversation_type = "sales"
def _generate_response(self, message: str) -> str: """Generate appropriate response based on context.""" # Greeting if any(greeting in message.lower() for greeting in ["hello", "hi", "hey"]): if self.context.user_name: return f"Hello {self.context.user_name}! How can I help you today?" return "Hello! How can I help you today?"
# Name recall if "what is my name" in message.lower() or "do you know my name" in message.lower(): if self.context.user_name: return f"Yes, your name is {self.context.user_name}." return "I don't believe you've told me your name yet."
# Support queries if self.context.conversation_type == "support": return "I understand you need help. Let me connect you with our support team. Could you describe your issue in detail?"
# Sales queries if self.context.conversation_type == "sales": return "I'd be happy to help you find the right product. What are you looking for?"
# Default return "I understand. Could you tell me more about that?"
def _suggest_actions(self) -> list[str]: """Suggest next actions based on context.""" actions = []
if not self.context.user_name: actions.append("introduce_yourself")
if self.context.conversation_type == "support" and not self.context.user_email: actions.append("provide_email")
return actionsTest 1: Basic Conversation Flow
Section titled “Test 1: Basic Conversation Flow”

Test a simple greeting and name exchange:
from giskard.checks import scenario, from_fn, StringMatching
bot = SimpleChatbot()

# Three-turn scenario: greet, introduce a name, then ask the bot to recall it.
test_scenario = (
    scenario("greeting_and_introduction")
    # User greets
    .interact(
        inputs="Hello",
        outputs=lambda inputs: bot.chat(inputs)
    )
    .check(
        StringMatching(
            name="polite_greeting",
            content="help",
            key="trace.last.outputs.message"
        )
    )
    # User introduces themselves
    .interact(
        inputs="My name is Alice",
        outputs=lambda inputs: bot.chat(inputs)
    )
    .check(
        StringMatching(
            name="acknowledges_name",
            content="Alice",
            key="trace.last.outputs.message"
        )
    )
    .check(
        from_fn(
            lambda trace: trace.last.outputs.context.user_name == "Alice",
            name="stored_name",
            success_message="Chatbot stored the user's name",
            failure_message="Chatbot failed to store name"
        )
    )
    # Verify name recall
    .interact(
        inputs="What is my name?",
        outputs=lambda inputs: bot.chat(inputs)
    )
    .check(
        StringMatching(
            name="recalls_name",
            content="Alice",
            key="trace.last.outputs.message"
        )
    )
)

import asyncio

async def test_basic_conversation():
    # Scenario execution is async; all checks must pass for result.passed.
    result = await test_scenario.run()
    assert result.passed
    print("✓ Basic conversation flow test passed")

asyncio.run(test_basic_conversation())

Test 2: Context Switching
Section titled “Test 2: Context Switching”

Verify the chatbot handles different conversation types:
from giskard.agents.generators import Generator
from giskard.checks import (
    scenario,
    LLMJudge,
    Equality,
    set_default_generator,
)

# LLMJudge checks need a generator; set one once for the whole module.
set_default_generator(Generator(model="openai/gpt-5-mini"))

bot = SimpleChatbot()

# Walk the bot through casual -> support -> sales and assert the detected
# conversation_type after each turn.
test_scenario = (
    scenario("context_switching")
    # Start with casual conversation
    .interact(
        inputs="Hi there!",
        outputs=lambda inputs: bot.chat(inputs)
    )
    .check(
        Equality(
            name="casual_context",
            expected="casual",
            key="trace.last.outputs.context.conversation_type"
        )
    )
    # Switch to support
    .interact(
        inputs="I'm having a problem with my account",
        outputs=lambda inputs: bot.chat(inputs)
    )
    .check(
        Equality(
            name="support_context",
            expected="support",
            key="trace.last.outputs.context.conversation_type"
        )
    )
    .check(
        LLMJudge(
            name="support_tone",
            prompt="""
            Evaluate if the response is appropriate for a support inquiry.

            User: {{ interactions[1].inputs }}
            Assistant: {{ interactions[1].outputs.message }}

            The response should be helpful and professional.
            Return 'passed: true' if appropriate.
            """
        )
    )
    # Switch to sales
    .interact(
        inputs="How much does it cost?",
        outputs=lambda inputs: bot.chat(inputs)
    )
    .check(
        Equality(
            name="sales_context",
            expected="sales",
            key="trace.last.outputs.context.conversation_type"
        )
    )
)

Test 3: Response Quality and Tone
Section titled “Test 3: Response Quality and Tone”

Evaluate response quality using LLM-as-a-judge:
from giskard.checks import scenario, LLMJudge

bot = SimpleChatbot(personality="professional")

# Judge a single reply twice: once for tone, once for completeness.
tc = (
    scenario("response_quality_test")
    .interact(
        inputs="I need help understanding your pricing",
        outputs=lambda inputs: bot.chat(inputs)
    )
    .check(
        LLMJudge(
            name="tone_check",
            prompt="""
            Evaluate the tone of this chatbot response.

            User message: {{ inputs }}
            Bot response: {{ outputs.message }}
            Expected personality: professional

            Check:
            1. Is the tone professional?
            2. Is it helpful and clear?
            3. Does it address the user's question?

            Return 'passed: true' if tone is appropriate.
            """
        )
    )
    .check(
        LLMJudge(
            name="completeness",
            prompt="""
            Evaluate if the response is complete.

            User: {{ inputs }}
            Bot: {{ outputs.message }}

            Does the response:
            1. Acknowledge the user's question?
            2. Provide next steps or information?
            3. Offer to help further?

            Return 'passed: true' if response is complete.
            """
        )
    )
)

Test 4: Information Extraction and Storage
Section titled “Test 4: Information Extraction and Storage”

Test the chatbot’s ability to extract and remember user information:
from giskard.checks import scenario, from_fn, Equality

bot = SimpleChatbot()

# Collect a name, then an email, then verify both survive a third turn.
test_scenario = (
    scenario("information_collection")
    # Collect name
    .interact(
        inputs="Hi, I'm Bob Johnson",
        outputs=lambda inputs: bot.chat(inputs)
    )
    .check(
        Equality(
            name="extracted_name",
            expected="Bob",
            key="trace.last.outputs.context.user_name"
        )
    )
    # Collect email
    .interact(
        inputs="My email is bob.johnson@example.com",
        outputs=lambda inputs: bot.chat(inputs)
    )
    .check(
        Equality(
            name="extracted_email",
            expected="bob.johnson@example.com",
            key="trace.last.outputs.context.user_email"
        )
    )
    # Verify information persists
    .interact(
        inputs="Can you remind me what information you have about me?",
        outputs=lambda inputs: bot.chat(inputs)
    )
    .check(
        from_fn(
            lambda trace: (
                trace.last.outputs.context.user_name == "Bob"
                and trace.last.outputs.context.user_email == "bob.johnson@example.com"
            ),
            name="information_persisted",
            success_message="Chatbot retained user information",
            failure_message="Chatbot lost user information"
        )
    )
)

Test 5: Edge Cases and Error Handling
Section titled “Test 5: Edge Cases and Error Handling”

Test how the chatbot handles unusual inputs:
from giskard.checks import scenario, from_fn, LLMJudge

bot = SimpleChatbot()

# Test empty input: the lambda substitutes a fallback reply for "".
tc_empty = (
    scenario("empty_input_handling")
    .interact(
        inputs="",
        outputs=lambda inputs: bot.chat(inputs) if inputs else ChatResponse(
            message="I didn't receive a message. Could you try again?",
            context=bot.context
        )
    )
    .check(
        from_fn(
            lambda trace: len(trace.last.outputs.message) > 0,
            name="provides_response",
            success_message="Bot provided a response to empty input"
        )
    )
)

# Test very long input (~6000 characters of repeated greeting).
tc_long = (
    scenario("long_input_handling")
    .interact(
        inputs="Hello " * 1000,
        outputs=lambda inputs: bot.chat(inputs)
    )
    .check(
        from_fn(
            lambda trace: len(trace.last.outputs.message) > 0,
            name="handles_long_input",
            success_message="Bot handled long input"
        )
    )
)

# Test gibberish input, judged by an LLM for graceful handling.
tc_gibberish = (
    scenario("gibberish_handling")
    .interact(
        inputs="asdfghjkl qwertyuiop zxcvbnm",
        outputs=lambda inputs: bot.chat(inputs)
    )
    .check(
        LLMJudge(
            name="graceful_response",
            prompt="""
            Evaluate if the bot handles gibberish gracefully.

            User input: {{ inputs }}
            Bot response: {{ outputs.message }}

            The bot should:
            1. Not error out
            2. Provide a polite response
            3. Maybe ask for clarification

            Return 'passed: true' if handled well.
            """
        )
    )
)

Test 6: Conversation State Management
Section titled “Test 6: Conversation State Management”

Test complex stateful interactions:
from giskard.checks import scenario, from_fn, LLMJudge, StringMatching
class StatefulChatbot(SimpleChatbot):
    """Chatbot that adds a yes/no confirmation step for destructive actions."""

    def __init__(self):
        super().__init__()
        self.awaiting_confirmation = False  # True while a yes/no answer is pending
        self.pending_action = None          # description of the queued action

    def chat(self, user_message: str) -> ChatResponse:
        """Handle confirmation flow, falling back to the parent for normal chat."""
        # FIX: the original compared the whole lowercased message for equality,
        # so "No, nevermind" never matched. Strip punctuation per word and
        # match on tokens instead.
        words = {w.strip(",.!?") for w in user_message.lower().split()}

        # Handle confirmations
        if self.awaiting_confirmation:
            if words & {"yes", "confirm", "ok", "sure"}:
                response_text = f"Great! I'll proceed with {self.pending_action}."
                self.awaiting_confirmation = False
                self.pending_action = None
            elif words & {"no", "cancel", "nevermind"}:
                response_text = "Okay, I won't do that. What else can I help with?"
                self.awaiting_confirmation = False
                self.pending_action = None
            else:
                response_text = "I'm waiting for your confirmation. Please say yes or no."

            # FIX: record the user turn too (the original only stored the
            # assistant turn on this path, unlike SimpleChatbot.chat).
            self.history.append(Message(role="user", content=user_message))
            self.history.append(Message(role="assistant", content=response_text))
            # FIX: snapshot the context as the parent class does.
            return ChatResponse(message=response_text, context=self.context.model_copy())

        # Check for actions requiring confirmation
        if "delete" in user_message.lower() or "cancel" in user_message.lower():
            self.awaiting_confirmation = True
            self.pending_action = "deletion"
            response_text = "Are you sure you want to proceed? Please confirm."

            self.history.append(Message(role="user", content=user_message))
            self.history.append(Message(role="assistant", content=response_text))
            return ChatResponse(message=response_text, context=self.context.model_copy())

        return super().chat(user_message)
stateful_bot = StatefulChatbot()

# Request a destructive action, then cancel it, checking the bot's internal
# confirmation flag at each step.
test_scenario = (
    scenario("confirmation_flow")
    # Request action requiring confirmation
    .interact(
        inputs="I want to delete my account",
        outputs=lambda inputs: stateful_bot.chat(inputs)
    )
    .check(
        from_fn(
            lambda trace: stateful_bot.awaiting_confirmation,
            name="requested_confirmation",
            success_message="Bot requested confirmation"
        )
    )
    .check(
        StringMatching(
            name="asks_confirmation",
            content="confirm",
            key="trace.last.outputs.message"
        )
    )
    # User cancels
    .interact(
        inputs="No, nevermind",
        outputs=lambda inputs: stateful_bot.chat(inputs)
    )
    .check(
        from_fn(
            lambda trace: not stateful_bot.awaiting_confirmation,
            name="cleared_confirmation_state",
            success_message="Bot cleared confirmation state"
        )
    )
    .check(
        LLMJudge(
            name="acknowledged_cancellation",
            prompt="""
            Check if the bot acknowledged the cancellation appropriately.

            User: {{ interactions[1].inputs }}
            Bot: {{ interactions[1].outputs.message }}

            Return 'passed: true' if the bot handled cancellation well.
            """
        )
    )
)

Complete Chatbot Test Suite
Section titled “Complete Chatbot Test Suite”

Combine all tests into a comprehensive suite:
import asyncio
from giskard.checks import scenario
class ChatbotTestSuite:
    """Collects scenarios and standalone test cases and runs them as one suite."""

    def __init__(self, chatbot):
        self.chatbot = chatbot  # system under test, kept for callers to reference
        self.scenarios = []
        self.test_cases = []

    def add_scenario(self, test_scenario):
        """Register a multi-turn scenario to run."""
        self.scenarios.append(test_scenario)

    def add_test(self, test_case):
        """Register a standalone test case to run."""
        self.test_cases.append(test_case)

    async def run_all(self):
        """Run all tests and report results.

        Returns:
            list of (kind, name, result) tuples, where kind is "scenario"
            or "test" and result exposes at least a ``passed`` attribute.
        """
        print("🤖 Running Chatbot Test Suite\n")

        results = []

        # Run scenarios
        for test_scenario in self.scenarios:
            print(f"  Running scenario: {test_scenario.name}")
            result = await test_scenario.run()
            results.append(("scenario", test_scenario.name, result))

        # Run test cases
        for tc in self.test_cases:
            print(f"  Running test: {tc.name}")
            result = await tc.run()
            results.append(("test", tc.name, result))

        # Report
        self._print_report(results)

        return results

    def _print_report(self, results):
        """Print a pass/fail summary; safe to call with an empty suite."""
        total = len(results)
        passed = sum(1 for _, _, r in results if r.passed)
        # FIX: the original divided by total unconditionally, raising
        # ZeroDivisionError when no tests were registered.
        pct = (passed / total * 100) if total else 0.0

        print(f"\n{'='*70}")
        print(f"Results: {passed}/{total} passed ({pct:.1f}%)")
        print(f"{'='*70}\n")

        for test_type, name, result in results:
            status = "✓" if result.passed else "✗"
            print(f"{status} [{test_type}] {name}")

            # Show per-check failures when the result carries them.
            if not result.passed:
                if hasattr(result, 'results'):
                    for check_result in result.results:
                        if not check_result.passed:
                            print(f"    → {check_result.name}: {check_result.message}")
# Usage
async def main():
    # Build the bot and wrap it in a suite; register tests before running.
    bot = SimpleChatbot()
    suite = ChatbotTestSuite(bot)

    # Add all scenarios and tests
    # suite.add_scenario(...)
    # suite.add_test(...)

    await suite.run_all()

asyncio.run(main())

Best Practices
Section titled “Best Practices”

1. Test Conversation Flows Holistically
Don’t just test individual responses—test complete conversation flows:
# Sketch of a full flow; fill in the interactions for your own bot.
test_scenario = (
    scenario("complete_support_flow")
    # Greeting -> Problem statement -> Information collection -> Resolution
    ...
)

2. Validate Context Retention
Ensure the chatbot remembers important information:
# Custom check: both name and email must still be present on the last turn.
from_fn(
    lambda trace: (
        trace.last.outputs.context.user_name
        and trace.last.outputs.context.user_email
    ),
    name="retains_user_info"
)

3. Test Tone Consistency
Use LLM judges to verify tone remains consistent:
# Judge prompt iterates over every interaction in the trace.
LLMJudge(
    name="consistent_tone",
    prompt="""
    Evaluate tone consistency across responses.

    {% for interaction in interactions %}
    Response {{ loop.index }}: {{ interaction.outputs.message }}
    {% endfor %}

    Return 'passed: true' if tone is consistent.
    """
)

4. Handle Edge Cases
Test with unusual inputs:
- Empty messages
- Very long messages
- Special characters
- Rapid topic changes
- Contradictory statements
Next Steps
Section titled “Next Steps”

- See Content Moderation for safety and filtering
- Explore Testing Agents for tool-using chatbots
- Review Multi-Turn Scenarios for complex flows