
Chatbot Testing

This tutorial covers testing conversational AI systems, including context handling, tone consistency, and multi-turn dialogue flows.

We’ll test a chatbot that:

  • Maintains conversation context
  • Handles different conversation types (casual, support, sales)
  • Manages user preferences and information
  • Provides appropriate responses based on context

Our tests will validate:

  • Context retention across turns
  • Response quality and tone
  • Handling of conversation flow
  • Edge cases and error scenarios

First, let’s create a simple chatbot:

from typing import Optional, Literal

from pydantic import BaseModel


class Message(BaseModel):
    role: Literal["user", "assistant", "system"]
    content: str


class ConversationContext(BaseModel):
    user_name: Optional[str] = None
    user_email: Optional[str] = None
    conversation_type: str = "casual"
    topic: Optional[str] = None


class ChatResponse(BaseModel):
    message: str
    context: ConversationContext
    suggested_actions: list[str] = []


class SimpleChatbot:
    def __init__(self, personality: str = "friendly"):
        self.personality = personality
        self.history: list[Message] = []
        self.context = ConversationContext()

    def chat(self, user_message: str) -> ChatResponse:
        """Process user message and generate response."""
        self.history.append(Message(role="user", content=user_message))

        # Update context based on message
        self._update_context(user_message)

        # Generate response
        response_text = self._generate_response(user_message)
        self.history.append(Message(role="assistant", content=response_text))

        return ChatResponse(
            message=response_text,
            context=self.context.model_copy(),
            suggested_actions=self._suggest_actions()
        )

    def _update_context(self, message: str):
        """Extract and update context information."""
        message_lower = message.lower()

        # Extract name
        if "my name is" in message_lower or "i'm" in message_lower:
            words = message.split()
            for i, word in enumerate(words):
                if word.lower() in ["is", "i'm", "im"] and i + 1 < len(words):
                    self.context.user_name = words[i + 1].strip(",.!?")
                    break

        # Extract email
        if "@" in message:
            import re
            emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', message)
            if emails:
                self.context.user_email = emails[0]

        # Detect conversation type
        if any(word in message_lower for word in ["help", "support", "problem", "issue"]):
            self.context.conversation_type = "support"
        elif any(word in message_lower for word in ["buy", "purchase", "price", "cost"]):
            self.context.conversation_type = "sales"

    def _generate_response(self, message: str) -> str:
        """Generate an appropriate response based on context."""
        # Greeting
        if any(greeting in message.lower() for greeting in ["hello", "hi", "hey"]):
            if self.context.user_name:
                return f"Hello {self.context.user_name}! How can I help you today?"
            return "Hello! How can I help you today?"

        # Acknowledge introductions (so the name appears in the reply)
        if ("my name is" in message.lower() or "i'm" in message.lower()) and self.context.user_name:
            return f"Nice to meet you, {self.context.user_name}! How can I help you today?"

        # Name recall
        if "what is my name" in message.lower() or "do you know my name" in message.lower():
            if self.context.user_name:
                return f"Yes, your name is {self.context.user_name}."
            return "I don't believe you've told me your name yet."

        # Support queries
        if self.context.conversation_type == "support":
            return "I understand you need help. Let me connect you with our support team. Could you describe your issue in detail?"

        # Sales queries
        if self.context.conversation_type == "sales":
            return "I'd be happy to help you find the right product. What are you looking for?"

        # Default
        return "I understand. Could you tell me more about that?"

    def _suggest_actions(self) -> list[str]:
        """Suggest next actions based on context."""
        actions = []
        if not self.context.user_name:
            actions.append("introduce_yourself")
        if self.context.conversation_type == "support" and not self.context.user_email:
            actions.append("provide_email")
        return actions
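
Before writing any checks, it helps to smoke-test the bot by hand. A minimal sketch using only the classes above; the expected values in the comments follow from the extraction rules in _update_context:

bot = SimpleChatbot()

# Introduce a name and confirm context extraction picked it up
reply = bot.chat("Hi, my name is Alice")
print(reply.message)
print(reply.context.user_name)      # expected: "Alice"
print(reply.suggested_actions)      # name is known, so no "introduce_yourself"

# Switch to a support-style message and check the detected conversation type
reply = bot.chat("I have a problem with my order")
print(reply.context.conversation_type)  # expected: "support"
print(reply.suggested_actions)          # expected: ["provide_email"]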

Test a simple greeting and name exchange:

from giskard.checks import scenario, from_fn, StringMatching

bot = SimpleChatbot()

test_scenario = (
    scenario("greeting_and_introduction")
    # User greets
    .interact(
        inputs="Hello",
        outputs=lambda inputs: bot.chat(inputs)
    )
    .check(
        StringMatching(
            name="polite_greeting",
            content="help",
            key="trace.last.outputs.message"
        )
    )
    # User introduces themselves
    .interact(
        inputs="My name is Alice",
        outputs=lambda inputs: bot.chat(inputs)
    )
    .check(
        StringMatching(
            name="acknowledges_name",
            content="Alice",
            key="trace.last.outputs.message"
        )
    )
    .check(
        from_fn(
            lambda trace: trace.last.outputs.context.user_name == "Alice",
            name="stored_name",
            success_message="Chatbot stored the user's name",
            failure_message="Chatbot failed to store name"
        )
    )
    # Verify name recall
    .interact(
        inputs="What is my name?",
        outputs=lambda inputs: bot.chat(inputs)
    )
    .check(
        StringMatching(
            name="recalls_name",
            content="Alice",
            key="trace.last.outputs.message"
        )
    )
)

import asyncio

async def test_basic_conversation():
    result = await test_scenario.run()
    assert result.passed
    print("✓ Basic conversation flow test passed")

asyncio.run(test_basic_conversation())

Verify the chatbot handles different conversation types:

from giskard.agents.generators import Generator
from giskard.checks import (
    scenario,
    LLMJudge,
    Equality,
    set_default_generator
)

set_default_generator(Generator(model="openai/gpt-5-mini"))

bot = SimpleChatbot()

test_scenario = (
    scenario("context_switching")
    # Start with casual conversation
    .interact(
        inputs="Hi there!",
        outputs=lambda inputs: bot.chat(inputs)
    )
    .check(
        Equality(
            name="casual_context",
            expected="casual",
            key="trace.last.outputs.context.conversation_type"
        )
    )
    # Switch to support
    .interact(
        inputs="I'm having a problem with my account",
        outputs=lambda inputs: bot.chat(inputs)
    )
    .check(
        Equality(
            name="support_context",
            expected="support",
            key="trace.last.outputs.context.conversation_type"
        )
    )
    .check(
        LLMJudge(
            name="support_tone",
            prompt="""
            Evaluate if the response is appropriate for a support inquiry.
            User: {{ interactions[1].inputs }}
            Assistant: {{ interactions[1].outputs.message }}
            The response should be helpful and professional.
            Return 'passed: true' if appropriate.
            """
        )
    )
    # Switch to sales
    .interact(
        inputs="How much does it cost?",
        outputs=lambda inputs: bot.chat(inputs)
    )
    .check(
        Equality(
            name="sales_context",
            expected="sales",
            key="trace.last.outputs.context.conversation_type"
        )
    )
)
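
The scenario above only describes the conversation; as before, nothing happens until it is run. A minimal run sketch, reusing the asyncio pattern from the first test. Note that the LLMJudge check calls the generator configured above, so running it requires valid credentials for that model:

import asyncio

async def test_context_switching():
    result = await test_scenario.run()
    assert result.passed
    print("✓ Context switching test passed")

asyncio.run(test_context_switching())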

Evaluate response quality using LLM-as-a-judge:

from giskard.checks import scenario, LLMJudge

bot = SimpleChatbot(personality="professional")

tc = (
    scenario("response_quality_test")
    .interact(
        inputs="I need help understanding your pricing",
        outputs=lambda inputs: bot.chat(inputs)
    )
    .check(
        LLMJudge(
            name="tone_check",
            prompt="""
            Evaluate the tone of this chatbot response.
            User message: {{ inputs }}
            Bot response: {{ outputs.message }}
            Expected personality: professional
            Check:
            1. Is the tone professional?
            2. Is it helpful and clear?
            3. Does it address the user's question?
            Return 'passed: true' if tone is appropriate.
            """
        )
    )
    .check(
        LLMJudge(
            name="completeness",
            prompt="""
            Evaluate if the response is complete.
            User: {{ inputs }}
            Bot: {{ outputs.message }}
            Does the response:
            1. Acknowledge the user's question?
            2. Provide next steps or information?
            3. Offer to help further?
            Return 'passed: true' if response is complete.
            """
        )
    )
)
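
The test case tc still needs to be run. A sketch that also surfaces any failing judge checks, assuming the result object exposes per-check results the same way the test-suite report at the end of this tutorial assumes (a results list with name and message attributes):

import asyncio

async def test_response_quality():
    result = await tc.run()
    if not result.passed and hasattr(result, "results"):
        # Print which judge(s) failed and why (attribute names assumed,
        # mirroring the reporting helper at the end of this tutorial)
        for check_result in result.results:
            if not check_result.passed:
                print(f"✗ {check_result.name}: {check_result.message}")
    assert result.passed

asyncio.run(test_response_quality())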

Test 4: Information Extraction and Storage


Test the chatbot’s ability to extract and remember user information:

from giskard.checks import scenario, from_fn, Equality

bot = SimpleChatbot()

test_scenario = (
    scenario("information_collection")
    # Collect name
    .interact(
        inputs="Hi, I'm Bob Johnson",
        outputs=lambda inputs: bot.chat(inputs)
    )
    .check(
        Equality(
            name="extracted_name",
            expected="Bob",
            key="trace.last.outputs.context.user_name"
        )
    )
    # Collect email
    .interact(
        inputs="My email is bob.johnson@example.com",
        outputs=lambda inputs: bot.chat(inputs)
    )
    .check(
        Equality(
            name="extracted_email",
            expected="bob.johnson@example.com",
            key="trace.last.outputs.context.user_email"
        )
    )
    # Verify information persists
    .interact(
        inputs="Can you remind me what information you have about me?",
        outputs=lambda inputs: bot.chat(inputs)
    )
    .check(
        from_fn(
            lambda trace: (
                trace.last.outputs.context.user_name == "Bob" and
                trace.last.outputs.context.user_email == "bob.johnson@example.com"
            ),
            name="information_persisted",
            success_message="Chatbot retained user information",
            failure_message="Chatbot lost user information"
        )
    )
)
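
Because conversation state lives on the bot instance, the scenario above only runs cleanly once; a second run would start with Bob's name and email already in context. One way to keep runs independent is to build both the bot and the scenario in a factory. This is a sketch; build_information_scenario is an illustrative helper, not part of giskard:

import asyncio
from giskard.checks import scenario, Equality

def build_information_scenario():
    fresh_bot = SimpleChatbot()  # fresh bot per run, so no context leaks in
    return (
        scenario("information_collection")
        .interact(
            inputs="Hi, I'm Bob Johnson",
            outputs=lambda inputs: fresh_bot.chat(inputs)
        )
        .check(
            Equality(
                name="extracted_name",
                expected="Bob",
                key="trace.last.outputs.context.user_name"
            )
        )
        # ... remaining interactions and checks as in the scenario above
    )

async def test_information_collection():
    result = await build_information_scenario().run()
    assert result.passed

asyncio.run(test_information_collection())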

Test how the chatbot handles unusual inputs:

from giskard.checks import scenario, from_fn, LLMJudge

bot = SimpleChatbot()

# Test empty input
tc_empty = (
    scenario("empty_input_handling")
    .interact(
        inputs="",
        outputs=lambda inputs: bot.chat(inputs) if inputs else ChatResponse(
            message="I didn't receive a message. Could you try again?",
            context=bot.context
        )
    )
    .check(
        from_fn(
            lambda trace: len(trace.last.outputs.message) > 0,
            name="provides_response",
            success_message="Bot provided a response to empty input"
        )
    )
)

# Test very long input
tc_long = (
    scenario("long_input_handling")
    .interact(
        inputs="Hello " * 1000,
        outputs=lambda inputs: bot.chat(inputs)
    )
    .check(
        from_fn(
            lambda trace: len(trace.last.outputs.message) > 0,
            name="handles_long_input",
            success_message="Bot handled long input"
        )
    )
)

# Test gibberish
tc_gibberish = (
    scenario("gibberish_handling")
    .interact(
        inputs="asdfghjkl qwertyuiop zxcvbnm",
        outputs=lambda inputs: bot.chat(inputs)
    )
    .check(
        LLMJudge(
            name="graceful_response",
            prompt="""
            Evaluate if the bot handles gibberish gracefully.
            User input: {{ inputs }}
            Bot response: {{ outputs.message }}
            The bot should:
            1. Not error out
            2. Provide a polite response
            3. Maybe ask for clarification
            Return 'passed: true' if handled well.
            """
        )
    )
)
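
The three scenarios can be run back to back. A minimal sketch (run_edge_case_tests is an illustrative name; note that all three scenarios share the bot instance defined above):

import asyncio

async def run_edge_case_tests():
    for label, case in [("empty", tc_empty), ("long", tc_long), ("gibberish", tc_gibberish)]:
        result = await case.run()
        print(f"{'✓' if result.passed else '✗'} {label} input handled")

asyncio.run(run_edge_case_tests())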

Test complex stateful interactions:

from giskard.checks import scenario, from_fn, LLMJudge, StringMatching


class StatefulChatbot(SimpleChatbot):
    def __init__(self):
        super().__init__()
        self.awaiting_confirmation = False
        self.pending_action = None

    def chat(self, user_message: str) -> ChatResponse:
        # Handle confirmations
        if self.awaiting_confirmation:
            if user_message.lower() in ["yes", "confirm", "ok", "sure"]:
                response_text = f"Great! I'll proceed with {self.pending_action}."
                self.awaiting_confirmation = False
                self.pending_action = None
            # Substring match so replies like "No, nevermind" are caught
            elif any(word in user_message.lower() for word in ["no", "cancel", "nevermind"]):
                response_text = "Okay, I won't do that. What else can I help with?"
                self.awaiting_confirmation = False
                self.pending_action = None
            else:
                response_text = "I'm waiting for your confirmation. Please say yes or no."
            self.history.append(Message(role="assistant", content=response_text))
            return ChatResponse(message=response_text, context=self.context)

        # Check for actions requiring confirmation
        if "delete" in user_message.lower() or "cancel" in user_message.lower():
            self.awaiting_confirmation = True
            self.pending_action = "deletion"
            response_text = "Are you sure you want to proceed? Please confirm."
            self.history.append(Message(role="assistant", content=response_text))
            return ChatResponse(message=response_text, context=self.context)

        return super().chat(user_message)


stateful_bot = StatefulChatbot()

test_scenario = (
    scenario("confirmation_flow")
    # Request action requiring confirmation
    .interact(
        inputs="I want to delete my account",
        outputs=lambda inputs: stateful_bot.chat(inputs)
    )
    .check(
        from_fn(
            lambda trace: stateful_bot.awaiting_confirmation,
            name="requested_confirmation",
            success_message="Bot requested confirmation"
        )
    )
    .check(
        StringMatching(
            name="asks_confirmation",
            content="confirm",
            key="trace.last.outputs.message"
        )
    )
    # User cancels
    .interact(
        inputs="No, nevermind",
        outputs=lambda inputs: stateful_bot.chat(inputs)
    )
    .check(
        from_fn(
            lambda trace: not stateful_bot.awaiting_confirmation,
            name="cleared_confirmation_state",
            success_message="Bot cleared confirmation state"
        )
    )
    .check(
        LLMJudge(
            name="acknowledged_cancellation",
            prompt="""
            Check if the bot acknowledged the cancellation appropriately.
            User: {{ interactions[1].inputs }}
            Bot: {{ interactions[1].outputs.message }}
            Return 'passed: true' if the bot handled cancellation well.
            """
        )
    )
)
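
Running the flow follows the same pattern as before. Note that the from_fn checks close over stateful_bot, so the scenario must be executed against that exact instance, and the LLMJudge check needs the generator configured earlier. A minimal run sketch:

import asyncio

async def test_confirmation_flow():
    result = await test_scenario.run()
    assert result.passed
    print("✓ Confirmation flow test passed")

asyncio.run(test_confirmation_flow())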

Combine all tests into a comprehensive suite:

import asyncio

from giskard.checks import scenario


class ChatbotTestSuite:
    def __init__(self, chatbot):
        self.chatbot = chatbot
        self.scenarios = []
        self.test_cases = []

    def add_scenario(self, test_scenario):
        self.scenarios.append(test_scenario)

    def add_test(self, test_case):
        self.test_cases.append(test_case)

    async def run_all(self):
        """Run all tests and report results."""
        print("🤖 Running Chatbot Test Suite\n")
        results = []

        # Run scenarios
        for test_scenario in self.scenarios:
            print(f"  Running scenario: {test_scenario.name}")
            result = await test_scenario.run()
            results.append(("scenario", test_scenario.name, result))

        # Run test cases
        for tc in self.test_cases:
            print(f"  Running test: {tc.name}")
            result = await tc.run()
            results.append(("test", tc.name, result))

        # Report
        self._print_report(results)
        return results

    def _print_report(self, results):
        total = len(results)
        passed = sum(1 for _, _, r in results if r.passed)

        print(f"\n{'='*70}")
        print(f"Results: {passed}/{total} passed ({passed/total*100:.1f}%)")
        print(f"{'='*70}\n")

        for test_type, name, result in results:
            status = "✓" if result.passed else "✗"
            print(f"{status} [{test_type}] {name}")
            if not result.passed:
                if hasattr(result, 'results'):
                    for check_result in result.results:
                        if not check_result.passed:
                            print(f"    → {check_result.name}: {check_result.message}")


# Usage
async def main():
    bot = SimpleChatbot()
    suite = ChatbotTestSuite(bot)

    # Add all scenarios and tests
    # suite.add_scenario(...)
    # suite.add_test(...)

    await suite.run_all()

asyncio.run(main())
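
The add_scenario and add_test calls in main() are placeholders. One way to populate them, as a sketch, is to register the scenarios and test cases built earlier in this tutorial; keep in mind that the name test_scenario was reused above (it currently points at the confirmation flow) and that each scenario should be rebuilt with a fresh bot if you plan to run the suite more than once:

async def main():
    suite = ChatbotTestSuite(SimpleChatbot())

    # Illustrative registrations using objects defined earlier in this tutorial
    suite.add_scenario(test_scenario)   # e.g. the confirmation flow above
    suite.add_test(tc_empty)
    suite.add_test(tc_long)
    suite.add_test(tc_gibberish)

    await suite.run_all()

asyncio.run(main())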

1. Test Conversation Flows Holistically

Don’t just test individual responses—test complete conversation flows:

test_scenario = (
    scenario("complete_support_flow")
    # Greeting -> Problem statement -> Information collection -> Resolution
    ...
)

2. Validate Context Retention

Ensure the chatbot remembers important information:

from_fn(
    lambda trace: (
        trace.last.outputs.context.user_name and
        trace.last.outputs.context.user_email
    ),
    name="retains_user_info"
)

3. Test Tone Consistency

Use LLM judges to verify tone remains consistent:

LLMJudge(
    name="consistent_tone",
    prompt="""
    Evaluate tone consistency across responses.
    {% for interaction in interactions %}
    Response {{ loop.index }}: {{ interaction.outputs.message }}
    {% endfor %}
    Return 'passed: true' if tone is consistent.
    """
)

4. Handle Edge Cases

Test with unusual inputs, for example (a parameterised sketch follows this list):

  • Empty messages
  • Very long messages
  • Special characters
  • Rapid topic changes
  • Contradictory statements
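
A sketch of parameterising these cases over the SimpleChatbot defined earlier; the input list and the non-empty-response check are illustrative, not a prescribed giskard pattern:

import asyncio
from giskard.checks import scenario, from_fn

edge_case_inputs = [
    "",                                                                  # empty message
    "Hello " * 1000,                                                     # very long message
    "!@#$%^&*()_+ <script>alert(1)</script>",                            # special characters
    "Tell me about pricing. Actually, forget that, what's the weather?", # rapid topic change
    "My name is Alice. No wait, my name is Bob.",                        # contradictory statements
]

edge_case_scenarios = []
for i, text in enumerate(edge_case_inputs):
    case_bot = SimpleChatbot()  # fresh bot per case so state can't leak
    edge_case_scenarios.append(
        scenario(f"edge_case_{i}")
        .interact(inputs=text, outputs=lambda inputs, b=case_bot: b.chat(inputs))
        .check(
            from_fn(
                lambda trace: len(trace.last.outputs.message) > 0,
                name="returns_non_empty_response",
                success_message="Bot produced a response",
                failure_message="Bot returned an empty response"
            )
        )
    )

async def run_edge_cases():
    for sc in edge_case_scenarios:
        result = await sc.run()
        print(f"{'✓' if result.passed else '✗'} {sc.name}")

asyncio.run(run_edge_cases())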