RAG Evaluation
This tutorial shows how to build a comprehensive test suite for a Retrieval-Augmented Generation (RAG) system.
Introduction
We’ll test a RAG system that answers questions by:
- Retrieving relevant context from a knowledge base
- Generating an answer grounded in that context
- Handling out-of-scope questions appropriately
Our test suite will validate:
- Retrieval quality: Are the retrieved documents relevant?
- Groundedness: Is the answer based on the retrieved context?
- Answer quality: Is the answer accurate and complete?
- Edge-case handling: Out-of-scope questions, empty queries, etc.
Building the RAG System
First, let’s create a simple RAG system to test:
```python
from typing import List

from pydantic import BaseModel


class Document(BaseModel):
    content: str
    metadata: dict


class RAGResponse(BaseModel):
    question: str
    answer: str
    retrieved_docs: List[Document]
    confidence: float


class SimpleRAG:
    def __init__(self, documents: List[Document]):
        self.documents = documents

    def retrieve(self, query: str, top_k: int = 3) -> List[Document]:
        """Retrieve relevant documents (simplified similarity)."""
        # In practice, use embeddings and vector search
        query_lower = query.lower()
        scored_docs = []

        for doc in self.documents:
            score = sum(
                word in doc.content.lower() for word in query_lower.split()
            )
            if score > 0:
                scored_docs.append((score, doc))

        scored_docs.sort(reverse=True, key=lambda x: x[0])
        return [doc for _, doc in scored_docs[:top_k]]

    def generate_answer(
        self, question: str, context_docs: List[Document]
    ) -> str:
        """Generate answer from context (in practice, use an LLM)."""
        if not context_docs:
            return "I don't have enough information to answer that question."

        # Simplified: just return relevant content
        # In practice, use an LLM to synthesize an answer
        context_text = "\n".join(doc.content for doc in context_docs)
        return f"Based on the available information: {context_text[:200]}..."

    def answer(self, question: str) -> RAGResponse:
        """Main RAG pipeline."""
        if not question.strip():
            return RAGResponse(
                question=question,
                answer="Please provide a valid question.",
                retrieved_docs=[],
                confidence=0.0,
            )

        # Retrieve
        docs = self.retrieve(question)

        # Generate
        answer = self.generate_answer(question, docs)

        # Estimate confidence based on retrieval quality
        confidence = min(1.0, len(docs) / 3.0)

        return RAGResponse(
            question=question,
            answer=answer,
            retrieved_docs=docs,
            confidence=confidence,
        )
```
Setting Up Test Data
Create a knowledge base for testing:
```python
knowledge_base = [
    Document(
        content="Paris is the capital and largest city of France. It is known for the Eiffel Tower.",
        metadata={"source": "geography", "topic": "France"},
    ),
    Document(
        content="The Eiffel Tower is a wrought-iron lattice tower in Paris. It was completed in 1889.",
        metadata={"source": "landmarks", "topic": "Eiffel Tower"},
    ),
    Document(
        content="France is a country in Western Europe. It has a population of about 67 million.",
        metadata={"source": "geography", "topic": "France"},
    ),
    Document(
        content="Python is a high-level programming language. It was created by Guido van Rossum.",
        metadata={"source": "technology", "topic": "Python"},
    ),
    Document(
        content="Machine learning is a subset of artificial intelligence focused on data-driven learning.",
        metadata={"source": "technology", "topic": "AI"},
    ),
]

rag = SimpleRAG(documents=knowledge_base)
```
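Before writing formal checks, it helps to sanity-check the pipeline by hand. A minimal sketch using the objects defined above:

```python
# Quick manual smoke test of the pipeline defined above
response = rag.answer("What is the capital of France?")
print(response.answer)               # answer text synthesized from top documents
print(len(response.retrieved_docs))  # how many documents were matched
print(response.confidence)           # rough score based on retrieval count
```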
Test 1: Basic Question Answering
Test that the system answers questions correctly:
```python
import asyncio

from giskard.agents.generators import Generator
from giskard.checks import (
    scenario,
    StringMatching,
    from_fn,
    set_default_generator,
)

# Configure the LLM used by LLM-based checks
set_default_generator(Generator(model="openai/gpt-5-mini"))


async def test_basic_qa():
    tc = (
        scenario("basic_qa_france_capital")
        .interact(
            inputs="What is the capital of France?",
            outputs=lambda inputs: rag.answer(inputs),
        )
        # Check that the answer mentions Paris
        .check(
            StringMatching(
                name="mentions_paris",
                content="Paris",
                key="trace.last.outputs.answer",
            )
        )
        # Check that documents were retrieved
        .check(
            from_fn(
                lambda trace: len(trace.last.outputs.retrieved_docs) > 0,
                name="retrieved_documents",
                success_message="Retrieved relevant documents",
                failure_message="No documents retrieved",
            )
        )
        # Check that confidence is reasonable
        .check(
            from_fn(
                lambda trace: trace.last.outputs.confidence > 0.5,
                name="confident_answer",
                success_message="High confidence answer",
                failure_message="Low confidence answer",
            )
        )
    )
    result = await tc.run()

    print(f"Test passed: {result.passed}")
    for check_result in result.results:
        print(f"  {check_result.name}: {check_result.status.value}")


# Run the test
asyncio.run(test_basic_qa())
```
Test 2: Groundedness Check
Verify that answers are grounded in retrieved context:
```python
from giskard.checks import scenario, Groundedness, StringMatching


async def test_groundedness():
    tc = (
        scenario("groundedness_eiffel_tower")
        .interact(
            inputs="When was the Eiffel Tower completed?",
            outputs=lambda inputs: rag.answer(inputs),
        )
        .check(
            Groundedness(
                name="answer_grounded",
                description="Answer should be based on retrieved documents",
            )
        )
        .check(
            StringMatching(
                name="mentions_year",
                content="1889",
                key="trace.last.outputs.answer",
            )
        )
    )
    result = await tc.run()
    assert result.passed
```
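As with the first test, this coroutine must be driven by an event loop, e.g. `asyncio.run(test_groundedness())`.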
Test 3: Retrieval Quality
Test that the right documents are retrieved:
```python
from giskard.checks import scenario, from_fn


def check_retrieved_topics(trace) -> bool:
    """Verify retrieved docs are about the right topic."""
    docs = trace.last.outputs.retrieved_docs
    topics = [doc.metadata.get("topic") for doc in docs]
    return "Eiffel Tower" in topics or "France" in topics


tc = (
    scenario("retrieval_quality")
    .interact(
        inputs="Tell me about the Eiffel Tower",
        outputs=lambda inputs: rag.answer(inputs),
    )
    .check(
        from_fn(
            lambda trace: len(trace.last.outputs.retrieved_docs) >= 2,
            name="sufficient_context",
            success_message="Retrieved multiple documents",
            failure_message="Not enough documents retrieved",
        )
    )
    .check(
        from_fn(
            check_retrieved_topics,
            name="relevant_topics",
            success_message="Retrieved documents are topically relevant",
            failure_message="Retrieved documents are off-topic",
        )
    )
)
```
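Note that this `tc` scenario is built but not executed at definition time; like the async tests above, it runs with `await tc.run()` inside a coroutine (or via `asyncio.run`), or can be collected into a suite as shown in the Complete Test Suite section below. The same applies to the standalone scenarios in Tests 4 and 5.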
Test 4: Out-of-Scope Questions
Test how the system handles questions it can’t answer:
```python
from giskard.checks import scenario, LLMJudge, from_fn

tc = (
    scenario("out_of_scope_handling")
    .interact(
        inputs="What is the weather in Tokyo today?",
        outputs=lambda inputs: rag.answer(inputs),
    )
    .check(
        # Note: SimpleRAG's keyword matching also matches stopwords like
        # "is" and "the", so this strict check realistically requires a
        # retriever that filters out irrelevant documents.
        from_fn(
            lambda trace: len(trace.last.outputs.retrieved_docs) == 0,
            name="no_irrelevant_docs",
            success_message="Correctly retrieved no documents",
            failure_message="Retrieved documents for out-of-scope question",
        )
    )
    .check(
        LLMJudge(
            name="appropriate_fallback",
            prompt="""
            Evaluate if the system appropriately indicates it cannot answer.

            Question: {{ inputs }}
            Answer: {{ outputs.answer }}

            The answer should politely indicate insufficient information.
            Return 'passed: true' if appropriate, 'passed: false' if it
            makes up an answer.
            """,
        )
    )
)
```
Test 5: Answer Quality with LLM Judge
Use an LLM to evaluate answer quality comprehensively:
```python
from giskard.checks import scenario, LLMJudge

tc = (
    scenario("comprehensive_quality_check")
    .interact(
        inputs="What is machine learning?",
        outputs=lambda inputs: rag.answer(inputs),
    )
    .check(
        LLMJudge(
            name="answer_quality",
            prompt="""
            Evaluate the answer quality based on these criteria:

            Question: {{ inputs }}
            Answer: {{ outputs.answer }}
            Retrieved Context: {{ outputs.retrieved_docs }}

            Criteria:
            1. Accuracy: Is the answer factually correct?
            2. Completeness: Does it fully address the question?
            3. Clarity: Is it well-written and understandable?
            4. Relevance: Does it stay on topic?

            Return 'passed: true' if the answer meets all criteria.
            Provide brief reasoning.
            """,
        )
    )
)
```
Test 6: Multi-Turn Conversational RAG
Test a conversational RAG that handles follow-up questions:
```python
from giskard.checks import (
    scenario,
    Groundedness,
    from_fn,
    LLMJudge,
    StringMatching,
)


class ConversationalRAG(SimpleRAG):
    def __init__(self, documents):
        super().__init__(documents)
        self.conversation_history = []

    def answer(self, question: str) -> RAGResponse:
        # Resolve references using conversation history
        resolved_question = self._resolve_references(
            question, self.conversation_history
        )

        response = super().answer(resolved_question)

        self.conversation_history.append({
            "question": question,
            "resolved_question": resolved_question,
            "answer": response.answer,
        })

        return response

    def _resolve_references(self, question: str, history: list) -> str:
        """Resolve pronouns and references in follow-up questions."""
        # Simplified: in practice, use an LLM for coreference resolution.
        # Match whole words, not substrings ("capital" contains "it").
        words = question.lower().split()
        if history and ("it" in words or "its" in words):
            # Reuse the topic from the previous question
            prev_question = history[-1]["resolved_question"]
            return f"{question} (referring to: {prev_question})"
        return question


conv_rag = ConversationalRAG(documents=knowledge_base)

test_scenario = (
    scenario("conversational_rag_flow")
    # First question
    .interact(
        inputs="What is the capital of France?",
        outputs=lambda inputs: conv_rag.answer(inputs),
    )
    .check(Groundedness(name="first_answer_grounded"))
    .check(
        StringMatching(
            name="first_mentions_paris",
            content="Paris",
            key="trace.last.outputs.answer",
        )
    )
    # Follow-up question with reference
    .interact(
        inputs="What is it known for?",
        outputs=lambda inputs: conv_rag.answer(inputs),
    )
    .check(Groundedness(name="followup_grounded"))
    .check(
        LLMJudge(
            name="resolves_reference",
            prompt="""
            Check if the answer appropriately addresses the follow-up
            question in the context of the conversation.

            First Q: {{ interactions[0].inputs }}
            First A: {{ interactions[0].outputs.answer }}
            Follow-up Q: {{ interactions[1].inputs }}
            Follow-up A: {{ interactions[1].outputs.answer }}

            The follow-up should discuss what Paris is known for.
            Return 'passed: true' if the context was maintained correctly.
            """,
        )
    )
)


async def test_conversational_rag():
    result = await test_scenario.run()
    print(f"Conversational RAG test passed: {result.passed}")
```
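As before, run it with `asyncio.run(test_conversational_rag())`. Note that `conv_rag` accumulates history across calls, so create a fresh instance per test run if you need isolated conversations.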
Complete Test Suite
Combine all tests into a comprehensive suite:
```python
import asyncio
from typing import List

from giskard.checks import TestCase


class RAGTestSuite:
    def __init__(self, rag_system: SimpleRAG):
        self.rag = rag_system
        self.test_cases = []
        self._build_test_cases()

    def _build_test_cases(self):
        """Build all test cases."""
        # Add basic QA tests
        self.test_cases.extend(self._create_qa_tests())

        # Add groundedness tests
        self.test_cases.extend(self._create_groundedness_tests())

        # Add edge case tests
        self.test_cases.extend(self._create_edge_case_tests())

    def _create_qa_tests(self) -> List[TestCase]:
        """Create basic QA test cases."""
        test_data = [
            ("What is the capital of France?", "Paris"),
            ("When was the Eiffel Tower completed?", "1889"),
            ("What is Python?", "programming language"),
        ]

        tests = []
        for question, expected_content in test_data:
            tc = (
                scenario(f"qa_{expected_content.replace(' ', '_')}")
                .interact(
                    inputs=question,
                    outputs=lambda q: self.rag.answer(q),
                )
                .check(
                    StringMatching(
                        name=f"contains_{expected_content}",
                        content=expected_content,
                        key="trace.last.outputs.answer",
                    )
                )
                .check(
                    from_fn(
                        lambda trace: len(trace.last.outputs.retrieved_docs) > 0,
                        name="has_context",
                    )
                )
            )
            tests.append(tc)

        return tests

    def _create_groundedness_tests(self) -> List[TestCase]:
        """Create groundedness test cases."""
        questions = [
            "What is the capital of France?",
            "Tell me about the Eiffel Tower",
            "What is machine learning?",
        ]

        tests = []
        for question in questions:
            tc = (
                scenario(f"groundedness_{question[:20]}")
                .interact(
                    inputs=question,
                    outputs=lambda q: self.rag.answer(q),
                )
                .check(Groundedness(name="grounded"))
            )
            tests.append(tc)

        return tests

    def _create_edge_case_tests(self) -> List[TestCase]:
        """Create edge case test cases."""
        edge_cases = [
            ("", "empty_query"),
            ("   ", "whitespace_query"),
            ("What is the weather in Tokyo?", "out_of_scope"),
            ("askdjhaksjdhaksjdh", "gibberish"),
        ]

        tests = []
        for question, case_name in edge_cases:
            tc = (
                scenario(f"edge_case_{case_name}")
                .interact(
                    inputs=question,
                    outputs=lambda q: self.rag.answer(q),
                )
                .check(
                    from_fn(
                        # Any non-empty answer counts as a response
                        lambda trace: bool(trace.last.outputs.answer),
                        name="provides_response",
                        success_message="System provided a response",
                    )
                )
            )
            tests.append(tc)

        return tests

    async def run_all(self):
        """Run all tests and report results."""
        results = []

        for tc in self.test_cases:
            result = await tc.run()
            results.append((tc.name, result))

        # Summary
        passed = sum(1 for _, r in results if r.passed)
        total = len(results)

        print(f"\nTest Suite Results: {passed}/{total} passed ({passed / total * 100:.1f}%)")
        print("\nDetailed Results:")

        for name, result in results:
            status = "✓" if result.passed else "✗"
            print(f"  {status} {name}")
            if not result.passed:
                for check_result in result.results:
                    if not check_result.passed:
                        print(f"    - {check_result.name}: {check_result.message}")

        return results


# Run the complete suite
async def main():
    suite = RAGTestSuite(rag)
    await suite.run_all()


asyncio.run(main())
```
Best Practices for RAG Testing
1. Test Retrieval Separately
Validate retrieval quality before testing end-to-end:
```python
def test_retrieval_precision():
    docs = rag.retrieve("Eiffel Tower")
    relevant_topics = ["Eiffel Tower", "France", "Paris"]
    assert all(
        any(topic in doc.metadata.get("topic", "") for topic in relevant_topics)
        for doc in docs
    )
```

2. Use Representative Test Data
Include diverse question types; a sketch for generating such a mix follows the list:
- Factual questions
- Definitional questions
- Comparison questions
- Out-of-scope questions
- Ambiguous questions
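One way to cover these categories without hand-writing each test is a small table of (category, question) pairs expanded into scenarios. A minimal sketch reusing the `scenario` and `from_fn` helpers from above (the questions here are illustrative):

```python
# Illustrative mix of question types; swap in questions from your own domain
question_mix = [
    ("factual", "When was the Eiffel Tower completed?"),
    ("definitional", "What is machine learning?"),
    ("comparison", "Is Paris bigger than other French cities?"),
    ("out_of_scope", "What is the weather in Tokyo today?"),
    ("ambiguous", "Tell me about it"),
]

# One scenario per category, each with a basic non-empty-answer check
mixed_tests = [
    scenario(f"qtype_{category}")
    .interact(inputs=question, outputs=lambda q: rag.answer(q))
    .check(
        from_fn(
            lambda trace: bool(trace.last.outputs.answer.strip()),
            name="non_empty_answer",
        )
    )
    for category, question in question_mix
]
```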
3. Monitor Confidence Scores
Track confidence metrics to identify problematic queries:
```python
checks = [
    from_fn(
        lambda trace: trace.last.outputs.confidence,
        name="track_confidence",
        success_message=lambda trace: f"Confidence: {trace.last.outputs.confidence}",
    ),
]
```

4. Test with Real User Queries
Collect and test with actual user questions from logs.
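For example, an exported log can be replayed through the same scenario machinery. A minimal sketch, where the `user_queries.txt` file name, its one-question-per-line format, and the sample size are all assumptions:

```python
import random

# Hypothetical log export: one user question per line (format is assumed)
with open("user_queries.txt") as f:
    logged_queries = [line.strip() for line in f if line.strip()]

# Replay a random sample of real queries with a basic sanity check
replay_tests = [
    scenario(f"replay_{i}")
    .interact(inputs=q, outputs=lambda q: rag.answer(q))
    .check(from_fn(lambda trace: bool(trace.last.outputs.answer), name="answered"))
    for i, q in enumerate(random.sample(logged_queries, min(20, len(logged_queries))))
]
```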
Next Steps
- See Testing Agents for agent-specific testing patterns
- Explore Chatbot Testing for conversational testing
- Review Multi-Turn Scenarios for advanced scenarios