RAG Evaluation

This tutorial shows how to build a comprehensive test suite for a Retrieval-Augmented Generation (RAG) system.

We’ll test a RAG system that answers questions by:

  1. Retrieving relevant context from a knowledge base
  2. Generating an answer grounded in that context
  3. Handling out-of-scope questions appropriately

Our test suite will validate:

  • Retrieval quality: Are the retrieved documents relevant?
  • Groundedness: Is the answer based on the retrieved context?
  • Answer quality: Is the answer accurate and complete?
  • Edge-case handling: Out-of-scope questions, empty queries, etc.

First, let’s create a simple RAG system to test:

from typing import List

from pydantic import BaseModel


class Document(BaseModel):
    content: str
    metadata: dict


class RAGResponse(BaseModel):
    question: str
    answer: str
    retrieved_docs: List[Document]
    confidence: float


class SimpleRAG:
    def __init__(self, documents: List[Document]):
        self.documents = documents

    def retrieve(self, query: str, top_k: int = 3) -> List[Document]:
        """Retrieve relevant documents (simplified keyword matching)."""
        # In practice, use embeddings and vector search. Stopwords are
        # filtered so unrelated queries don't match every document.
        stopwords = {
            "what", "is", "the", "a", "an", "in", "of", "to",
            "it", "was", "when", "tell", "me", "about", "today",
        }
        query_words = [word.strip("?!.,") for word in query.lower().split()]
        query_words = [w for w in query_words if w and w not in stopwords]
        scored_docs = []
        for doc in self.documents:
            content = doc.content.lower()
            score = sum(word in content for word in query_words)
            if score > 0:
                scored_docs.append((score, doc))
        scored_docs.sort(reverse=True, key=lambda x: x[0])
        return [doc for _, doc in scored_docs[:top_k]]

    def generate_answer(
        self,
        question: str,
        context_docs: List[Document],
    ) -> str:
        """Generate an answer from context (in practice, use an LLM)."""
        if not context_docs:
            return "I don't have enough information to answer that question."
        # Simplified: just return the relevant content.
        # In practice, use an LLM to synthesize an answer.
        context_text = "\n".join(doc.content for doc in context_docs)
        return f"Based on the available information: {context_text[:200]}..."

    def answer(self, question: str) -> RAGResponse:
        """Main RAG pipeline."""
        if not question.strip():
            return RAGResponse(
                question=question,
                answer="Please provide a valid question.",
                retrieved_docs=[],
                confidence=0.0,
            )
        # Retrieve
        docs = self.retrieve(question)
        # Generate
        answer = self.generate_answer(question, docs)
        # Estimate confidence based on retrieval quality
        confidence = min(1.0, len(docs) / 3.0)
        return RAGResponse(
            question=question,
            answer=answer,
            retrieved_docs=docs,
            confidence=confidence,
        )

Create a knowledge base for testing:

knowledge_base = [
    Document(
        content="Paris is the capital and largest city of France. It is known for the Eiffel Tower.",
        metadata={"source": "geography", "topic": "France"},
    ),
    Document(
        content="The Eiffel Tower is a wrought-iron lattice tower in Paris. It was completed in 1889.",
        metadata={"source": "landmarks", "topic": "Eiffel Tower"},
    ),
    Document(
        content="France is a country in Western Europe. It has a population of about 67 million.",
        metadata={"source": "geography", "topic": "France"},
    ),
    Document(
        content="Python is a high-level programming language. It was created by Guido van Rossum.",
        metadata={"source": "technology", "topic": "Python"},
    ),
    Document(
        content="Machine learning is a subset of artificial intelligence focused on data-driven learning.",
        metadata={"source": "technology", "topic": "AI"},
    ),
]

rag = SimpleRAG(documents=knowledge_base)
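
A quick smoke test confirms the pipeline works before we write formal checks:

response = rag.answer("What is the capital of France?")
print(response.answer)
print(f"Retrieved {len(response.retrieved_docs)} docs, confidence {response.confidence:.2f}")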

Test that the system answers questions correctly:

from giskard.agents.generators import Generator
from giskard.checks import (
    scenario,
    StringMatching,
    from_fn,
    set_default_generator,
)

# Configure the LLM used by model-based checks
set_default_generator(Generator(model="openai/gpt-5-mini"))


async def test_basic_qa():
    tc = (
        scenario("basic_qa_france_capital")
        .interact(
            inputs="What is the capital of France?",
            outputs=lambda inputs: rag.answer(inputs),
        )
        # Check that the answer mentions Paris
        .check(
            StringMatching(
                name="mentions_paris",
                content="Paris",
                key="trace.last.outputs.answer",
            )
        )
        # Check that documents were retrieved
        .check(
            from_fn(
                lambda trace: len(trace.last.outputs.retrieved_docs) > 0,
                name="retrieved_documents",
                success_message="Retrieved relevant documents",
                failure_message="No documents retrieved",
            )
        )
        # Check that confidence is reasonable
        .check(
            from_fn(
                lambda trace: trace.last.outputs.confidence > 0.5,
                name="confident_answer",
                success_message="High confidence answer",
                failure_message="Low confidence answer",
            )
        )
    )
    result = await tc.run()
    print(f"Test passed: {result.passed}")
    for check_result in result.results:
        print(f"  {check_result.name}: {check_result.status.value}")


# Run the test
import asyncio

asyncio.run(test_basic_qa())

Verify that answers are grounded in retrieved context:

from giskard.checks import scenario, Groundedness, StringMatching


async def test_groundedness():
    tc = (
        scenario("groundedness_eiffel_tower")
        .interact(
            inputs="When was the Eiffel Tower completed?",
            outputs=lambda inputs: rag.answer(inputs),
        )
        .check(
            Groundedness(
                name="answer_grounded",
                description="Answer should be based on retrieved documents",
            )
        )
        .check(
            StringMatching(
                name="mentions_year",
                content="1889",
                key="trace.last.outputs.answer",
            )
        )
    )
    result = await tc.run()
    assert result.passed

Test that the right documents are retrieved:

from giskard.checks import scenario, from_fn


def check_retrieved_topics(trace) -> bool:
    """Verify retrieved docs are about the right topic."""
    docs = trace.last.outputs.retrieved_docs
    topics = [doc.metadata.get("topic") for doc in docs]
    return "Eiffel Tower" in topics or "France" in topics


tc = (
    scenario("retrieval_quality")
    .interact(
        inputs="Tell me about the Eiffel Tower",
        outputs=lambda inputs: rag.answer(inputs),
    )
    .check(
        from_fn(
            lambda trace: len(trace.last.outputs.retrieved_docs) >= 2,
            name="sufficient_context",
            success_message="Retrieved multiple documents",
            failure_message="Not enough documents retrieved",
        )
    )
    .check(
        from_fn(
            check_retrieved_topics,
            name="relevant_topics",
            success_message="Retrieved documents are topically relevant",
            failure_message="Retrieved documents are off-topic",
        )
    )
)
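
Unlike the earlier tests, this snippet (and the scenarios in the next two sections) only defines the test case; run it the same way as before:

import asyncio

result = asyncio.run(tc.run())
print(f"Retrieval quality test passed: {result.passed}")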

Test how the system handles questions it can’t answer:

from giskard.checks import scenario, LLMJudge, from_fn

tc = (
    scenario("out_of_scope_handling")
    .interact(
        inputs="What is the weather in Tokyo today?",
        outputs=lambda inputs: rag.answer(inputs),
    )
    .check(
        from_fn(
            lambda trace: len(trace.last.outputs.retrieved_docs) == 0,
            name="no_irrelevant_docs",
            success_message="Correctly retrieved no documents",
            failure_message="Retrieved documents for out-of-scope question",
        )
    )
    .check(
        LLMJudge(
            name="appropriate_fallback",
            prompt="""
            Evaluate if the system appropriately indicates it cannot answer.

            Question: {{ inputs }}
            Answer: {{ outputs.answer }}

            The answer should politely indicate insufficient information.
            Return 'passed: true' if appropriate, 'passed: false' if it makes up an answer.
            """,
        )
    )
)

Use an LLM to evaluate answer quality comprehensively:

from giskard.checks import scenario, LLMJudge

tc = (
    scenario("comprehensive_quality_check")
    .interact(
        inputs="What is machine learning?",
        outputs=lambda inputs: rag.answer(inputs),
    )
    .check(
        LLMJudge(
            name="answer_quality",
            prompt="""
            Evaluate the answer quality based on these criteria:

            Question: {{ inputs }}
            Answer: {{ outputs.answer }}
            Retrieved Context: {{ outputs.retrieved_docs }}

            Criteria:
            1. Accuracy: Is the answer factually correct?
            2. Completeness: Does it fully address the question?
            3. Clarity: Is it well-written and understandable?
            4. Relevance: Does it stay on topic?

            Return 'passed: true' if the answer meets all criteria.
            Provide brief reasoning.
            """,
        )
    )
)

Test a conversational RAG that handles follow-up questions:

from giskard.checks import (
    scenario,
    Groundedness,
    from_fn,
    LLMJudge,
    StringMatching,
)


class ConversationalRAG(SimpleRAG):
    def __init__(self, documents):
        super().__init__(documents)
        self.conversation_history = []

    def answer(self, question: str) -> RAGResponse:
        # Resolve references using conversation history
        resolved_question = self._resolve_references(
            question,
            self.conversation_history,
        )
        response = super().answer(resolved_question)
        self.conversation_history.append({
            "question": question,
            "resolved_question": resolved_question,
            "answer": response.answer,
        })
        return response

    def _resolve_references(self, question: str, history: list) -> str:
        """Resolve pronouns and references in follow-up questions."""
        # Simplified: in practice, use an LLM for coreference resolution.
        # Match whole words so e.g. "capital" doesn't trigger on "it".
        question_words = {w.strip("?!.,") for w in question.lower().split()}
        if history and ({"it", "its"} & question_words):
            # Get the topic from the previous question
            prev_question = history[-1]["resolved_question"]
            return f"{question} (referring to: {prev_question})"
        return question


conv_rag = ConversationalRAG(documents=knowledge_base)

test_scenario = (
    scenario("conversational_rag_flow")
    # First question
    .interact(
        inputs="What is the capital of France?",
        outputs=lambda inputs: conv_rag.answer(inputs),
    )
    .check(Groundedness(name="first_answer_grounded"))
    .check(
        StringMatching(
            name="first_mentions_paris",
            content="Paris",
            key="trace.last.outputs.answer",
        )
    )
    # Follow-up question with a reference
    .interact(
        inputs="What is it known for?",
        outputs=lambda inputs: conv_rag.answer(inputs),
    )
    .check(Groundedness(name="followup_grounded"))
    .check(
        LLMJudge(
            name="resolves_reference",
            prompt="""
            Check if the answer appropriately addresses the follow-up question
            in the context of the conversation.

            First Q: {{ interactions[0].inputs }}
            First A: {{ interactions[0].outputs.answer }}
            Follow-up Q: {{ interactions[1].inputs }}
            Follow-up A: {{ interactions[1].outputs.answer }}

            The follow-up should discuss what Paris is known for.
            Return 'passed: true' if the context was maintained correctly.
            """,
        )
    )
)


async def test_conversational_rag():
    result = await test_scenario.run()
    print(f"Conversational RAG test passed: {result.passed}")

Combine all tests into a comprehensive suite:

import asyncio
from typing import List

from giskard.checks import (
    TestCase,
    scenario,
    Groundedness,
    StringMatching,
    from_fn,
)


class RAGTestSuite:
    def __init__(self, rag_system: SimpleRAG):
        self.rag = rag_system
        self.test_cases = []
        self._build_test_cases()

    def _build_test_cases(self):
        """Build all test cases."""
        # Basic QA tests
        self.test_cases.extend(self._create_qa_tests())
        # Groundedness tests
        self.test_cases.extend(self._create_groundedness_tests())
        # Edge case tests
        self.test_cases.extend(self._create_edge_case_tests())

    def _create_qa_tests(self) -> List[TestCase]:
        """Create basic QA test cases."""
        test_data = [
            ("What is the capital of France?", "Paris"),
            ("When was the Eiffel Tower completed?", "1889"),
            ("What is Python?", "programming language"),
        ]
        tests = []
        for question, expected_content in test_data:
            tc = (
                scenario(f"qa_{expected_content.replace(' ', '_')}")
                .interact(
                    inputs=question,
                    outputs=lambda q: self.rag.answer(q),
                )
                .check(
                    StringMatching(
                        name=f"contains_{expected_content}",
                        content=expected_content,
                        key="trace.last.outputs.answer",
                    )
                )
                .check(
                    from_fn(
                        lambda trace: len(trace.last.outputs.retrieved_docs) > 0,
                        name="has_context",
                    )
                )
            )
            tests.append(tc)
        return tests

    def _create_groundedness_tests(self) -> List[TestCase]:
        """Create groundedness test cases."""
        questions = [
            "What is the capital of France?",
            "Tell me about the Eiffel Tower",
            "What is machine learning?",
        ]
        tests = []
        for question in questions:
            tc = (
                scenario(f"groundedness_{question[:20]}")
                .interact(
                    inputs=question,
                    outputs=lambda q: self.rag.answer(q),
                )
                .check(Groundedness(name="grounded"))
            )
            tests.append(tc)
        return tests

    def _create_edge_case_tests(self) -> List[TestCase]:
        """Create edge case test cases."""
        edge_cases = [
            ("", "empty_query"),
            (" ", "whitespace_query"),
            ("What is the weather in Tokyo?", "out_of_scope"),
            ("askdjhaksjdhaksjdh", "gibberish"),
        ]
        tests = []
        for question, case_name in edge_cases:
            tc = (
                scenario(f"edge_case_{case_name}")
                .interact(
                    inputs=question,
                    outputs=lambda q: self.rag.answer(q),
                )
                .check(
                    from_fn(
                        lambda trace: bool(trace.last.outputs.answer),
                        name="provides_response",
                        success_message="System provided a response",
                    )
                )
            )
            tests.append(tc)
        return tests

    async def run_all(self):
        """Run all tests and report results."""
        results = []
        for tc in self.test_cases:
            result = await tc.run()
            results.append((tc.name, result))

        # Summary
        passed = sum(1 for _, r in results if r.passed)
        total = len(results)
        print(f"\nTest Suite Results: {passed}/{total} passed ({passed/total*100:.1f}%)")
        print("\nDetailed Results:")
        for name, result in results:
            status = "✓" if result.passed else "✗"
            print(f"  {status} {name}")
            if not result.passed:
                for check_result in result.results:
                    if not check_result.passed:
                        print(f"    - {check_result.name}: {check_result.message}")
        return results


# Run the complete suite
async def main():
    suite = RAGTestSuite(rag)
    await suite.run_all()

asyncio.run(main())

Best Practices

1. Test Retrieval Separately

Validate retrieval quality before testing end-to-end:

def test_retrieval_precision():
    docs = rag.retrieve("Eiffel Tower")
    relevant_topics = ["Eiffel Tower", "France", "Paris"]
    assert all(
        any(topic in doc.metadata.get("topic", "") for topic in relevant_topics)
        for doc in docs
    )
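
Precision alone can hide missed documents. A companion recall sketch, assuming a small hand-labeled set of (query, expected topic) pairs:

def test_retrieval_recall():
    # Hypothetical labeled pairs: each query should surface its expected topic
    labeled_queries = [
        ("capital of France", "France"),
        ("Eiffel Tower history", "Eiffel Tower"),
        ("machine learning basics", "AI"),
    ]
    for query, expected_topic in labeled_queries:
        topics = [doc.metadata.get("topic") for doc in rag.retrieve(query)]
        assert expected_topic in topics, f"{expected_topic!r} missing for {query!r}"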

2. Use Representative Test Data

Include diverse question types; the sketch after this list turns such a set into test cases:

  • Factual questions
  • Definitional questions
  • Comparison questions
  • Out-of-scope questions
  • Ambiguous questions
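
A minimal data-driven sketch, using a hypothetical labeled question set (expected content is None when no grounded answer should exist):

from giskard.checks import scenario, StringMatching

# Hypothetical labeled set: (question, question type, content the answer should mention)
question_set = [
    ("What is the capital of France?", "factual", "Paris"),
    ("What is machine learning?", "definitional", "artificial intelligence"),
    ("Is the Eiffel Tower older than Python?", "comparison", None),
    ("What is the weather in Tokyo?", "out_of_scope", None),
    ("Tell me about the tower", "ambiguous", None),
]

test_cases = []
for i, (question, question_type, expected) in enumerate(question_set):
    tc = scenario(f"{question_type}_{i}").interact(
        inputs=question,
        outputs=lambda q: rag.answer(q),
    )
    if expected is not None:
        tc = tc.check(
            StringMatching(
                name="expected_content",
                content=expected,
                key="trace.last.outputs.answer",
            )
        )
    test_cases.append(tc)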

3. Monitor Confidence Scores

Track confidence metrics to identify problematic queries:

checks = [
    from_fn(
        lambda trace: trace.last.outputs.confidence,
        name="track_confidence",
        success_message=lambda trace: f"Confidence: {trace.last.outputs.confidence}",
    ),
]

4. Test with Real User Queries

Collect and test with actual user questions from logs.
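
A sketch of replaying logged queries, assuming a hypothetical user_queries.log file with one query per line:

from pathlib import Path

from giskard.checks import scenario, from_fn

replay_tests = []
for i, line in enumerate(Path("user_queries.log").read_text().splitlines()):
    query = line.strip()
    if not query:
        continue
    tc = (
        scenario(f"user_query_{i}")
        .interact(inputs=query, outputs=lambda q: rag.answer(q))
        .check(
            from_fn(
                lambda trace: bool(trace.last.outputs.answer),
                name="provides_response",
            )
        )
    )
    replay_tests.append(tc)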