Skip to content
GitHub · Discord

Testing Utilities

Testing utilities, test runners, and debugging helpers.


Bundle a trace with a set of checks to execute.

Module: giskard.checks.core.testcase

TestCase combines a trace (with pre-recorded interactions) and a sequence of checks to run against that trace. This is useful for testing against fixed interaction sequences or replaying recorded conversations.

| Attribute | Type | Description |
| --------- | ---- | ----------- |
| name | str \| None | Optional label for the test case |
| trace | Trace | The trace containing interactions to test against |
| checks | Sequence[Check] | Sequence of checks to run against the trace |
from giskard.checks import TestCase, Trace, Interaction, Equals
# Create a trace with interactions
trace = Trace(interactions=[
Interaction(inputs="Hello", outputs="Hi there!"),
Interaction(inputs="How are you?", outputs="I'm doing well!"),
])
# Create test case with checks
test_case = TestCase(
name="greeting_test",
trace=trace,
checks=[
Equals(expected="Hi there!", key="trace.interactions[0].outputs"),
Equals(expected="I'm doing well!", key="trace.interactions[1].outputs"),
]
)
# Run the test case
result = await test_case.run()

run(return_exception=False) -> TestCaseResult

Section titled “run(return_exception=False) -> TestCaseResult”

Execute all checks against the trace.

Parameters:

  • return_exception (bool): If True, return results even when exceptions occur instead of raising

Returns: TestCaseResult with check outcomes

result = await test_case.run()
print(f"Status: {result.status}")
print(f"Passed: {result.passed}")
print(f"Failed: {result.failed}")

Run the test case and assert that it passed.

Raises:

  • AssertionError: If the test case did not pass, with formatted failure messages
# Use in tests - raises if any check fails
await test_case.assert_passed()
# Equivalent to:
result = await test_case.run()
result.assert_passed()

Result of test case execution with check outcomes.

Module: giskard.checks.core.result

| Attribute | Type | Description |
| --------- | ---- | ----------- |
| status | CheckStatus | Overall test case status (PASS/FAIL/ERROR) |
| trace | Trace | The trace that was tested |
| check_results | list[CheckResult] | Results from all checks |
| passed | bool | True if all checks passed |
| failed | bool | True if any check failed |
| message | str \| None | Optional summary message |
| details | dict[str, Any] | Additional execution details |
result = await test_case.run()
# Check overall status
if result.passed:
print("✓ All checks passed!")
else:
print(f"✗ Test failed: {result.message}")
# Review individual check results
for i, check_result in enumerate(result.check_results):
status_icon = "✓" if check_result.passed else "✗"
print(f"{status_icon} Check {i}: {check_result.message}")
# Assert passed (raises if failed)
result.assert_passed()

Low-level runner for executing test cases.

Module: giskard.checks.testing.runner

TestCaseRunner provides the execution engine for running test cases. Most users should use test_case.run() rather than using TestCaseRunner directly.

run(test_case, return_exception=False) -> TestCaseResult

Section titled “run(test_case, return_exception=False) -> TestCaseResult”

Execute a test case’s checks against its trace.

Parameters:

  • test_case (TestCase): The test case to execute
  • return_exception (bool): If True, return results even when exceptions occur

Returns: TestCaseResult with check outcomes

from giskard.checks.testing.runner import TestCaseRunner
runner = TestCaseRunner()
result = await runner.run(test_case)

Get the default process-wide TestCaseRunner instance.

Module: giskard.checks.testing.runner

from giskard.checks.testing.runner import get_runner
runner = get_runner()
result = await runner.run(test_case)

Spy on function calls during interaction generation for debugging.

Module: giskard.checks.testing.spy

WithSpy wraps an interaction spec and records all function calls made during interaction generation. This is useful for debugging complex interaction specs or understanding what’s happening during scenario execution.

| Attribute | Type | Description |
| --------- | ---- | ----------- |
| interaction_generator | BaseInteractionSpec | The interaction spec to spy on |
| target | str | JSONPath to the value to spy on |
from giskard.checks import scenario, InteractionSpec
from giskard.checks.testing.spy import WithSpy
# Create an interaction spec to spy on
interaction_spec = InteractionSpec(
inputs=lambda trace: f"Context: {trace.last.outputs if trace.last else 'None'}",
outputs=lambda inputs: my_model(inputs)
)
# Wrap with spy
spied_spec = WithSpy(
interaction_generator=interaction_spec,
target="trace.last.outputs"
)
# Use in scenario
test_scenario = (
scenario("debug_test")
.add_spec(spied_spec)
)
result = await test_scenario.run()
# Access spy data
print("Function calls recorded:")
print(result.details.get("spy_data"))
from giskard.checks import scenario, from_fn
from giskard.checks.testing.spy import WithSpy
# Spy on a complex callable
def complex_output_generator(inputs):
# Complex logic
processed = process_input(inputs)
enriched = enrich_data(processed)
return generate_response(enriched)
# Create spied interaction
spied_interaction = WithSpy(
interaction_generator=InteractionSpec(
inputs="test input",
outputs=complex_output_generator
),
target="trace.last.outputs"
)
# Run and debug
result = await scenario("debug").add_spec(spied_interaction).run()

from giskard.checks import TestCase, Trace, Interaction, from_fn
# Load recorded conversation
recorded_interactions = [
Interaction(
inputs="What's the weather?",
outputs="It's sunny and 75°F.",
metadata={"timestamp": "2024-01-01T10:00:00"}
),
Interaction(
inputs="Should I bring an umbrella?",
outputs="No, you won't need one today.",
metadata={"timestamp": "2024-01-01T10:00:15"}
),
]
trace = Trace(interactions=recorded_interactions)
# Test against recorded conversation
test_case = TestCase(
name="weather_conversation_replay",
trace=trace,
checks=[
from_fn(
lambda t: "sunny" in t.interactions[0].outputs.lower(),
name="mentions_weather"
),
from_fn(
lambda t: "no" in t.interactions[1].outputs.lower(),
name="umbrella_not_needed"
),
]
)
await test_case.assert_passed()
# Run multiple test cases
test_cases = [
TestCase(name="test1", trace=trace1, checks=checks1),
TestCase(name="test2", trace=trace2, checks=checks2),
TestCase(name="test3", trace=trace3, checks=checks3),
]
# Run all in parallel
results = await asyncio.gather(*[tc.run() for tc in test_cases])
# Summary
passed = sum(1 for r in results if r.passed)
failed = sum(1 for r in results if r.failed)
print(f"Passed: {passed}/{len(results)}")
print(f"Failed: {failed}/{len(results)}")
import pytest
from giskard.checks import TestCase, Trace, Interaction, Equals
@pytest.mark.asyncio
async def test_greeting_response():
"""Test that greeting responses are polite."""
trace = Trace(interactions=[
Interaction(inputs="Hello", outputs="Hi there! How can I help?")
])
test_case = TestCase(
name="greeting_politeness",
trace=trace,
checks=[
from_fn(
lambda t: any(word in t.last.outputs.lower()
for word in ["hi", "hello", "hey"]),
name="has_greeting"
),
from_fn(
lambda t: "help" in t.last.outputs.lower(),
name="offers_help"
),
]
)
await test_case.assert_passed()
@pytest.mark.asyncio
async def test_error_handling():
"""Test that errors are handled gracefully."""
trace = Trace(interactions=[
Interaction(
inputs="invalid command",
outputs="I don't understand that command.",
metadata={"error_handled": True}
)
])
test_case = TestCase(
name="error_handling",
trace=trace,
checks=[
Equals(expected=True, key="trace.last.metadata.error_handled"),
from_fn(
lambda t: "don't understand" in t.last.outputs.lower(),
name="error_message_present"
),
]
)
await test_case.assert_passed()
import pytest
from giskard.checks import TestCase, Trace, Interaction, Equals
# Test data
test_data = [
("Hello", "Hi there!"),
("Good morning", "Good morning!"),
("Hey", "Hey! How can I help?"),
]
@pytest.mark.asyncio
@pytest.mark.parametrize("greeting_input,expected_output", test_data)
async def test_greeting_variations(greeting_input, expected_output):
"""Test various greeting inputs."""
trace = Trace(interactions=[
Interaction(inputs=greeting_input, outputs=expected_output)
])
test_case = TestCase(
name=f"greeting_{greeting_input}",
trace=trace,
checks=[
Equals(expected=expected_output, key="trace.last.outputs")
]
)
await test_case.assert_passed()