Skip to content
GitHub | Discord

Evaluations

An Evaluation runs an agent against all test cases in a dataset, applies the configured checks to each response, and produces a per-test-case result with a pass/fail verdict.

A remote evaluation calls your registered agent’s HTTP endpoint for every test case in the dataset.

import time
from giskard_hub import HubClient
hub = HubClient()

# Start a remote evaluation of the registered agent against the dataset.
evaluation = hub.evaluations.create(
    project_id="project-id",
    agent_id="agent-id",
    criteria={"dataset_id": "dataset-id"},
    name="v2.1 regression run",
)
print(evaluation.id)

# Poll until complete: re-fetch the evaluation every 5 seconds for as long
# as it reports the "running" state.
while evaluation.status.state == "running":
    time.sleep(5)
    evaluation = hub.evaluations.retrieve(evaluation.id)

print(f"Evaluation finished with status: {evaluation.status.state}")

Run the evaluation only against test cases with specific tags:

# Selection criteria: the dataset plus the tags a test case must carry.
shipping_criteria = {"dataset_id": "dataset-id", "tags": ["shipping"]}

evaluation = hub.evaluations.create(
    name="Shipping-only run",
    project_id="project-id",
    agent_id="agent-id",
    criteria=shipping_criteria,
)

Set run_count to run each test case multiple times (useful for measuring consistency across stochastic outputs):

# Number of times each test case is run.
repeats = 3

evaluation = hub.evaluations.create(
    name="Consistency check — 3x",
    project_id="project-id",
    agent_id="agent-id",
    criteria={"dataset_id": "dataset-id"},
    run_count=repeats,
)

A local evaluation lets you run inference using a Python function in your process rather than an HTTP endpoint. This is ideal for evaluating models during development without exposing an API.

The flow is manual: create the evaluation, fetch the results (which contain the test cases), call your agent for each one, and submit the outputs back to the Hub.

from giskard_hub.types import ChatMessage
def my_agent(messages: list[ChatMessage]) -> ChatMessage:
    """Produce the assistant reply for a conversation.

    Placeholder implementation: echoes the content of the last message in
    ``messages``. Swap the body for a call to your own model or chain.

    Args:
        messages: Conversation history; the last entry is used as the input.

    Returns:
        A ``ChatMessage`` with role ``"assistant"`` containing the reply.
    """
    # Call your local model or chain here
    user_input = messages[-1].content
    return ChatMessage(
        role="assistant",
        content=f"Echo: {user_input}",  # replace with real inference
    )
# 1. Create the local evaluation.
evaluation = hub.evaluations.create_local(
    agent={"name": "my_agent", "description": "A simple echo agent"},
    criteria=[{"dataset_id": "dataset-id"}],
    name="Local evaluation",
)

# 2. Fetch the results, asking for the test case to be included with each one.
results = hub.evaluations.results.list(
    evaluation_id=evaluation.id,
    include=["test_case"],
)

# 3. Run the agent on each test case and submit its output back to the Hub.
for result in results:
    messages = result.test_case.messages
    agent_output = my_agent(messages)
    hub.evaluations.results.submit_local_output(
        evaluation_id=evaluation.id,
        result_id=result.id,
        output={"response": agent_output},
    )

# List every result of an evaluation and print the verdict of each check.
results = hub.evaluations.results.list("evaluation-id")
for result in results:
    print(f"{result.id}: {result.state}")
    for check in result.results:
        # The original pass/fail markers were lost (both branches were empty
        # strings), so the verdict is spelled out explicitly.
        verdict = "PASS" if check.passed else "FAIL"
        print(f"  {verdict} {check.name}")

# Search results with a filter (here: the "fail" option of "sample_success").
results_search = hub.evaluations.results.search(
    "evaluation-id",
    filters={"sample_success": {"selected_options": ["fail"]}},
    limit=50,
)

# Retrieve a single result by ID.
result = hub.evaluations.results.retrieve(
    "result-id",
    evaluation_id="evaluation-id",
)
print(result.state)

Update the failure category of a result (manual review)

The full list of available failure categories for a project can be retrieved via hub.projects.retrieve("project-id").failure_categories.

# Failure category to assign to the result during manual review.
contradiction_category = {
    "identifier": "contradiction",
    "title": "Contradiction",
    "description": "The agent incorrectly provides an answer that contradicts the information given in the context (for groundedness checks) or in the reference (for correctness checks).",
}

hub.evaluations.results.update(
    "result-id",
    evaluation_id="evaluation-id",
    failure_category=contradiction_category,
)

You can hide individual results from the default view (for example, noisy outliers):

# Mark one result as hidden.
result_to_hide = "result-id"
hub.evaluations.results.update_visibility(
    result_to_hide,
    hidden=True,
    evaluation_id="evaluation-id",
)

If some test cases failed due to transient agent errors (timeouts, 5xx responses), rerun only the errored ones without triggering a full re-evaluation:

# Rerun only the errored results of this evaluation.
evaluation_to_retry = "evaluation-id"
hub.evaluations.rerun_errored_results(evaluation_to_retry)

Rerun a single specific result:

# Rerun one result, identified by its ID within the evaluation.
hub.evaluations.results.rerun_test_case(
    "result-id",
    evaluation_id="evaluation-id",
)

Use evaluations as a quality gate in your CI/CD pipeline. Exit with a non-zero code if the evaluation errors out or if the global pass rate falls below your threshold:

import os
import sys
import time
from giskard_hub import HubClient
hub = HubClient()

evaluation = hub.evaluations.create(
    project_id="project-id",
    agent_id="agent-id",
    criteria={"dataset_id": "dataset-id"},
    name=f"CI run — {os.environ.get('CI_COMMIT_SHA', 'local')}",
)

# Poll every 10 seconds for as long as the evaluation reports "running".
while evaluation.status.state == "running":
    time.sleep(10)
    evaluation = hub.evaluations.retrieve(evaluation.id)

if evaluation.status.state == "error":
    print("Evaluation encountered errors.")
    sys.exit(1)

# Take the metric entry named "global". Fail the gate with a clear message
# instead of crashing with IndexError (no such entry) or ZeroDivisionError
# (entry with a total of 0).
global_metrics = next((m for m in evaluation.metrics if m.name == "global"), None)
if global_metrics is None or not global_metrics.total:
    print("Quality gate failed: no global metrics available.")
    sys.exit(1)

pass_rate = global_metrics.passed / global_metrics.total * 100
print(f"Pass rate: {pass_rate:.2f}% ({global_metrics.passed}/{global_metrics.total})")

THRESHOLD = 90.0  # minimum acceptable pass rate, in percent
if pass_rate < THRESHOLD:
    print(f"Quality gate failed: pass rate {pass_rate:.1f}% < {THRESHOLD}%")
    sys.exit(1)

print("Quality gate passed.")

You can evaluate a single (input, output) pair against a set of checks without running a full evaluation. This is useful for debugging or CI gates on individual responses:

from giskard_hub.types import ChatMessage
# Evaluate one (input, output) pair against the listed checks.
results = hub.evaluations.run_single(
    project_id="project-id",
    agent_output={"response": ChatMessage(role="assistant", content="You can return anything within 30 days.")},
    messages=[{"role": "user", "content": "What is your return policy?"}],
    checks=[
        {"identifier": "tone_professional"},
    ],
)

# Print the outcome of each check.
for check in results:
    print(f"{check.name}: {'passed' if check.passed else 'failed'}")

# List the evaluations of a project.
evaluations = hub.evaluations.list(project_id="project-id")

# Rename an evaluation, then delete it.
target_evaluation = "evaluation-id"
hub.evaluations.update(target_evaluation, name="Renamed evaluation")
hub.evaluations.delete(target_evaluation)

Scheduled Evaluations automatically run an evaluation on a regular cadence — daily, weekly, or monthly. They’re the foundation of continuous quality monitoring: set them up once and the Hub will run them automatically, so you catch regressions without any manual effort.

# Timing of the weekly schedule.
weekly_timing = {
    "frequency": "weekly",
    "time": "09:00",  # UTC time of day
    "day_of_week": 1,  # 1 = Monday, 7 = Sunday
}

schedule = hub.scheduled_evaluations.create(
    project_id="project-id",
    agent_id="agent-id",
    dataset_id="dataset-id",
    name="Weekly regression check",
    **weekly_timing,
)
print(f"Scheduled evaluation created: {schedule.id}")
| frequency | Description | Required extra params |
| --- | --- | --- |
| "daily" | Runs every day at the specified time | time |
| "weekly" | Runs once a week | time, day_of_week (1–7) |
| "monthly" | Runs once a month | time, day_of_month (1–28) |
# Arguments shared by both schedules below.
schedule_target = {
    "project_id": "project-id",
    "agent_id": "agent-id",
    "dataset_id": "dataset-id",
}

# Daily at 06:00 UTC
hub.scheduled_evaluations.create(
    **schedule_target,
    name="Daily smoke test",
    frequency="daily",
    time="06:00",
)

# Monthly on the 1st at 08:00 UTC
hub.scheduled_evaluations.create(
    **schedule_target,
    name="Monthly full regression",
    frequency="monthly",
    time="08:00",
    day_of_month=1,
)
# List the scheduled evaluations of a project.
schedules = hub.scheduled_evaluations.list(project_id="project-id")
for s in schedules:
    # A separator is restored between name and frequency; the original
    # f-string printed them fused together.
    print(f"{s.name} — {s.frequency} — last execution: {s.last_execution_at}")

# Retrieve one schedule, asking for its evaluations to be included.
scheduled_evaluation = hub.scheduled_evaluations.retrieve(
    "scheduled-evaluation-id",
    include=["evaluations"],
)
print(f"Schedule: {scheduled_evaluation.name}")
for evaluation in scheduled_evaluation.evaluations:
    print(f"  Run {evaluation.id}: {evaluation.status.state} at {evaluation.created_at}")

# List the evaluation runs of a schedule directly.
evaluation_runs = hub.scheduled_evaluations.list_evaluations(
    "scheduled-evaluation-id",
)
for run in evaluation_runs:
    # Separators restored here as well so the fields stay readable.
    print(f"Run: {run.id} — {run.status.state} — {run.created_at}")
schedule_id = "scheduled-evaluation-id"

# Change the name and timing of the schedule.
hub.scheduled_evaluations.update(
    schedule_id,
    name="Updated schedule name",
    frequency="daily",
    time="07:30",
)

# Delete the schedule.
hub.scheduled_evaluations.delete(schedule_id)