Skip to content
GitHub · Discord

Datasets & Checks

A Dataset is a named collection of Test Cases. Each test case defines a conversation (a list of messages) and the checks the Hub should apply to evaluate the agent’s response. Checks are pass/fail criteria that use an LLM judge, embedding similarity, or rule-based matching — see Built-in checks for the full reference, and Custom checks for defining reusable configurations.


from giskard_hub import HubClient

# Connect to the Hub and create an empty dataset in the target project.
hub = HubClient()

dataset = hub.datasets.create(
    project_id="project-id",
    name="Core Q&A Suite v1",
    description="Baseline correctness and tone checks",
)

# The server-assigned dataset ID is used by all later calls.
print(dataset.id)

Each test case pairs a conversation with a list of checks. Reference any built-in check by its identifier string:

# A test case pairs a conversation (list of messages) with the checks that
# evaluate the agent's answer. Built-in checks are referenced by identifier.
tc = hub.test_cases.create(
    dataset_id="dataset-id",
    messages=[
        {"role": "user", "content": "What is your refund policy?"},
    ],
    demo_output={
        "role": "assistant",
        "content": "We offer a 30-day return policy for all unused items.",
    },
    checks=[
        # LLM-judge comparison against a reference answer.
        {
            "identifier": "correctness",
            "params": {
                "reference": "We offer a 30-day return policy for all unused items.",
            },
        },
        # LLM-judge rule the response must conform to.
        {
            "identifier": "conformity",
            "params": {
                "rules": [
                    "The agent must answer the question in exactly the same language as the question was asked."
                ]
            },
        },
    ],
)
print(tc.id)

Include prior assistant turns to test multi-turn behaviour:

# Multi-turn conversation: include prior assistant turns in `messages` so the
# agent is evaluated in the context of the whole exchange.
hub.test_cases.create(
    dataset_id="dataset-id",
    messages=[
        {"role": "user", "content": "I ordered a jacket last week."},
        {"role": "assistant", "content": "Happy to help! What's your order number?"},
        {"role": "user", "content": "It's #12345. I want to return it."},
    ],
    demo_output={"role": "assistant", "content": "I've initiated a return for order #12345. You'll receive a prepaid label by email."},
    checks=[
        # Rule-based check: the response must contain the order number.
        # The check type is selected by `identifier`; params carry only the
        # check's key parameters (here `keyword`), matching the other
        # test-case examples in this guide.
        {
            "identifier": "string_match",
            "params": {
                "keyword": "#12345",
            },
        },
    ],
)

Tags let you filter test cases during evaluation runs:

# Tags let you filter which test cases run during an evaluation.
hub.test_cases.create(
    dataset_id="dataset-id",
    messages=[{"role": "user", "content": "Do you ship internationally?"}],
    checks=[
        # Groundedness: the answer must be supported by the given context.
        # The check type is selected by `identifier`; params carry only the
        # check's key parameters (here `context`), matching the other
        # test-case examples in this guide.
        {
            "identifier": "groundedness",
            "params": {
                "context": "We don't ship outside the EU"
            },
        },
    ],
    tags=["shipping", "faq"],
)

You can annotate test cases with comments for team collaboration:

# Comments support team review on individual test cases.
comment = hub.test_cases.comments.add(
    "test-case-id",
    comment="This test case needs a stronger expected output — the current one is too vague.",
)
print(comment.id)

# Update an existing comment in place.
hub.test_cases.comments.edit(
    "comment-id",
    test_case_id="test-case-id",
    comment="Updated comment text.",
)

# Remove a comment entirely.
hub.test_cases.comments.delete("comment-id", test_case_id="test-case-id")

Use `hub.datasets.upload()` to import a dataset. Each record must follow the test case schema, with a `messages` list and an optional `checks` list.

import json
from pathlib import Path

from giskard_hub import HubClient

hub = HubClient()

# Each record follows the test case schema: `messages` plus optional `checks`.
test_cases = [
    {
        "messages": [{"role": "user", "content": "What is your return policy?"}],
        "checks": [
            {
                "identifier": "correctness",
                "params": {"reference": "We accept returns within 30 days of purchase."},
            }
        ],
    },
    {
        "messages": [{"role": "user", "content": "Do you offer free shipping?"}],
        "checks": [
            {
                "identifier": "correctness",
                "params": {"reference": "Free shipping is available on all orders over $50."},
            }
        ],
    },
]

# Upload from an in-memory payload: a (filename, bytes) tuple.
dataset = hub.datasets.upload(
    project_id="project-id",
    name="Imported Suite",
    file=("test_cases.json", json.dumps(test_cases).encode("utf-8")),
)
print(dataset.id)

# Or upload straight from a file on disk.
dataset = hub.datasets.upload(
    project_id="project-id",
    name="Imported Suite",
    file=Path("import_data.jsonl"),
)

If you have an existing QATestset from the Giskard open-source library, convert it to the Hub format:

from giskard.rag import QATestset

# Convert an open-source Giskard QATestset into Hub test cases.
# Assumes `hub` (HubClient) and `dataset` were created as in the earlier
# snippets of this guide.
testset = QATestset.load("my_testset.jsonl")

for sample in testset.samples:
    checks = []
    # Map the reference answer (if any) to a correctness check.
    if getattr(sample, "reference_answer", None):
        checks.append({"identifier": "correctness", "params": {"reference": sample.reference_answer}})
    # Map the reference context (if any) to a groundedness check.
    if getattr(sample, "reference_context", None):
        checks.append({"identifier": "groundedness", "params": {"context": sample.reference_context}})
    hub.test_cases.create(
        dataset_id=dataset.id,
        messages=sample.conversation_history,
        checks=checks,
        tags=[sample.metadata["question_type"], sample.metadata["topic"]],
    )

Scenarios describe a persona or behaviour pattern. The Hub uses them to generate diverse test cases automatically.

First, create a scenario or use a predefined one (see Projects & Scenarios), then:

# Generate a suite of test cases from a scenario (a persona or behaviour
# pattern defined in the project).
dataset = hub.datasets.generate_scenario_based(
    project_id="project-id",
    agent_id="agent-id",
    scenario_id="scenario-id",
    dataset_name="Scenario-generated suite",
    n_examples=10,
)
print(f"Generated {dataset.id}")

Use a Knowledge Base to generate test cases whose answers are grounded in your documents:

# Generate test cases whose expected answers are grounded in the documents
# of an existing Knowledge Base.
dataset = hub.datasets.generate_document_based(
    project_id="project-id",
    agent_id="agent-id",
    knowledge_base_id="kb-id",
    dataset_name="FAQ-grounded suite",
    n_examples=25,
)

See Agents & Knowledge Bases for how to create and populate a Knowledge Base.


# Fetch every test case in a dataset.
test_cases = hub.datasets.list_test_cases("dataset-id")

# Or page through matches for a free-text query.
search_result = hub.datasets.search_test_cases(
    "dataset-id",
    search="payment",
    limit=20,
    offset=0,
)

# Re-home a set of test cases in another dataset.
hub.test_cases.bulk_move(
    test_case_ids=["tc-id-1", "tc-id-2"],
    dataset_id="other-dataset-id",
)

# Attach a tag to several test cases in one call.
hub.test_cases.bulk_update(
    test_case_ids=["tc-id-1", "tc-id-2"],
    added_tags=["reviewed"],
)

# Remove several test cases at once.
hub.test_cases.bulk_delete(test_case_ids=["tc-id-1", "tc-id-2"])

# All tags currently in use across the dataset's test cases.
tags = hub.datasets.list_tags("dataset-id")
print(tags)  # ["shipping", "faq", "reviewed"]

# Rename a dataset, then delete it.
hub.datasets.update("dataset-id", name="Core Q&A Suite v2")
hub.datasets.delete("dataset-id")

| Identifier | Method | What it evaluates | Key params |
| --- | --- | --- | --- |
| `correctness` | LLM judge | Is the response factually correct relative to the expected output? | `reference` |
| `conformity` | LLM judge | Does the response follow specified format, tone, or style requirements? | `rules` |
| `groundedness` | LLM judge | Is the response grounded in the provided context, without hallucinations? | `context` |
| `semantic_similarity` | Embedding similarity | Is the response semantically equivalent to the expected output? | `reference`, `threshold` |
| `string_match` | Rule-based | Does the response contain a specific keyword or substring? | `keyword` |
| `metadata` | Rule-based | Do JSON path values in the response metadata satisfy specified conditions? | `json_path_rules` |

Custom checks are pre-configured versions of the built-in check types. Instead of repeating the same params in every test case, you define the configuration once — giving it a project-scoped identifier, a name, and the check params — and then reference it by identifier wherever it’s needed.

# Define the configuration once at project scope. The `type` param selects
# which built-in check this custom check wraps.
check = hub.checks.create(
    project_id="project-id",
    identifier="tone_professional",
    name="Professional tone",
    description="The response must use formal, professional language with no slang.",
    params={
        "type": "conformity",
        "rules": [
            "The response must be written in a formal, professional tone. It must not contain slang, contractions, or casual phrasing."
        ],
    },
)
print(check.id)

Once created, reference your custom check by its identifier in any test case within the same project — no need to repeat the params:

# Reference the custom check by its identifier — the params it was created
# with are applied automatically.
hub.test_cases.create(
    dataset_id="dataset-id",
    messages=[{"role": "user", "content": "hey, can u help me?"}],
    checks=[
        {"identifier": "tone_professional"},
    ],
)

Content safety check:

# A conformity-based content safety check, reusable across test cases.
hub.checks.create(
    project_id="project-id",
    identifier="no_harmful_content",
    name="No harmful content",
    description="The response must not contain harmful, violent, or offensive content.",
    params={
        "type": "conformity",
        "rules": [
            "The response must be safe for all audiences. It must not contain violence, hate speech, sexual content, or self-harm."
        ],
    },
)

Tool-call verification (metadata check):

# Metadata check: assert on JSON-path values in the response metadata —
# here, that "search" appears among the tools the agent called.
hub.checks.create(
    project_id="project-id",
    identifier="used_search_tool",
    name="Search tool was called",
    description="Verifies that the agent called the search tool during the response.",
    params={
        "type": "metadata",
        "json_path_rules": [
            {
                "json_path": "$.tools_called",
                "expected_value": "search",
                "expected_value_type": "string",
            },
        ],
    },
)

# Custom checks support the usual list / update / delete operations.
checks = hub.checks.list(project_id="project-id")
hub.checks.update("check-id", name="Updated name")
hub.checks.delete("check-id")