This would not replace the current eval commands.
It would define what a “complete eval cycle” means.
skills/google-agents-cli-eval/
├── SKILL.md
└── aisp/
    └── google_agents_cli_eval_quality_contract_aisp/
        ├── aisp.aisop.json
        ├── README.md
        └── evals/
            └── eval-quality-traces/
[
{
"role": "system",
"content": {
"protocol": "AISP V1.0.0",
"axiom_0": "Human_Sovereignty_and_Wellbeing",
"id": "google_agents_cli_eval_quality_contract_aisp",
"name": "Google Agents CLI Eval Quality Contract",
"version": "1.0.0",
"license": "Apache-2.0",
"summary": "A machine-checkable evaluation quality contract for agents-cli eval generate, grade, analyze, compare, and optimize workflows.",
"description": "Defines dataset readiness, trace generation, grading, metric thresholds, failure clustering, optimization comparison, and deploy-readiness evidence for ADK agent evaluation.",
"flow_format": "mermaid",
"loading_mode": "node",
"tools": [
"filesystem",
"shell"
],
"params": {
"project_root": "string",
"eval_dataset_path": "string",
"eval_config_path": "string?",
"baseline_results_path": "string?"
},
"system_prompt": ""
}
},
{
"role": "user",
"content": {
"instruction": "STRICTLY OBEY aisp_contract; its non_negotiable rules are inviolable; then RUN aisop.main",
"user_input": "{user_input}",
"aisp_contract": {
"profile": "aisp.skill.v1",
"invocation": {
"mode": "eval_quality_gate",
"when_to_use": [
"before running agents-cli eval generate",
"before running agents-cli eval grade",
"before using eval results as deployment evidence",
"before running eval optimize",
"when comparing baseline and optimized agent behavior"
],
"when_not_to_use": [
"unit tests that only validate deterministic code",
"manual exploratory chat with no eval dataset",
"local smoke test not intended as deployment evidence"
]
},
"non_negotiable": [
{
"rule": "Do not treat pytest output as a substitute for agent behavior eval.",
"enforced_by": "classify_eval.step2:sys.assert"
},
{
"rule": "Eval dataset must include expected behavior, allowed tools, failure conditions, or rubric fields.",
"enforced_by": "dataset_gate.step3:sys.assert"
},
{
"rule": "Eval generate must produce traces before eval grade runs.",
"enforced_by": "generate.step3:sys.assert"
},
{
"rule": "Eval grade must record metrics, failures, and grading configuration.",
"enforced_by": "grade.step3:sys.assert"
},
{
"rule": "Eval analyze must cluster or summarize failure modes before optimization is accepted.",
"enforced_by": "analyze.step2:sys.assert"
},
{
"rule": "Eval optimize must compare before/after results before claiming improvement.",
"enforced_by": "compare.step2:sys.assert"
},
{
"rule": "Deploy readiness must not be based on eval status without dataset id, trace id, grade result, and threshold status.",
"enforced_by": "readiness.step3:sys.assert"
}
],
"discovery": {
"category": "evaluation",
"tags": [
"agents-cli",
"eval",
"dataset",
"grade",
"failure-analysis",
"optimization",
"deploy-readiness",
"aisp"
]
},
"risk_level": "medium",
"resources": [
{
"id": "eval_dataset",
"path": "{eval_dataset_path}",
"kind": "eval_dataset",
"mode": "read_only",
"when": "Read before eval generate.",
"scope": "skill"
},
{
"id": "eval_config",
"path": "{eval_config_path}",
"kind": "config",
"mode": "read_only",
"when": "Read before eval grade.",
"scope": "skill"
},
{
"id": "baseline_results",
"path": "{baseline_results_path}",
"kind": "eval_results",
"mode": "read_only",
"when": "Read before eval compare or optimize.",
"scope": "skill"
}
]
},
"aisop": {
"main": "graph TD\n classify_eval[Classify eval purpose] --> dataset_gate[Validate dataset]\n dataset_gate --> generate[Run eval generate]\n generate --> grade[Run eval grade]\n grade --> analyze[Analyze failures]\n analyze --> optimize[Optimize if requested]\n optimize --> compare[Compare before after]\n compare --> readiness[Emit deploy readiness evidence]\n readiness --> trace[Write eval quality trace]\n trace --> end_node((End))"
},
"functions": {
"classify_eval": {
"step1": "Classify whether this is behavior eval, deterministic unit test, smoke test, regression eval, deployment evidence, or optimization eval.",
"step2": "sys.assert('behavior eval is distinct from pytest/unit tests', 'pytest is not a substitute for agent behavior eval')",
"output_mapping": "eval_purpose"
},
"dataset_gate": {
"step1": "Read eval_dataset_path.",
"step2": "Validate dataset has cases with prompts, expected behavior, rubric, allowed tool behavior, or failure criteria.",
"step3": "sys.assert('eval dataset has required behavioral criteria', 'Eval dataset missing behavior/rubric fields')",
"output_mapping": "dataset_validation"
},
"generate": {
"step1": "Run agents-cli eval generate or equivalent.",
"step2": "Record trace output path and case count.",
"step3": "sys.assert('eval traces generated before grading', 'Eval grade requires generated traces')",
"output_mapping": "eval_generate_status"
},
"grade": {
"step1": "Run agents-cli eval grade with eval_config_path when provided.",
"step2": "Record metrics, score, failures, judge config, and thresholds.",
"step3": "sys.assert('eval grade result includes metrics and config', 'Eval grade result incomplete')",
"output_mapping": "eval_grade_status"
},
"analyze": {
"step1": "Cluster or summarize failure modes from eval_grade_status.",
"step2": "sys.assert('failure modes analyzed before optimization acceptance', 'Eval failures must be analyzed')",
"output_mapping": "failure_analysis"
},
"optimize": {
"step1": "If optimization requested, run prompt/agent optimization using eval data.",
"output_mapping": "optimization_status"
},
"compare": {
"step1": "Compare baseline and optimized eval results when optimization occurs.",
"step2": "sys.assert('before/after eval comparison exists for optimization claim', 'Optimization requires before/after comparison')",
"output_mapping": "eval_comparison"
},
"readiness": {
"step1": "Compute deploy readiness evidence from dataset_validation, eval_generate_status, eval_grade_status, failure_analysis, and eval_comparison.",
"step2": "Record dataset id, trace id, grade result, threshold status, and failure summary.",
"step3": "sys.assert('deploy readiness includes eval evidence fields', 'Deploy readiness evidence incomplete')",
"output_mapping": "eval_readiness_evidence"
},
"trace": {
"step1": "Write eval quality trace with dataset validation, trace paths, grade results, failure clusters, comparison, and readiness status.",
"output_mapping": "eval_quality_trace"
},
"end_node": {
"step1": "Return eval_quality_trace and eval_readiness_evidence."
}
}
}
}
]
This would make eval runs more useful as engineering evidence.
It would also help coding agents avoid the common mistake of saying “tests passed” when only deterministic unit tests ran and no agent behavior eval was performed.
This should start as a contract + report shape, not as a required new runtime.
What is your feature suggestion?
Add an optional machine-readable evaluation quality contract for agents-cli eval.
The eval commands are already one of the strongest parts of agents-cli:
I propose adding an optional AISP V1.0.0 companion package that describes the expected eval lifecycle and trace requirements.
AISP protocol reference:
https://github.com/AIXP-Labs/AISP
This would not replace the current eval commands.
It would define what a “complete eval cycle” means.
Possible layout:
A standard AISP V1.0.0 package could look like:
[ { "role": "system", "content": { "protocol": "AISP V1.0.0", "axiom_0": "Human_Sovereignty_and_Wellbeing", "id": "google_agents_cli_eval_quality_contract_aisp", "name": "Google Agents CLI Eval Quality Contract", "version": "1.0.0", "license": "Apache-2.0", "summary": "A machine-checkable evaluation quality contract for agents-cli eval generate, grade, analyze, compare, and optimize workflows.", "description": "Defines dataset readiness, trace generation, grading, metric thresholds, failure clustering, optimization comparison, and deploy-readiness evidence for ADK agent evaluation.", "flow_format": "mermaid", "loading_mode": "node", "tools": [ "filesystem", "shell" ], "params": { "project_root": "string", "eval_dataset_path": "string", "eval_config_path": "string?", "baseline_results_path": "string?" }, "system_prompt": "" } }, { "role": "user", "content": { "instruction": "STRICTLY OBEY aisp_contract; its non_negotiable rules are inviolable; then RUN aisop.main", "user_input": "{user_input}", "aisp_contract": { "profile": "aisp.skill.v1", "invocation": { "mode": "eval_quality_gate", "when_to_use": [ "before running agents-cli eval generate", "before running agents-cli eval grade", "before using eval results as deployment evidence", "before running eval optimize", "when comparing baseline and optimized agent behavior" ], "when_not_to_use": [ "unit tests that only validate deterministic code", "manual exploratory chat with no eval dataset", "local smoke test not intended as deployment evidence" ] }, "non_negotiable": [ { "rule": "Do not treat pytest output as a substitute for agent behavior eval.", "enforced_by": "classify_eval.step2:sys.assert" }, { "rule": "Eval dataset must include expected behavior, allowed tools, failure conditions, or rubric fields.", "enforced_by": "dataset_gate.step3:sys.assert" }, { "rule": "Eval generate must produce traces before eval grade runs.", "enforced_by": "generate.step3:sys.assert" },
{ "rule": "Eval grade must record metrics, failures, and grading configuration.", "enforced_by": "grade.step3:sys.assert" }, { "rule": "Eval analyze must cluster or summarize failure modes before optimization is accepted.", "enforced_by": "analyze.step2:sys.assert" }, { "rule": "Eval optimize must compare before/after results before claiming improvement.", "enforced_by": "compare.step2:sys.assert" }, { "rule": "Deploy readiness must not be based on eval status without dataset id, trace id, grade result, and threshold status.", "enforced_by": "readiness.step3:sys.assert" } ], "discovery": { "category": "evaluation", "tags": [ "agents-cli", "eval", "dataset", "grade", "failure-analysis", "optimization", "deploy-readiness", "aisp" ] }, "risk_level": "medium", "resources": [ { "id": "eval_dataset", "path": "{eval_dataset_path}", "kind": "eval_dataset", "mode": "read_only", "when": "Read before eval generate.", "scope": "skill" }, { "id": "eval_config", "path": "{eval_config_path}", "kind": "config", "mode": "read_only", "when": "Read before eval grade.", "scope": "skill" }, { "id": "baseline_results", "path": "{baseline_results_path}", "kind": "eval_results", "mode": "read_only", "when": "Read before eval compare or optimize.", "scope": "skill" } ] }, "aisop": { "main": "graph TD\n classify_eval[Classify eval purpose] --> dataset_gate[Validate dataset]\n dataset_gate --> generate[Run eval generate]\n generate --> grade[Run eval grade]\n grade --> analyze[Analyze failures]\n analyze --> optimize[Optimize if requested]\n optimize --> compare[Compare before after]\n compare --> readiness[Emit deploy readiness evidence]\n readiness --> trace[Write eval quality trace]\n trace --> end_node((End))" }, "functions": { "classify_eval": { "step1": "Classify whether this is behavior eval, deterministic unit test, smoke test, regression eval, deployment evidence, or optimization eval.", "step2": "sys.assert('behavior eval is distinct from pytest/unit tests', 'pytest is not a substitute for agent behavior eval')", 
"output_mapping": "eval_purpose" }, "dataset_gate": { "step1": "Read eval_dataset_path.", "step2": "Validate dataset has cases with prompts, expected behavior, rubric, allowed tool behavior, or failure criteria.", "step3": "sys.assert('eval dataset has required behavioral criteria', 'Eval dataset missing behavior/rubric fields')", "output_mapping": "dataset_validation" }, "generate": { "step1": "Run agents-cli eval generate or equivalent.", "step2": "Record trace output path and case count.", "step3": "sys.assert('eval traces generated before grading', 'Eval grade requires generated traces')", "output_mapping": "eval_generate_status" }, "grade": { "step1": "Run agents-cli eval grade with eval_config_path when provided.", "step2": "Record metrics, score, failures, judge config, and thresholds.", "step3": "sys.assert('eval grade result includes metrics and config', 'Eval grade result incomplete')", "output_mapping": "eval_grade_status" }, "analyze": { "step1": "Cluster or summarize failure modes from eval_grade_status.", "step2": "sys.assert('failure modes analyzed before optimization acceptance', 'Eval failures must be analyzed')", "output_mapping": "failure_analysis" }, "optimize": { "step1": "If optimization requested, run prompt/agent optimization using eval data.", "output_mapping": "optimization_status" }, "compare": { "step1": "Compare baseline and optimized eval results when optimization occurs.", "step2": "sys.assert('before/after eval comparison exists for optimization claim', 'Optimization requires before/after comparison')", "output_mapping": "eval_comparison" }, "readiness": { "step1": "Compute deploy readiness evidence from dataset_validation, eval_generate_status, eval_grade_status, failure_analysis, and eval_comparison.", "step2": "Record dataset id, trace id, grade result, threshold status, and failure summary.", "step3": "sys.assert('deploy readiness includes eval evidence fields', 'Deploy readiness evidence incomplete')", "output_mapping": 
"eval_readiness_evidence" }, "trace": { "step1": "Write eval quality trace with dataset validation, trace paths, grade results, failure clusters, comparison, and readiness status.", "output_mapping": "eval_quality_trace" }, "end_node": { "step1": "Return eval_quality_trace and eval_readiness_evidence." } } } } ]
What will this enable you to do?
This would make eval runs more useful as engineering evidence.
It would enable:
It would also help coding agents avoid the common mistake of saying “tests passed” when only deterministic unit tests ran and no agent behavior eval was performed.
Additional context
This should start as a contract + report shape, not as a required new runtime.
Non-goals:
MVP:
eval_readiness_evidence that deploy or CI can read later.