Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions eval/verifier-corpus/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,61 @@ uv run security-scanner evaluate \

The `private/` output path is gitignored. Do not commit generated verifier
artifacts.

## Harness corpus (`harness/`) — infra-free accuracy measurement

The 2-case checkout above needs a live Ollama model. The `harness/` subtree is a
**finding-level** corpus (N≥20, all six path-roles × TP/FP) that measures verifier
verdict quality **without** a model or network, so prompt/anchor/threshold changes
get reproducible before/after numbers in plain `uv run pytest`.

Single source of truth: `src/security_scanner/core/evaluation/verifier_corpus.py`
(`CORPUS_CASES`). It deterministically generates three committed artifacts:

- `harness/candidates.jsonl` — the scanner "before" set.
- `harness/expected.json` — ground truth (`load_evaluation_corpus` schema).
- `harness/recorded-ideal.json` — perfect-model recorded baseline (drop-in slot).

Regenerate after editing `CORPUS_CASES` (a freshness test fails if stale):

```bash
uv run python -m security_scanner.core.evaluation.verifier_corpus
```

The harness itself (`src/security_scanner/core/evaluation/verifier_harness.py`)
exposes `HeuristicVerifierStrategy` (deterministic, driven by path-role anchors)
and `RecordedVerifierStrategy` (replays recorded model JSON); both feed the
existing `evaluate_verifier_delta` metrics. The test gates live in
`tests/test_verifier_harness.py`, `tests/test_path_role_anchors.py`, and
`tests/test_verifier_confidence_sweep.py`.

## Live model run (box-return, single command)

Real end-to-end accuracy needs the Tailscale Ubuntu box (Ollama + a loaded model).
No new code is required — run the `verify` step and then the `evaluate` step shown
below. The verified output (written to `private/harness-verified.jsonl`) replaces
`harness/recorded-ideal.json` (or stays under `private/`) and refreshes the
recorded baseline:

```bash
SECURITY_SCANNER_OLLAMA_HOST=http://<box>:11434 \
SECURITY_SCANNER_OLLAMA_MODEL=<model> \
uv run security-scanner verify \
--findings eval/verifier-corpus/harness/candidates.jsonl \
--output private/harness-verified.jsonl

uv run security-scanner evaluate \
--expected eval/verifier-corpus/harness/expected.json \
--findings eval/verifier-corpus/harness/candidates.jsonl \
--after-findings private/harness-verified.jsonl \
--precision-min 0.5
```

Path-role anchors are tunable per organisation via the verifier YAML, e.g.:

```yaml
verification:
  path_role_anchors:
    documentation: { label: true_positive, confidence: 0.9, reason: org policy }
```

Roles you do not specify keep their defaults (partial merge). With the feature
left off, which is the default, behaviour is byte-identical to before this corpus
existed.
22 changes: 22 additions & 0 deletions eval/verifier-corpus/harness/candidates.jsonl

Large diffs are not rendered by default.

155 changes: 155 additions & 0 deletions eval/verifier-corpus/harness/expected.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
{
"description": "Public-safe finding-level verifier corpus: 6 path-roles x TP/FP for infra-free before/after accuracy measurement.",
"expectedFindings": [
{
"filePath": "config/positive.env",
"lineStart": 4,
"repoFullName": "synthetic-org/verifier-harness",
"ruleId": "synthetic-fake-token"
},
{
"filePath": "config/database.env",
"lineStart": 3,
"repoFullName": "synthetic-org/verifier-harness",
"ruleId": "synthetic-fake-token"
},
{
"filePath": "config/settings.toml",
"lineStart": 10,
"repoFullName": "synthetic-org/verifier-harness",
"ruleId": "synthetic-api-key"
},
{
"filePath": "settings/prod.yaml",
"lineStart": 5,
"repoFullName": "synthetic-org/verifier-harness",
"ruleId": "synthetic-fake-token"
},
{
"filePath": "src/app/secrets.py",
"lineStart": 42,
"repoFullName": "synthetic-org/verifier-harness",
"ruleId": "synthetic-fake-token"
},
{
"filePath": "internal/auth.go",
"lineStart": 18,
"repoFullName": "synthetic-org/verifier-harness",
"ruleId": "synthetic-api-key"
},
{
"filePath": "lib/client.rb",
"lineStart": 7,
"repoFullName": "synthetic-org/verifier-harness",
"ruleId": "synthetic-fake-token"
},
{
"filePath": "services/payment.ts",
"lineStart": 25,
"repoFullName": "synthetic-org/verifier-harness",
"ruleId": "synthetic-fake-token"
}
],
"knownNegatives": [
{
"filePath": "docs/example.md",
"lineStart": 5,
"reason": "Documentation shows a synthetic token-shaped example.",
"repoFullName": "synthetic-org/verifier-harness",
"ruleId": "synthetic-fake-token"
},
{
"filePath": "docs/setup.md",
"lineStart": 12,
"reason": "Setup guide illustrates a fake token.",
"repoFullName": "synthetic-org/verifier-harness",
"ruleId": "synthetic-fake-token"
},
{
"filePath": "README.rst",
"lineStart": 4,
"reason": "README sample value, not a real credential.",
"repoFullName": "synthetic-org/verifier-harness",
"ruleId": "synthetic-fake-token"
},
{
"filePath": "docs/api/reference.txt",
"lineStart": 9,
"reason": "API reference text fixture.",
"repoFullName": "synthetic-org/verifier-harness",
"ruleId": "synthetic-api-key"
},
{
"filePath": "examples/quickstart.py",
"lineStart": 6,
"reason": "Quickstart example placeholder.",
"repoFullName": "synthetic-org/verifier-harness",
"ruleId": "synthetic-fake-token"
},
{
"filePath": "samples/config.env",
"lineStart": 3,
"reason": "Sample config placeholder.",
"repoFullName": "synthetic-org/verifier-harness",
"ruleId": "synthetic-fake-token"
},
{
"filePath": "test/fixtures/creds.json",
"lineStart": 2,
"reason": "Test fixture credentials (synthetic).",
"repoFullName": "synthetic-org/verifier-harness",
"ruleId": "synthetic-api-key"
},
{
"filePath": "tests/test_login.py",
"lineStart": 30,
"reason": "Test asserts against a fake token.",
"repoFullName": "synthetic-org/verifier-harness",
"ruleId": "synthetic-fake-token"
},
{
"filePath": "config/legacy.env",
"lineStart": 8,
"reason": "Legacy placeholder token left in config; not live.",
"repoFullName": "synthetic-org/verifier-harness",
"ruleId": "synthetic-fake-token"
},
{
"filePath": "config/template.toml",
"lineStart": 6,
"reason": "Template token placeholder.",
"repoFullName": "synthetic-org/verifier-harness",
"ruleId": "synthetic-api-key"
},
{
"filePath": "src/utils/format.py",
"lineStart": 15,
"reason": "Public sample constant shaped like a token.",
"repoFullName": "synthetic-org/verifier-harness",
"ruleId": "synthetic-fake-token"
},
{
"filePath": "internal/consts.go",
"lineStart": 3,
"reason": "Documented constant, not a credential.",
"repoFullName": "synthetic-org/verifier-harness",
"ruleId": "synthetic-fake-token"
},
{
"filePath": "data/blob.bin",
"lineStart": 1,
"reason": "Opaque blob, role not specific.",
"repoFullName": "synthetic-org/verifier-harness",
"ruleId": "synthetic-fake-token"
},
{
"filePath": "Makefile",
"lineStart": 9,
"reason": "Build file reference, role not specific.",
"repoFullName": "synthetic-org/verifier-harness",
"ruleId": "synthetic-fake-token"
}
],
"name": "synthetic-verifier-harness-v1",
"schemaVersion": 1
}
115 changes: 115 additions & 0 deletions eval/verifier-corpus/harness/recorded-ideal.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
{
"name": "ideal-perfect-model-v1",
"responses": {
"finding_02cc46475bfdd191": {
"confidence": 0.95,
"label": "false_positive",
"reason": "API reference text fixture."
},
"finding_223611539975a65b": {
"confidence": 0.95,
"label": "false_positive",
"reason": "Opaque blob, role not specific."
},
"finding_24e40c8b470bd6a3": {
"confidence": 0.95,
"label": "true_positive",
"reason": "Synthetic TP case."
},
"finding_2ee79410cd93d57c": {
"confidence": 0.95,
"label": "true_positive",
"reason": "Synthetic TP case."
},
"finding_34de71c589210c56": {
"confidence": 0.95,
"label": "false_positive",
"reason": "Build file reference, role not specific."
},
"finding_3fae230b287cbfb8": {
"confidence": 0.95,
"label": "false_positive",
"reason": "Documentation shows a synthetic token-shaped example."
},
"finding_40ee8887f222fbd0": {
"confidence": 0.95,
"label": "false_positive",
"reason": "Public sample constant shaped like a token."
},
"finding_5edd52c83c9d4c88": {
"confidence": 0.95,
"label": "false_positive",
"reason": "Template token placeholder."
},
"finding_5fa7b574d009b3ed": {
"confidence": 0.95,
"label": "false_positive",
"reason": "Sample config placeholder."
},
"finding_629466f031c50d61": {
"confidence": 0.95,
"label": "false_positive",
"reason": "README sample value, not a real credential."
},
"finding_9a09f843316d378e": {
"confidence": 0.95,
"label": "false_positive",
"reason": "Legacy placeholder token left in config; not live."
},
"finding_a46388d7dd53fd56": {
"confidence": 0.95,
"label": "false_positive",
"reason": "Quickstart example placeholder."
},
"finding_a678f78485e50ab0": {
"confidence": 0.95,
"label": "true_positive",
"reason": "Synthetic TP case."
},
"finding_aa1515e56f8c78a2": {
"confidence": 0.95,
"label": "true_positive",
"reason": "Synthetic TP case."
},
"finding_aae8718fae0433dd": {
"confidence": 0.95,
"label": "true_positive",
"reason": "Synthetic TP case."
},
"finding_cdb1a4dec3338070": {
"confidence": 0.95,
"label": "false_positive",
"reason": "Test asserts against a fake token."
},
"finding_e634b9be1f9231d8": {
"confidence": 0.95,
"label": "false_positive",
"reason": "Documented constant, not a credential."
},
"finding_ea73afe698f3d962": {
"confidence": 0.95,
"label": "false_positive",
"reason": "Test fixture credentials (synthetic)."
},
"finding_eee7d32f875d1e7e": {
"confidence": 0.95,
"label": "true_positive",
"reason": "Synthetic TP case."
},
"finding_f5629a9839992c40": {
"confidence": 0.95,
"label": "false_positive",
"reason": "Setup guide illustrates a fake token."
},
"finding_ff80007ea24ebd40": {
"confidence": 0.95,
"label": "true_positive",
"reason": "Synthetic TP case."
},
"finding_ffd80950f49a3db2": {
"confidence": 0.95,
"label": "true_positive",
"reason": "Synthetic TP case."
}
}
}
20 changes: 20 additions & 0 deletions src/security_scanner/core/evaluation/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,18 @@
render_evaluation_report,
render_verifier_delta_report,
)
from security_scanner.core.evaluation.verifier_corpus import (
CORPUS_CASES,
build_corpus_candidates,
build_evaluation_corpus,
build_ideal_responses,
)
from security_scanner.core.evaluation.verifier_harness import (
HeuristicVerifierStrategy,
RecordedVerifierStrategy,
run_corpus_delta,
verify_candidates,
)

__all__ = [
"EvaluationCorpus",
Expand All @@ -34,4 +46,12 @@
"load_evaluation_corpus",
"render_evaluation_report",
"render_verifier_delta_report",
"CORPUS_CASES",
"build_corpus_candidates",
"build_evaluation_corpus",
"build_ideal_responses",
"HeuristicVerifierStrategy",
"RecordedVerifierStrategy",
"run_corpus_delta",
"verify_candidates",
]
Loading
Loading