From a7d673644ae385b9ef7e14b405d35e2cba59ad93 Mon Sep 17 00:00:00 2001 From: pureliture Date: Sat, 20 Jun 2026 00:42:32 +0900 Subject: [PATCH 1/2] feat(verifier): infra-free verdict-quality measurement substrate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 검증 알고리즘 고도화: live Ollama 없이 verdict 품질을 측정/회귀방지하는 결정론적 substrate. - core/evaluation/verifier_harness.py: Heuristic/Recorded 전략 + run_corpus_delta (metrics.py 재사용) - core/evaluation/verifier_corpus.py: 단일 SoT CORPUS_CASES(22건, 6 path-role) + 생성기 - eval/verifier-corpus/harness/: candidates.jsonl/expected.json/recorded-ideal.json (생성물, freshness 테스트로 stale 차단) - prompt.py: path-role 앵커 추출(DEFAULT_PATH_ROLE_ANCHORS) + 설정 가능 path_role_decision - verifier.py/client.py/verify_artifact.py: yaml verification.path_role_anchors 배선(default byte-identical, I3 보존) - tests: 정확도 게이트(FR3)/path-role 파라메트라이즈(FR4)/min_confidence sweep(FR6) heuristic baseline: before precision 0.3636 -> after 0.5714, FP_reduction 8, recall 1.0 보존. min_confidence 0.60 유지(플래토 <=0.80). 전체 uv run pytest 746 green. I1~I6 불변식 보존. 
Co-Authored-By: Claude Opus 4.8 --- eval/verifier-corpus/README.md | 58 +++ eval/verifier-corpus/harness/candidates.jsonl | 22 ++ eval/verifier-corpus/harness/expected.json | 155 ++++++++ .../harness/recorded-ideal.json | 115 ++++++ .../core/evaluation/__init__.py | 20 + .../core/evaluation/verifier_corpus.py | 352 ++++++++++++++++++ .../core/evaluation/verifier_harness.py | 106 ++++++ src/security_scanner/llm/common/prompt.py | 92 ++++- src/security_scanner/llm/common/verifier.py | 13 +- src/security_scanner/llm/ollama/client.py | 11 +- .../runtime/verify_artifact.py | 55 ++- tests/test_path_role_anchors.py | 179 +++++++++ tests/test_verifier_confidence_sweep.py | 62 +++ tests/test_verifier_harness.py | 145 ++++++++ 14 files changed, 1362 insertions(+), 23 deletions(-) create mode 100644 eval/verifier-corpus/harness/candidates.jsonl create mode 100644 eval/verifier-corpus/harness/expected.json create mode 100644 eval/verifier-corpus/harness/recorded-ideal.json create mode 100644 src/security_scanner/core/evaluation/verifier_corpus.py create mode 100644 src/security_scanner/core/evaluation/verifier_harness.py create mode 100644 tests/test_path_role_anchors.py create mode 100644 tests/test_verifier_confidence_sweep.py create mode 100644 tests/test_verifier_harness.py diff --git a/eval/verifier-corpus/README.md b/eval/verifier-corpus/README.md index 5a7a836..e0c2f41 100644 --- a/eval/verifier-corpus/README.md +++ b/eval/verifier-corpus/README.md @@ -26,3 +26,61 @@ uv run security-scanner evaluate \ The `private/` output path is gitignored. Do not commit generated verifier artifacts. + +## Harness corpus (`harness/`) — infra-free accuracy measurement + +The 2-case checkout above needs a live Ollama model. The `harness/` subtree is a +**finding-level** corpus (N≥20, all six path-roles × TP/FP) that measures verifier +verdict quality **without** a model or network, so prompt/anchor/threshold changes +get reproducible before/after numbers in plain `uv run pytest`. 
+ +Single source of truth: `src/security_scanner/core/evaluation/verifier_corpus.py` +(`CORPUS_CASES`). It deterministically generates three committed artifacts: + +- `harness/candidates.jsonl` — the scanner "before" set. +- `harness/expected.json` — ground truth (`load_evaluation_corpus` schema). +- `harness/recorded-ideal.json` — perfect-model recorded baseline (drop-in slot). + +Regenerate after editing `CORPUS_CASES` (a freshness test fails if stale): + +```bash +uv run python -m security_scanner.core.evaluation.verifier_corpus +``` + +The harness itself (`core/evaluation/verifier_harness.py`) exposes +`HeuristicVerifierStrategy` (path-role anchor, deterministic) and +`RecordedVerifierStrategy` (replays recorded model JSON), both feeding the +existing `evaluate_verifier_delta` metrics. Gates live in +`tests/test_verifier_harness.py`, `tests/test_path_role_anchors.py`, +`tests/test_verifier_confidence_sweep.py`. + +## Live model run (box-return, single command) + +Real end-to-end accuracy needs the Tailscale Ubuntu box (Ollama + a loaded model). +No new code is required — it is the one command below. 
Its output replaces +`harness/recorded-ideal.json` (or `private/`) and refreshes the recorded baseline: + +```bash +SECURITY_SCANNER_OLLAMA_HOST=http://<box-host>:11434 \ +SECURITY_SCANNER_OLLAMA_MODEL=<model-name> \ +uv run security-scanner verify \ + --findings eval/verifier-corpus/harness/candidates.jsonl \ + --output private/harness-verified.jsonl + +uv run security-scanner evaluate \ + --expected eval/verifier-corpus/harness/expected.json \ + --findings eval/verifier-corpus/harness/candidates.jsonl \ + --after-findings private/harness-verified.jsonl \ + --precision-min 0.5 +``` + +Path-role anchors are tunable per organisation via the verifier YAML, e.g.: + +```yaml +verification: + path_role_anchors: + documentation: { label: true_positive, confidence: 0.9, reason: org policy } +``` + +Unspecified roles keep their defaults (partial merge); the default-off behaviour +is byte-identical to before this corpus existed. diff --git a/eval/verifier-corpus/harness/candidates.jsonl b/eval/verifier-corpus/harness/candidates.jsonl new file mode 100644 index 0000000..74cce63 --- /dev/null +++ b/eval/verifier-corpus/harness/candidates.jsonl @@ -0,0 +1,22 @@ +{"category": "SECRET", "confidence": "MEDIUM", "evidence": {"contextArtifactRef": null, "redacted": true, "secretHash": "salted-sha256:4790348bcad6a6223dca2af688c82c3d8c2a430a94734b9cc2c1078e688b4822"}, "findingId": "finding_aae8718fae0433dd", "fingerprint": "[\"synthetic-org/verifier-harness\",\"config/positive.env\",4,\"synthetic-fake-token\"]", "gitleaks": null, "location": {"filePath": "config/positive.env", "lineEnd": null, "lineStart": 4}, "repo": {"branch": null, "commit": null, "fullName": "synthetic-org/verifier-harness"}, "ruleId": "synthetic-fake-token", "scan": {"rulePackVersion": "secret-rules-0.1.0", "scanRunId": "scan_harness"}, "severity": "HIGH", "sourceTool": "gitleaks", "sourceToolVersion": null, "status": "OPEN", "triage": {"reason": null, "verdict": "NEEDS_REVIEW", "verifier": null}} +{"category": "SECRET", 
"confidence": "MEDIUM", "evidence": {"contextArtifactRef": null, "redacted": true, "secretHash": "salted-sha256:36697503eb7b8e666cbf9826cedc2968069ddca3ddf0cb7f24f685af4dd999a7"}, "findingId": "finding_24e40c8b470bd6a3", "fingerprint": "[\"synthetic-org/verifier-harness\",\"config/database.env\",3,\"synthetic-fake-token\"]", "gitleaks": null, "location": {"filePath": "config/database.env", "lineEnd": null, "lineStart": 3}, "repo": {"branch": null, "commit": null, "fullName": "synthetic-org/verifier-harness"}, "ruleId": "synthetic-fake-token", "scan": {"rulePackVersion": "secret-rules-0.1.0", "scanRunId": "scan_harness"}, "severity": "HIGH", "sourceTool": "gitleaks", "sourceToolVersion": null, "status": "OPEN", "triage": {"reason": null, "verdict": "NEEDS_REVIEW", "verifier": null}} +{"category": "SECRET", "confidence": "MEDIUM", "evidence": {"contextArtifactRef": null, "redacted": true, "secretHash": "salted-sha256:563816a7ca1664428aea4b64630d829f1147ce6f888d123e69c1e412e75dec9e"}, "findingId": "finding_2ee79410cd93d57c", "fingerprint": "[\"synthetic-org/verifier-harness\",\"config/settings.toml\",10,\"synthetic-api-key\"]", "gitleaks": null, "location": {"filePath": "config/settings.toml", "lineEnd": null, "lineStart": 10}, "repo": {"branch": null, "commit": null, "fullName": "synthetic-org/verifier-harness"}, "ruleId": "synthetic-api-key", "scan": {"rulePackVersion": "secret-rules-0.1.0", "scanRunId": "scan_harness"}, "severity": "HIGH", "sourceTool": "gitleaks", "sourceToolVersion": null, "status": "OPEN", "triage": {"reason": null, "verdict": "NEEDS_REVIEW", "verifier": null}} +{"category": "SECRET", "confidence": "MEDIUM", "evidence": {"contextArtifactRef": null, "redacted": true, "secretHash": "salted-sha256:05a58bbd6d879652cdffa48cfdeff1b3ede451bd2dda187ed82d67b76ebb214a"}, "findingId": "finding_a678f78485e50ab0", "fingerprint": "[\"synthetic-org/verifier-harness\",\"settings/prod.yaml\",5,\"synthetic-fake-token\"]", "gitleaks": null, "location": 
{"filePath": "settings/prod.yaml", "lineEnd": null, "lineStart": 5}, "repo": {"branch": null, "commit": null, "fullName": "synthetic-org/verifier-harness"}, "ruleId": "synthetic-fake-token", "scan": {"rulePackVersion": "secret-rules-0.1.0", "scanRunId": "scan_harness"}, "severity": "HIGH", "sourceTool": "gitleaks", "sourceToolVersion": null, "status": "OPEN", "triage": {"reason": null, "verdict": "NEEDS_REVIEW", "verifier": null}} +{"category": "SECRET", "confidence": "MEDIUM", "evidence": {"contextArtifactRef": null, "redacted": true, "secretHash": "salted-sha256:200378dd0b07270c269d93eb6a6d4ce082b876a9a617d14c0d6bbbf5c8784009"}, "findingId": "finding_ff80007ea24ebd40", "fingerprint": "[\"synthetic-org/verifier-harness\",\"src/app/secrets.py\",42,\"synthetic-fake-token\"]", "gitleaks": null, "location": {"filePath": "src/app/secrets.py", "lineEnd": null, "lineStart": 42}, "repo": {"branch": null, "commit": null, "fullName": "synthetic-org/verifier-harness"}, "ruleId": "synthetic-fake-token", "scan": {"rulePackVersion": "secret-rules-0.1.0", "scanRunId": "scan_harness"}, "severity": "HIGH", "sourceTool": "gitleaks", "sourceToolVersion": null, "status": "OPEN", "triage": {"reason": null, "verdict": "NEEDS_REVIEW", "verifier": null}} +{"category": "SECRET", "confidence": "MEDIUM", "evidence": {"contextArtifactRef": null, "redacted": true, "secretHash": "salted-sha256:0428bb3795c96342d2e5094f8a685c7db4498570dd82f28fa6642414749ae902"}, "findingId": "finding_ffd80950f49a3db2", "fingerprint": "[\"synthetic-org/verifier-harness\",\"internal/auth.go\",18,\"synthetic-api-key\"]", "gitleaks": null, "location": {"filePath": "internal/auth.go", "lineEnd": null, "lineStart": 18}, "repo": {"branch": null, "commit": null, "fullName": "synthetic-org/verifier-harness"}, "ruleId": "synthetic-api-key", "scan": {"rulePackVersion": "secret-rules-0.1.0", "scanRunId": "scan_harness"}, "severity": "HIGH", "sourceTool": "gitleaks", "sourceToolVersion": null, "status": "OPEN", "triage": 
{"reason": null, "verdict": "NEEDS_REVIEW", "verifier": null}} +{"category": "SECRET", "confidence": "MEDIUM", "evidence": {"contextArtifactRef": null, "redacted": true, "secretHash": "salted-sha256:0497dd5b129740eea129100c4b10d9a474df27715c5f58f0d6c1fa58782efd34"}, "findingId": "finding_eee7d32f875d1e7e", "fingerprint": "[\"synthetic-org/verifier-harness\",\"lib/client.rb\",7,\"synthetic-fake-token\"]", "gitleaks": null, "location": {"filePath": "lib/client.rb", "lineEnd": null, "lineStart": 7}, "repo": {"branch": null, "commit": null, "fullName": "synthetic-org/verifier-harness"}, "ruleId": "synthetic-fake-token", "scan": {"rulePackVersion": "secret-rules-0.1.0", "scanRunId": "scan_harness"}, "severity": "HIGH", "sourceTool": "gitleaks", "sourceToolVersion": null, "status": "OPEN", "triage": {"reason": null, "verdict": "NEEDS_REVIEW", "verifier": null}} +{"category": "SECRET", "confidence": "MEDIUM", "evidence": {"contextArtifactRef": null, "redacted": true, "secretHash": "salted-sha256:5188f495c0e5f4127adfb06b8c1874cdc79c825fa359ba59592183fd35028a39"}, "findingId": "finding_aa1515e56f8c78a2", "fingerprint": "[\"synthetic-org/verifier-harness\",\"services/payment.ts\",25,\"synthetic-fake-token\"]", "gitleaks": null, "location": {"filePath": "services/payment.ts", "lineEnd": null, "lineStart": 25}, "repo": {"branch": null, "commit": null, "fullName": "synthetic-org/verifier-harness"}, "ruleId": "synthetic-fake-token", "scan": {"rulePackVersion": "secret-rules-0.1.0", "scanRunId": "scan_harness"}, "severity": "HIGH", "sourceTool": "gitleaks", "sourceToolVersion": null, "status": "OPEN", "triage": {"reason": null, "verdict": "NEEDS_REVIEW", "verifier": null}} +{"category": "SECRET", "confidence": "MEDIUM", "evidence": {"contextArtifactRef": null, "redacted": true, "secretHash": "salted-sha256:b201e8ee66351cf1d2865230fdb3d92e697b095da0f5f82f8da70ddfe2fc6c4c"}, "findingId": "finding_3fae230b287cbfb8", "fingerprint": 
"[\"synthetic-org/verifier-harness\",\"docs/example.md\",5,\"synthetic-fake-token\"]", "gitleaks": null, "location": {"filePath": "docs/example.md", "lineEnd": null, "lineStart": 5}, "repo": {"branch": null, "commit": null, "fullName": "synthetic-org/verifier-harness"}, "ruleId": "synthetic-fake-token", "scan": {"rulePackVersion": "secret-rules-0.1.0", "scanRunId": "scan_harness"}, "severity": "HIGH", "sourceTool": "gitleaks", "sourceToolVersion": null, "status": "OPEN", "triage": {"reason": null, "verdict": "NEEDS_REVIEW", "verifier": null}} +{"category": "SECRET", "confidence": "MEDIUM", "evidence": {"contextArtifactRef": null, "redacted": true, "secretHash": "salted-sha256:8632a394f8f7b33c2eaf08dafffbd2e2e643deb3a3110ab29e9a9ac4e6d5455d"}, "findingId": "finding_f5629a9839992c40", "fingerprint": "[\"synthetic-org/verifier-harness\",\"docs/setup.md\",12,\"synthetic-fake-token\"]", "gitleaks": null, "location": {"filePath": "docs/setup.md", "lineEnd": null, "lineStart": 12}, "repo": {"branch": null, "commit": null, "fullName": "synthetic-org/verifier-harness"}, "ruleId": "synthetic-fake-token", "scan": {"rulePackVersion": "secret-rules-0.1.0", "scanRunId": "scan_harness"}, "severity": "HIGH", "sourceTool": "gitleaks", "sourceToolVersion": null, "status": "OPEN", "triage": {"reason": null, "verdict": "NEEDS_REVIEW", "verifier": null}} +{"category": "SECRET", "confidence": "MEDIUM", "evidence": {"contextArtifactRef": null, "redacted": true, "secretHash": "salted-sha256:e20883ba227c9148bc1192464d81655f905ec019a68edd60beba403158fdda60"}, "findingId": "finding_629466f031c50d61", "fingerprint": "[\"synthetic-org/verifier-harness\",\"README.rst\",4,\"synthetic-fake-token\"]", "gitleaks": null, "location": {"filePath": "README.rst", "lineEnd": null, "lineStart": 4}, "repo": {"branch": null, "commit": null, "fullName": "synthetic-org/verifier-harness"}, "ruleId": "synthetic-fake-token", "scan": {"rulePackVersion": "secret-rules-0.1.0", "scanRunId": "scan_harness"}, 
"severity": "HIGH", "sourceTool": "gitleaks", "sourceToolVersion": null, "status": "OPEN", "triage": {"reason": null, "verdict": "NEEDS_REVIEW", "verifier": null}} +{"category": "SECRET", "confidence": "MEDIUM", "evidence": {"contextArtifactRef": null, "redacted": true, "secretHash": "salted-sha256:f22a423f1f94bc681bfcec5a6ceb0641c30818d884020cd09ba12c0885aa1302"}, "findingId": "finding_02cc46475bfdd191", "fingerprint": "[\"synthetic-org/verifier-harness\",\"docs/api/reference.txt\",9,\"synthetic-api-key\"]", "gitleaks": null, "location": {"filePath": "docs/api/reference.txt", "lineEnd": null, "lineStart": 9}, "repo": {"branch": null, "commit": null, "fullName": "synthetic-org/verifier-harness"}, "ruleId": "synthetic-api-key", "scan": {"rulePackVersion": "secret-rules-0.1.0", "scanRunId": "scan_harness"}, "severity": "HIGH", "sourceTool": "gitleaks", "sourceToolVersion": null, "status": "OPEN", "triage": {"reason": null, "verdict": "NEEDS_REVIEW", "verifier": null}} +{"category": "SECRET", "confidence": "MEDIUM", "evidence": {"contextArtifactRef": null, "redacted": true, "secretHash": "salted-sha256:ee4e3939d723306817c21510dec775623e8ff04beabc2b4b02df59050b3eda59"}, "findingId": "finding_a46388d7dd53fd56", "fingerprint": "[\"synthetic-org/verifier-harness\",\"examples/quickstart.py\",6,\"synthetic-fake-token\"]", "gitleaks": null, "location": {"filePath": "examples/quickstart.py", "lineEnd": null, "lineStart": 6}, "repo": {"branch": null, "commit": null, "fullName": "synthetic-org/verifier-harness"}, "ruleId": "synthetic-fake-token", "scan": {"rulePackVersion": "secret-rules-0.1.0", "scanRunId": "scan_harness"}, "severity": "HIGH", "sourceTool": "gitleaks", "sourceToolVersion": null, "status": "OPEN", "triage": {"reason": null, "verdict": "NEEDS_REVIEW", "verifier": null}} +{"category": "SECRET", "confidence": "MEDIUM", "evidence": {"contextArtifactRef": null, "redacted": true, "secretHash": 
"salted-sha256:6e131c45305ab502454b2d15d5b575ce9be937230284c98e09ff9ae63fe9143c"}, "findingId": "finding_5fa7b574d009b3ed", "fingerprint": "[\"synthetic-org/verifier-harness\",\"samples/config.env\",3,\"synthetic-fake-token\"]", "gitleaks": null, "location": {"filePath": "samples/config.env", "lineEnd": null, "lineStart": 3}, "repo": {"branch": null, "commit": null, "fullName": "synthetic-org/verifier-harness"}, "ruleId": "synthetic-fake-token", "scan": {"rulePackVersion": "secret-rules-0.1.0", "scanRunId": "scan_harness"}, "severity": "HIGH", "sourceTool": "gitleaks", "sourceToolVersion": null, "status": "OPEN", "triage": {"reason": null, "verdict": "NEEDS_REVIEW", "verifier": null}} +{"category": "SECRET", "confidence": "MEDIUM", "evidence": {"contextArtifactRef": null, "redacted": true, "secretHash": "salted-sha256:857364ec729f1044484d6008d5059cdda214e293e98b3ce2a7f7680b8fe6ba34"}, "findingId": "finding_ea73afe698f3d962", "fingerprint": "[\"synthetic-org/verifier-harness\",\"test/fixtures/creds.json\",2,\"synthetic-api-key\"]", "gitleaks": null, "location": {"filePath": "test/fixtures/creds.json", "lineEnd": null, "lineStart": 2}, "repo": {"branch": null, "commit": null, "fullName": "synthetic-org/verifier-harness"}, "ruleId": "synthetic-api-key", "scan": {"rulePackVersion": "secret-rules-0.1.0", "scanRunId": "scan_harness"}, "severity": "HIGH", "sourceTool": "gitleaks", "sourceToolVersion": null, "status": "OPEN", "triage": {"reason": null, "verdict": "NEEDS_REVIEW", "verifier": null}} +{"category": "SECRET", "confidence": "MEDIUM", "evidence": {"contextArtifactRef": null, "redacted": true, "secretHash": "salted-sha256:933a4f9eb729537d9494458accaf80c49ceb42ec54bc01fc026af28fa5d01d18"}, "findingId": "finding_cdb1a4dec3338070", "fingerprint": "[\"synthetic-org/verifier-harness\",\"tests/test_login.py\",30,\"synthetic-fake-token\"]", "gitleaks": null, "location": {"filePath": "tests/test_login.py", "lineEnd": null, "lineStart": 30}, "repo": {"branch": null, 
"commit": null, "fullName": "synthetic-org/verifier-harness"}, "ruleId": "synthetic-fake-token", "scan": {"rulePackVersion": "secret-rules-0.1.0", "scanRunId": "scan_harness"}, "severity": "HIGH", "sourceTool": "gitleaks", "sourceToolVersion": null, "status": "OPEN", "triage": {"reason": null, "verdict": "NEEDS_REVIEW", "verifier": null}} +{"category": "SECRET", "confidence": "MEDIUM", "evidence": {"contextArtifactRef": null, "redacted": true, "secretHash": "salted-sha256:8e4a339f33bc167a25bf9779f15d7a135d4024a8a80147675763295d87ba3bcb"}, "findingId": "finding_9a09f843316d378e", "fingerprint": "[\"synthetic-org/verifier-harness\",\"config/legacy.env\",8,\"synthetic-fake-token\"]", "gitleaks": null, "location": {"filePath": "config/legacy.env", "lineEnd": null, "lineStart": 8}, "repo": {"branch": null, "commit": null, "fullName": "synthetic-org/verifier-harness"}, "ruleId": "synthetic-fake-token", "scan": {"rulePackVersion": "secret-rules-0.1.0", "scanRunId": "scan_harness"}, "severity": "HIGH", "sourceTool": "gitleaks", "sourceToolVersion": null, "status": "OPEN", "triage": {"reason": null, "verdict": "NEEDS_REVIEW", "verifier": null}} +{"category": "SECRET", "confidence": "MEDIUM", "evidence": {"contextArtifactRef": null, "redacted": true, "secretHash": "salted-sha256:dc02430c55c4e47098c3e717b0fba3f9598616ebc2b94ff0ea6bdf17572626a7"}, "findingId": "finding_5edd52c83c9d4c88", "fingerprint": "[\"synthetic-org/verifier-harness\",\"config/template.toml\",6,\"synthetic-api-key\"]", "gitleaks": null, "location": {"filePath": "config/template.toml", "lineEnd": null, "lineStart": 6}, "repo": {"branch": null, "commit": null, "fullName": "synthetic-org/verifier-harness"}, "ruleId": "synthetic-api-key", "scan": {"rulePackVersion": "secret-rules-0.1.0", "scanRunId": "scan_harness"}, "severity": "HIGH", "sourceTool": "gitleaks", "sourceToolVersion": null, "status": "OPEN", "triage": {"reason": null, "verdict": "NEEDS_REVIEW", "verifier": null}} +{"category": "SECRET", 
"confidence": "MEDIUM", "evidence": {"contextArtifactRef": null, "redacted": true, "secretHash": "salted-sha256:15f3e7400e3a2aa1f4da8f9abf3fa23a2de27bf227c418104b08cf8fff11c239"}, "findingId": "finding_40ee8887f222fbd0", "fingerprint": "[\"synthetic-org/verifier-harness\",\"src/utils/format.py\",15,\"synthetic-fake-token\"]", "gitleaks": null, "location": {"filePath": "src/utils/format.py", "lineEnd": null, "lineStart": 15}, "repo": {"branch": null, "commit": null, "fullName": "synthetic-org/verifier-harness"}, "ruleId": "synthetic-fake-token", "scan": {"rulePackVersion": "secret-rules-0.1.0", "scanRunId": "scan_harness"}, "severity": "HIGH", "sourceTool": "gitleaks", "sourceToolVersion": null, "status": "OPEN", "triage": {"reason": null, "verdict": "NEEDS_REVIEW", "verifier": null}} +{"category": "SECRET", "confidence": "MEDIUM", "evidence": {"contextArtifactRef": null, "redacted": true, "secretHash": "salted-sha256:922626e7698512c84d1cd796836be9bdb55d019d374811af959b4b2e7676d88e"}, "findingId": "finding_e634b9be1f9231d8", "fingerprint": "[\"synthetic-org/verifier-harness\",\"internal/consts.go\",3,\"synthetic-fake-token\"]", "gitleaks": null, "location": {"filePath": "internal/consts.go", "lineEnd": null, "lineStart": 3}, "repo": {"branch": null, "commit": null, "fullName": "synthetic-org/verifier-harness"}, "ruleId": "synthetic-fake-token", "scan": {"rulePackVersion": "secret-rules-0.1.0", "scanRunId": "scan_harness"}, "severity": "HIGH", "sourceTool": "gitleaks", "sourceToolVersion": null, "status": "OPEN", "triage": {"reason": null, "verdict": "NEEDS_REVIEW", "verifier": null}} +{"category": "SECRET", "confidence": "MEDIUM", "evidence": {"contextArtifactRef": null, "redacted": true, "secretHash": "salted-sha256:ba447519f43eda1b96a5adc9c885539391eddfccc83c40f6adfacb69b287b78f"}, "findingId": "finding_223611539975a65b", "fingerprint": "[\"synthetic-org/verifier-harness\",\"data/blob.bin\",1,\"synthetic-fake-token\"]", "gitleaks": null, "location": {"filePath": 
"data/blob.bin", "lineEnd": null, "lineStart": 1}, "repo": {"branch": null, "commit": null, "fullName": "synthetic-org/verifier-harness"}, "ruleId": "synthetic-fake-token", "scan": {"rulePackVersion": "secret-rules-0.1.0", "scanRunId": "scan_harness"}, "severity": "HIGH", "sourceTool": "gitleaks", "sourceToolVersion": null, "status": "OPEN", "triage": {"reason": null, "verdict": "NEEDS_REVIEW", "verifier": null}} +{"category": "SECRET", "confidence": "MEDIUM", "evidence": {"contextArtifactRef": null, "redacted": true, "secretHash": "salted-sha256:f67156ede8c78a437d075be8251a6b1146d095262a67ab9b14541866abe31515"}, "findingId": "finding_34de71c589210c56", "fingerprint": "[\"synthetic-org/verifier-harness\",\"Makefile\",9,\"synthetic-fake-token\"]", "gitleaks": null, "location": {"filePath": "Makefile", "lineEnd": null, "lineStart": 9}, "repo": {"branch": null, "commit": null, "fullName": "synthetic-org/verifier-harness"}, "ruleId": "synthetic-fake-token", "scan": {"rulePackVersion": "secret-rules-0.1.0", "scanRunId": "scan_harness"}, "severity": "HIGH", "sourceTool": "gitleaks", "sourceToolVersion": null, "status": "OPEN", "triage": {"reason": null, "verdict": "NEEDS_REVIEW", "verifier": null}} diff --git a/eval/verifier-corpus/harness/expected.json b/eval/verifier-corpus/harness/expected.json new file mode 100644 index 0000000..5d7df94 --- /dev/null +++ b/eval/verifier-corpus/harness/expected.json @@ -0,0 +1,155 @@ +{ + "description": "Public-safe finding-level verifier corpus: 6 path-roles x TP/FP for infra-free before/after accuracy measurement.", + "expectedFindings": [ + { + "filePath": "config/positive.env", + "lineStart": 4, + "repoFullName": "synthetic-org/verifier-harness", + "ruleId": "synthetic-fake-token" + }, + { + "filePath": "config/database.env", + "lineStart": 3, + "repoFullName": "synthetic-org/verifier-harness", + "ruleId": "synthetic-fake-token" + }, + { + "filePath": "config/settings.toml", + "lineStart": 10, + "repoFullName": 
"synthetic-org/verifier-harness", + "ruleId": "synthetic-api-key" + }, + { + "filePath": "settings/prod.yaml", + "lineStart": 5, + "repoFullName": "synthetic-org/verifier-harness", + "ruleId": "synthetic-fake-token" + }, + { + "filePath": "src/app/secrets.py", + "lineStart": 42, + "repoFullName": "synthetic-org/verifier-harness", + "ruleId": "synthetic-fake-token" + }, + { + "filePath": "internal/auth.go", + "lineStart": 18, + "repoFullName": "synthetic-org/verifier-harness", + "ruleId": "synthetic-api-key" + }, + { + "filePath": "lib/client.rb", + "lineStart": 7, + "repoFullName": "synthetic-org/verifier-harness", + "ruleId": "synthetic-fake-token" + }, + { + "filePath": "services/payment.ts", + "lineStart": 25, + "repoFullName": "synthetic-org/verifier-harness", + "ruleId": "synthetic-fake-token" + } + ], + "knownNegatives": [ + { + "filePath": "docs/example.md", + "lineStart": 5, + "reason": "Documentation shows a synthetic token-shaped example.", + "repoFullName": "synthetic-org/verifier-harness", + "ruleId": "synthetic-fake-token" + }, + { + "filePath": "docs/setup.md", + "lineStart": 12, + "reason": "Setup guide illustrates a fake token.", + "repoFullName": "synthetic-org/verifier-harness", + "ruleId": "synthetic-fake-token" + }, + { + "filePath": "README.rst", + "lineStart": 4, + "reason": "README sample value, not a real credential.", + "repoFullName": "synthetic-org/verifier-harness", + "ruleId": "synthetic-fake-token" + }, + { + "filePath": "docs/api/reference.txt", + "lineStart": 9, + "reason": "API reference text fixture.", + "repoFullName": "synthetic-org/verifier-harness", + "ruleId": "synthetic-api-key" + }, + { + "filePath": "examples/quickstart.py", + "lineStart": 6, + "reason": "Quickstart example placeholder.", + "repoFullName": "synthetic-org/verifier-harness", + "ruleId": "synthetic-fake-token" + }, + { + "filePath": "samples/config.env", + "lineStart": 3, + "reason": "Sample config placeholder.", + "repoFullName": 
"synthetic-org/verifier-harness", + "ruleId": "synthetic-fake-token" + }, + { + "filePath": "test/fixtures/creds.json", + "lineStart": 2, + "reason": "Test fixture credentials (synthetic).", + "repoFullName": "synthetic-org/verifier-harness", + "ruleId": "synthetic-api-key" + }, + { + "filePath": "tests/test_login.py", + "lineStart": 30, + "reason": "Test asserts against a fake token.", + "repoFullName": "synthetic-org/verifier-harness", + "ruleId": "synthetic-fake-token" + }, + { + "filePath": "config/legacy.env", + "lineStart": 8, + "reason": "Legacy placeholder token left in config; not live.", + "repoFullName": "synthetic-org/verifier-harness", + "ruleId": "synthetic-fake-token" + }, + { + "filePath": "config/template.toml", + "lineStart": 6, + "reason": "Template token placeholder.", + "repoFullName": "synthetic-org/verifier-harness", + "ruleId": "synthetic-api-key" + }, + { + "filePath": "src/utils/format.py", + "lineStart": 15, + "reason": "Public sample constant shaped like a token.", + "repoFullName": "synthetic-org/verifier-harness", + "ruleId": "synthetic-fake-token" + }, + { + "filePath": "internal/consts.go", + "lineStart": 3, + "reason": "Documented constant, not a credential.", + "repoFullName": "synthetic-org/verifier-harness", + "ruleId": "synthetic-fake-token" + }, + { + "filePath": "data/blob.bin", + "lineStart": 1, + "reason": "Opaque blob, role not specific.", + "repoFullName": "synthetic-org/verifier-harness", + "ruleId": "synthetic-fake-token" + }, + { + "filePath": "Makefile", + "lineStart": 9, + "reason": "Build file reference, role not specific.", + "repoFullName": "synthetic-org/verifier-harness", + "ruleId": "synthetic-fake-token" + } + ], + "name": "synthetic-verifier-harness-v1", + "schemaVersion": 1 +} diff --git a/eval/verifier-corpus/harness/recorded-ideal.json b/eval/verifier-corpus/harness/recorded-ideal.json new file mode 100644 index 0000000..7552c3d --- /dev/null +++ b/eval/verifier-corpus/harness/recorded-ideal.json @@ -0,0 
+1,115 @@ +{ + "name": "ideal-perfect-model-v1", + "responses": { + "finding_02cc46475bfdd191": { + "confidence": 0.95, + "label": "false_positive", + "reason": "API reference text fixture." + }, + "finding_223611539975a65b": { + "confidence": 0.95, + "label": "false_positive", + "reason": "Opaque blob, role not specific." + }, + "finding_24e40c8b470bd6a3": { + "confidence": 0.95, + "label": "true_positive", + "reason": "Synthetic TP case." + }, + "finding_2ee79410cd93d57c": { + "confidence": 0.95, + "label": "true_positive", + "reason": "Synthetic TP case." + }, + "finding_34de71c589210c56": { + "confidence": 0.95, + "label": "false_positive", + "reason": "Build file reference, role not specific." + }, + "finding_3fae230b287cbfb8": { + "confidence": 0.95, + "label": "false_positive", + "reason": "Documentation shows a synthetic token-shaped example." + }, + "finding_40ee8887f222fbd0": { + "confidence": 0.95, + "label": "false_positive", + "reason": "Public sample constant shaped like a token." + }, + "finding_5edd52c83c9d4c88": { + "confidence": 0.95, + "label": "false_positive", + "reason": "Template token placeholder." + }, + "finding_5fa7b574d009b3ed": { + "confidence": 0.95, + "label": "false_positive", + "reason": "Sample config placeholder." + }, + "finding_629466f031c50d61": { + "confidence": 0.95, + "label": "false_positive", + "reason": "README sample value, not a real credential." + }, + "finding_9a09f843316d378e": { + "confidence": 0.95, + "label": "false_positive", + "reason": "Legacy placeholder token left in config; not live." + }, + "finding_a46388d7dd53fd56": { + "confidence": 0.95, + "label": "false_positive", + "reason": "Quickstart example placeholder." + }, + "finding_a678f78485e50ab0": { + "confidence": 0.95, + "label": "true_positive", + "reason": "Synthetic TP case." + }, + "finding_aa1515e56f8c78a2": { + "confidence": 0.95, + "label": "true_positive", + "reason": "Synthetic TP case." 
+ }, + "finding_aae8718fae0433dd": { + "confidence": 0.95, + "label": "true_positive", + "reason": "Synthetic TP case." + }, + "finding_cdb1a4dec3338070": { + "confidence": 0.95, + "label": "false_positive", + "reason": "Test asserts against a fake token." + }, + "finding_e634b9be1f9231d8": { + "confidence": 0.95, + "label": "false_positive", + "reason": "Documented constant, not a credential." + }, + "finding_ea73afe698f3d962": { + "confidence": 0.95, + "label": "false_positive", + "reason": "Test fixture credentials (synthetic)." + }, + "finding_eee7d32f875d1e7e": { + "confidence": 0.95, + "label": "true_positive", + "reason": "Synthetic TP case." + }, + "finding_f5629a9839992c40": { + "confidence": 0.95, + "label": "false_positive", + "reason": "Setup guide illustrates a fake token." + }, + "finding_ff80007ea24ebd40": { + "confidence": 0.95, + "label": "true_positive", + "reason": "Synthetic TP case." + }, + "finding_ffd80950f49a3db2": { + "confidence": 0.95, + "label": "true_positive", + "reason": "Synthetic TP case." 
+ } + } +} diff --git a/src/security_scanner/core/evaluation/__init__.py b/src/security_scanner/core/evaluation/__init__.py index 14fa951..b269d18 100644 --- a/src/security_scanner/core/evaluation/__init__.py +++ b/src/security_scanner/core/evaluation/__init__.py @@ -19,6 +19,18 @@ render_evaluation_report, render_verifier_delta_report, ) +from security_scanner.core.evaluation.verifier_corpus import ( + CORPUS_CASES, + build_corpus_candidates, + build_evaluation_corpus, + build_ideal_responses, +) +from security_scanner.core.evaluation.verifier_harness import ( + HeuristicVerifierStrategy, + RecordedVerifierStrategy, + run_corpus_delta, + verify_candidates, +) __all__ = [ "EvaluationCorpus", @@ -34,4 +46,12 @@ "load_evaluation_corpus", "render_evaluation_report", "render_verifier_delta_report", + "CORPUS_CASES", + "build_corpus_candidates", + "build_evaluation_corpus", + "build_ideal_responses", + "HeuristicVerifierStrategy", + "RecordedVerifierStrategy", + "run_corpus_delta", + "verify_candidates", ] diff --git a/src/security_scanner/core/evaluation/verifier_corpus.py b/src/security_scanner/core/evaluation/verifier_corpus.py new file mode 100644 index 0000000..e58f083 --- /dev/null +++ b/src/security_scanner/core/evaluation/verifier_corpus.py @@ -0,0 +1,352 @@ +"""Synthetic, finding-level verifier corpus (FR2). + +A single source of truth (:data:`CORPUS_CASES`) deterministically derives three +committed artifacts under ``eval/verifier-corpus/harness/``: + +- ``candidates.jsonl`` — the scanner "before" set (every case as a Finding). +- ``expected.json`` — ground truth in the ``load_evaluation_corpus`` schema. +- ``recorded-ideal.json`` — a perfect-model recorded baseline (drop-in for the + recorded strategy; replaced by real model output on box return, FR8/S2). + +All values are synthetic and public-safe (I3/I6): no raw secret, repo, or path +that maps to a real credential. 
The "secrets" are ``SCANNER_FAKE_SECRET_TOKEN_*`` +markers that only the synthetic gitleaks rule recognises. + +This corpus is finding-level and decoupled from gitleaks; it does NOT replace +the gitleaks-scannable 2-case checkout used by the live 3-step workflow. +""" + +from __future__ import annotations + +import json +from dataclasses import dataclass +from pathlib import Path + +from security_scanner.core.evaluation.metrics import ( + EvaluationCorpus, + ExpectedFinding, +) +from security_scanner.core.finding.model import Finding + +REPO = "synthetic-org/verifier-harness" +SCAN_RUN_ID = "scan_harness" +RULE_PACK_VERSION = "secret-rules-0.1.0" + +TP = "TP" +FP = "FP" + + +@dataclass(frozen=True) +class CorpusCase: + """One synthetic verifier case. ``label`` is the ground-truth verdict.""" + + file_path: str + line_start: int + rule_id: str + label: str # TP | FP + secret_serial: int + expected_role: str # ground-truth path-role classification (FR4 anchor) + reason: str = "" + + @property + def raw_secret(self) -> str: + return f"SCANNER_FAKE_SECRET_TOKEN_{self.secret_serial:06d}" + + +# Single source of truth. Coverage: 6 path-roles x TP/FP where realistic. +# documentation/example/test carry FP cases only (a real secret in those paths +# would defeat the coarse path-role anchor by design — that asymmetry IS the +# anchor's assumption). configuration/source carry both TP (real candidate) and +# FP (committed placeholder the anchor cannot distinguish). other carries FP +# that the anchor leaves as needs_review. +CORPUS_CASES: tuple[CorpusCase, ...] 
= ( + # --- TP: configuration --- + CorpusCase( + "config/positive.env", 4, "synthetic-fake-token", TP, 100001, "configuration" + ), + CorpusCase( + "config/database.env", 3, "synthetic-fake-token", TP, 100002, "configuration" + ), + CorpusCase( + "config/settings.toml", 10, "synthetic-api-key", TP, 100003, "configuration" + ), + CorpusCase( + "settings/prod.yaml", 5, "synthetic-fake-token", TP, 100004, "configuration" + ), + # --- TP: source --- + CorpusCase("src/app/secrets.py", 42, "synthetic-fake-token", TP, 100005, "source"), + CorpusCase("internal/auth.go", 18, "synthetic-api-key", TP, 100006, "source"), + CorpusCase("lib/client.rb", 7, "synthetic-fake-token", TP, 100007, "source"), + CorpusCase("services/payment.ts", 25, "synthetic-fake-token", TP, 100008, "source"), + # --- FP: documentation (anchor clears) --- + CorpusCase( + "docs/example.md", + 5, + "synthetic-fake-token", + FP, + 900001, + "documentation", + "Documentation shows a synthetic token-shaped example.", + ), + CorpusCase( + "docs/setup.md", + 12, + "synthetic-fake-token", + FP, + 900002, + "documentation", + "Setup guide illustrates a fake token.", + ), + CorpusCase( + "README.rst", + 4, + "synthetic-fake-token", + FP, + 900003, + "documentation", + "README sample value, not a real credential.", + ), + CorpusCase( + "docs/api/reference.txt", + 9, + "synthetic-api-key", + FP, + 900004, + "documentation", + "API reference text fixture.", + ), + # --- FP: example (anchor clears) --- + CorpusCase( + "examples/quickstart.py", + 6, + "synthetic-fake-token", + FP, + 900005, + "example", + "Quickstart example placeholder.", + ), + CorpusCase( + "samples/config.env", + 3, + "synthetic-fake-token", + FP, + 900006, + "example", + "Sample config placeholder.", + ), + CorpusCase( + "test/fixtures/creds.json", + 2, + "synthetic-api-key", + FP, + 900007, + "example", + "Test fixture credentials (synthetic).", + ), + # --- FP: test (anchor clears) --- + CorpusCase( + "tests/test_login.py", + 30, + 
"synthetic-fake-token", + FP, + 900008, + "test", + "Test asserts against a fake token.", + ), + # --- FP: configuration (anchor cannot distinguish; survives as FP) --- + CorpusCase( + "config/legacy.env", + 8, + "synthetic-fake-token", + FP, + 900009, + "configuration", + "Legacy placeholder token left in config; not live.", + ), + CorpusCase( + "config/template.toml", + 6, + "synthetic-api-key", + FP, + 900010, + "configuration", + "Template token placeholder.", + ), + # --- FP: source (anchor cannot distinguish; survives as FP) --- + CorpusCase( + "src/utils/format.py", + 15, + "synthetic-fake-token", + FP, + 900011, + "source", + "Public sample constant shaped like a token.", + ), + CorpusCase( + "internal/consts.go", + 3, + "synthetic-fake-token", + FP, + 900012, + "source", + "Documented constant, not a credential.", + ), + # --- FP: other (anchor leaves needs_review; survives) --- + CorpusCase( + "data/blob.bin", + 1, + "synthetic-fake-token", + FP, + 900013, + "other", + "Opaque blob, role not specific.", + ), + CorpusCase( + "Makefile", + 9, + "synthetic-fake-token", + FP, + 900014, + "other", + "Build file reference, role not specific.", + ), +) + + +def _finding_for(case: CorpusCase) -> Finding: + return Finding.create( + repo_full_name=REPO, + rule_id=case.rule_id, + file_path=case.file_path, + line_start=case.line_start, + raw_secret=case.raw_secret, + source_tool="gitleaks", + scan_run_id=SCAN_RUN_ID, + rule_pack_version=RULE_PACK_VERSION, + ) + + +def build_corpus_candidates() -> list[Finding]: + """Build the scanner 'before' candidate set (every case).""" + return [_finding_for(case) for case in CORPUS_CASES] + + +def build_expected_dict() -> dict: + """Build the ground-truth dict in the load_evaluation_corpus schema.""" + expected = [ + { + "repoFullName": REPO, + "filePath": case.file_path, + "lineStart": case.line_start, + "ruleId": case.rule_id, + } + for case in CORPUS_CASES + if case.label == TP + ] + known_negatives = [ + { + "repoFullName": 
REPO, + "filePath": case.file_path, + "lineStart": case.line_start, + "ruleId": case.rule_id, + "reason": case.reason, + } + for case in CORPUS_CASES + if case.label == FP + ] + return { + "schemaVersion": 1, + "name": "synthetic-verifier-harness-v1", + "description": ( + "Public-safe finding-level verifier corpus: 6 path-roles x TP/FP for " + "infra-free before/after accuracy measurement." + ), + "expectedFindings": expected, + "knownNegatives": known_negatives, + } + + +def build_evaluation_corpus() -> EvaluationCorpus: + """Build the EvaluationCorpus ground truth directly from CORPUS_CASES.""" + data = build_expected_dict() + return EvaluationCorpus( + schema_version=data["schemaVersion"], + name=data["name"], + expected_findings=[ + ExpectedFinding.from_dict(item) for item in data["expectedFindings"] + ], + known_negative_count=len(data["knownNegatives"]), + ) + + +def build_ideal_responses(candidates: list[Finding] | None = None) -> dict: + """Build a perfect-model recorded baseline keyed by finding_id. + + TP cases -> true_positive; FP cases -> false_positive. Confidence 0.95 so it + clears any reasonable min_confidence threshold. + """ + candidates = candidates or build_corpus_candidates() + by_id = {finding.finding_id: finding for finding in candidates} + responses: dict[str, dict] = {} + for case, finding in zip(CORPUS_CASES, build_corpus_candidates()): + # finding ids are deterministic; align case label to its finding. 
+ fid = finding.finding_id + if fid not in by_id: + continue + label = "true_positive" if case.label == TP else "false_positive" + responses[fid] = { + "label": label, + "confidence": 0.95, + "reason": case.reason or f"Synthetic {case.label} case.", + } + return {"name": "ideal-perfect-model-v1", "responses": responses} + + +def default_harness_dir() -> Path: + return Path(__file__).resolve().parents[4] / "eval" / "verifier-corpus" / "harness" + + +def write_corpus(base_dir: Path | str | None = None) -> dict[str, Path]: + """Write candidates.jsonl, expected.json, recorded-ideal.json to *base_dir*.""" + base = Path(base_dir) if base_dir is not None else default_harness_dir() + base.mkdir(parents=True, exist_ok=True) + candidates = build_corpus_candidates() + + candidates_path = base / "candidates.jsonl" + candidates_path.write_text( + "".join( + json.dumps(finding.to_dict(), ensure_ascii=False, sort_keys=True) + "\n" + for finding in candidates + ), + encoding="utf-8", + ) + + expected_path = base / "expected.json" + expected_path.write_text( + json.dumps(build_expected_dict(), ensure_ascii=False, indent=2, sort_keys=True) + + "\n", + encoding="utf-8", + ) + + recorded_path = base / "recorded-ideal.json" + recorded_path.write_text( + json.dumps( + build_ideal_responses(candidates), + ensure_ascii=False, + indent=2, + sort_keys=True, + ) + + "\n", + encoding="utf-8", + ) + return { + "candidates": candidates_path, + "expected": expected_path, + "recorded": recorded_path, + } + + +if __name__ == "__main__": # pragma: no cover - reproducible generation entrypoint + written = write_corpus() + for name, path in written.items(): + print(f"wrote {name}: {path}") diff --git a/src/security_scanner/core/evaluation/verifier_harness.py b/src/security_scanner/core/evaluation/verifier_harness.py new file mode 100644 index 0000000..d3d50c5 --- /dev/null +++ b/src/security_scanner/core/evaluation/verifier_harness.py @@ -0,0 +1,106 @@ +"""Infra-free verifier accuracy harness (FR1). 
+ +Converts corpus candidate findings into verified findings via a pluggable +strategy, then measures the before/after delta with the existing +``core.evaluation.metrics`` engine. No live model or network is required, so +prompt/anchor/threshold changes get reproducible numbers without the Tailscale +box (HANDOFF §1 OUT / §5 권장 2). + +Two strategies: + +- :class:`HeuristicVerifierStrategy` — the deterministic path-role anchor + (the same ``path_role_decision`` the live prompt uses), gated by + ``min_confidence``. Measures anchor quality with zero external dependency. +- :class:`RecordedVerifierStrategy` — pre-recorded model responses keyed by + finding_id, gated through the identical ``parse_verifier_response`` path. + Drives threshold calibration and is the drop-in for a real model run. +""" + +from __future__ import annotations + +import json +from collections.abc import Mapping +from dataclasses import dataclass +from pathlib import Path + +from security_scanner.core.evaluation.metrics import ( + EvaluationCorpus, + EvaluationThresholds, + VerifierDeltaResult, + evaluate_verifier_delta, +) +from security_scanner.core.finding.model import Finding +from security_scanner.llm.common.prompt import _path_role, path_role_decision +from security_scanner.llm.common.verifier import ( + VerifierResult, + apply_verifier_result, + parse_verifier_response, +) + + +@dataclass(frozen=True) +class HeuristicVerifierStrategy: + """Deterministic path-role anchor strategy (no model, no network).""" + + anchors: Mapping[str, Mapping[str, object]] | None = None + min_confidence: float = 0.60 + + def verify(self, finding: Finding) -> VerifierResult: + role = _path_role(finding.location.file_path) + label, confidence, reason = path_role_decision(role, self.anchors) + # Reuse the exact fail-closed gating of the live verify path so the + # harness and production share one threshold semantics. 
+ return parse_verifier_response( + json.dumps({"label": label, "confidence": confidence, "reason": reason}), + min_confidence=self.min_confidence, + ) + + +@dataclass(frozen=True) +class RecordedVerifierStrategy: + """Replay pre-recorded model responses keyed by finding_id.""" + + responses: Mapping[str, Mapping[str, object]] + min_confidence: float = 0.60 + + def verify(self, finding: Finding) -> VerifierResult: + record = self.responses.get(finding.finding_id) + if record is None: + # Fail-closed (I2 spirit): an unrecorded finding is not cleared. + return VerifierResult.needs_review("No recorded response for finding.") + return parse_verifier_response( + json.dumps(dict(record)), + min_confidence=self.min_confidence, + ) + + @classmethod + def from_file( + cls, path: str | Path, *, min_confidence: float = 0.60 + ) -> "RecordedVerifierStrategy": + data = json.loads(Path(path).read_text(encoding="utf-8")) + responses = data.get("responses", data) if isinstance(data, Mapping) else data + return cls(responses=responses, min_confidence=min_confidence) + + +def verify_candidates(candidates: list[Finding], strategy) -> list[Finding]: + """Apply *strategy* to each candidate, returning verified (triaged) findings.""" + return [ + apply_verifier_result(finding, strategy.verify(finding)) + for finding in candidates + ] + + +def run_corpus_delta( + corpus: EvaluationCorpus, + candidates: list[Finding], + strategy, + thresholds: EvaluationThresholds | None = None, +) -> VerifierDeltaResult: + """Measure before/after verifier accuracy on a corpus (reuses metrics.py).""" + verified = verify_candidates(candidates, strategy) + return evaluate_verifier_delta( + corpus.expected_findings, + candidates, + verified, + thresholds, + ) diff --git a/src/security_scanner/llm/common/prompt.py b/src/security_scanner/llm/common/prompt.py index 92efe68..0331582 100644 --- a/src/security_scanner/llm/common/prompt.py +++ b/src/security_scanner/llm/common/prompt.py @@ -4,16 +4,64 @@ import 
hashlib import json +from collections.abc import Mapping from pathlib import PurePath from security_scanner.core.finding.model import Finding - -def build_redacted_prompt(finding: Finding) -> str: +# Path-role anchors steer the verifier's confidence anchoring without ever +# touching raw secrets/paths/repos (I3-safe: label/confidence/reason metadata +# only). The default table reproduces the historical hardcoded behaviour; a +# yaml `verification.path_role_anchors` override is purely additive. +_FALSE_POSITIVE_REASON = ( + "documentation/example/test location is a likely non-production example" +) +_TRUE_POSITIVE_REASON = ( + "configuration/source location is a likely real secret candidate" +) +_OTHER_REASON = "path role is not specific enough" + +DEFAULT_PATH_ROLE_ANCHORS: Mapping[str, Mapping[str, object]] = { + "documentation": { + "label": "false_positive", + "confidence": 0.80, + "reason": _FALSE_POSITIVE_REASON, + }, + "example": { + "label": "false_positive", + "confidence": 0.80, + "reason": _FALSE_POSITIVE_REASON, + }, + "test": { + "label": "false_positive", + "confidence": 0.80, + "reason": _FALSE_POSITIVE_REASON, + }, + "configuration": { + "label": "true_positive", + "confidence": 0.80, + "reason": _TRUE_POSITIVE_REASON, + }, + "source": { + "label": "true_positive", + "confidence": 0.80, + "reason": _TRUE_POSITIVE_REASON, + }, + "other": {"label": "needs_review", "confidence": 0.61, "reason": _OTHER_REASON}, +} + + +def build_redacted_prompt( + finding: Finding, + *, + anchors: Mapping[str, Mapping[str, object]] | None = None, +) -> str: """Build a strict verifier prompt from redacted finding metadata only.""" path_role = _path_role(finding.location.file_path) file_extension = _file_extension(finding.location.file_path) - matched_label, matched_confidence, matched_reason = _path_role_decision(path_role) + matched_label, matched_confidence, matched_reason = path_role_decision( + path_role, anchors + ) metadata = { "findingId": finding.finding_id, 
"category": finding.category, @@ -65,7 +113,11 @@ def _fingerprint(value: str) -> str: def _path_kind(file_path: str) -> str: - return "absolute-redacted" if PurePath(file_path).is_absolute() else "relative-redacted" + return ( + "absolute-redacted" + if PurePath(file_path).is_absolute() + else "relative-redacted" + ) def _file_extension(file_path: str) -> str: @@ -98,17 +150,21 @@ def _path_role(file_path: str) -> str: return "other" -def _path_role_decision(path_role: str) -> tuple[str, float, str]: - if path_role in {"documentation", "example", "test"}: - return ( - "false_positive", - 0.80, - "documentation/example/test location is a likely non-production example", - ) - if path_role in {"configuration", "source"}: - return ( - "true_positive", - 0.80, - "configuration/source location is a likely real secret candidate", - ) - return ("needs_review", 0.61, "path role is not specific enough") +def path_role_decision( + path_role: str, + anchors: Mapping[str, Mapping[str, object]] | None = None, +) -> tuple[str, float, str]: + """Resolve the (label, confidence, reason) anchor for a path role. + + Falls back to the ``other`` anchor for unknown roles. With ``anchors=None`` + the result is byte-identical to the historical hardcoded decision. 
+ """ + table = anchors or DEFAULT_PATH_ROLE_ANCHORS + anchor = ( + table.get(path_role) or table.get("other") or DEFAULT_PATH_ROLE_ANCHORS["other"] + ) + return ( + str(anchor["label"]), + float(anchor["confidence"]), + str(anchor["reason"]), + ) diff --git a/src/security_scanner/llm/common/verifier.py b/src/security_scanner/llm/common/verifier.py index c639be1..bebe1de 100644 --- a/src/security_scanner/llm/common/verifier.py +++ b/src/security_scanner/llm/common/verifier.py @@ -4,7 +4,8 @@ import json import re -from dataclasses import dataclass +from collections.abc import Mapping +from dataclasses import dataclass, field from security_scanner.core.finding.model import Finding, Verdict @@ -34,6 +35,12 @@ class VerifierConfig: timeout_seconds: float = 30.0 min_confidence: float = 0.60 api_key_env: str | None = None + # Optional per-role anchor overrides (label/confidence/reason metadata only, + # I3-safe). None preserves the default prompt behaviour byte-for-byte. + # Excluded from eq/hash so a dict field never breaks the frozen dataclass. 
+ path_role_anchors: Mapping[str, Mapping[str, object]] | None = field( + default=None, compare=False, hash=False + ) @dataclass(frozen=True) @@ -63,7 +70,9 @@ def needs_review( ) -def parse_verifier_response(raw_content: str, *, min_confidence: float) -> VerifierResult: +def parse_verifier_response( + raw_content: str, *, min_confidence: float +) -> VerifierResult: """Validate a strict JSON verifier response, fail-closed on any ambiguity.""" try: data = json.loads(raw_content) diff --git a/src/security_scanner/llm/ollama/client.py b/src/security_scanner/llm/ollama/client.py index 158f03d..388b46d 100644 --- a/src/security_scanner/llm/ollama/client.py +++ b/src/security_scanner/llm/ollama/client.py @@ -44,7 +44,9 @@ class OllamaChatVerifier: """Verifier that sends redacted prompts to an Ollama-compatible chat API.""" - def __init__(self, config: VerifierConfig, transport: Transport | None = None) -> None: + def __init__( + self, config: VerifierConfig, transport: Transport | None = None + ) -> None: self.config = config self._transport = transport or self._default_transport @@ -63,7 +65,12 @@ def verify(self, finding: Finding) -> VerifierResult: "Return only strict JSON." 
), }, - {"role": "user", "content": build_redacted_prompt(finding)}, + { + "role": "user", + "content": build_redacted_prompt( + finding, anchors=self.config.path_role_anchors + ), + }, ], } try: diff --git a/src/security_scanner/runtime/verify_artifact.py b/src/security_scanner/runtime/verify_artifact.py index 497697e..1b895fe 100644 --- a/src/security_scanner/runtime/verify_artifact.py +++ b/src/security_scanner/runtime/verify_artifact.py @@ -3,7 +3,7 @@ from __future__ import annotations import os -from collections.abc import Callable +from collections.abc import Callable, Mapping from dataclasses import dataclass from pathlib import Path from typing import Protocol @@ -11,6 +11,7 @@ import yaml from security_scanner.core.finding.model import Finding, Status, Verdict +from security_scanner.llm.common.prompt import DEFAULT_PATH_ROLE_ANCHORS from security_scanner.llm.common.verifier import ( VerifierConfig, VerifierResult, @@ -22,6 +23,8 @@ EnvLookup = Callable[[str], str | None] +_ALLOWED_ANCHOR_LABELS = {"true_positive", "false_positive", "needs_review"} + class FindingVerifier(Protocol): def verify(self, finding: Finding) -> VerifierResult: @@ -119,12 +122,16 @@ def resolve_verifier_config( api_key_env = ollama_config.get("api_key_env") or env_lookup( "SECURITY_SCANNER_OLLAMA_API_KEY_ENV" ) + path_role_anchors = _resolve_path_role_anchors( + verification_config.get("path_role_anchors") + ) return VerifierConfig( host=str(host), model=str(model), timeout_seconds=timeout_seconds, min_confidence=min_confidence, api_key_env=str(api_key_env) if api_key_env else None, + path_role_anchors=path_role_anchors, ) @@ -249,6 +256,52 @@ def _status_for_verdict(verdict: str) -> str | None: return disposition_status_for_verdict(verdict) +def _resolve_path_role_anchors( + raw: object, +) -> dict[str, dict[str, object]] | None: + """Validate and merge yaml path-role anchor overrides onto the defaults. 
+ + Returns ``None`` when no override is present so the prompt path stays + byte-identical to the historical default. Partial overrides are merged per + role onto :data:`DEFAULT_PATH_ROLE_ANCHORS` so unspecified roles keep their + default anchor. Raises ``ValueError`` on malformed entries (I3-safe: only + label/confidence/reason metadata is accepted). + """ + if raw is None: + return None + if not isinstance(raw, Mapping): + raise ValueError("verification.path_role_anchors must be a mapping") + + merged = {role: dict(anchor) for role, anchor in DEFAULT_PATH_ROLE_ANCHORS.items()} + for role, anchor in raw.items(): + if not isinstance(anchor, Mapping): + raise ValueError(f"path_role_anchors[{role}] must be a mapping") + label = str(anchor.get("label", "")).strip().lower() + if label not in _ALLOWED_ANCHOR_LABELS: + raise ValueError( + f"path_role_anchors[{role}].label must be one of " + f"{sorted(_ALLOWED_ANCHOR_LABELS)}" + ) + try: + confidence = float(anchor.get("confidence")) + except (TypeError, ValueError) as exc: + raise ValueError( + f"path_role_anchors[{role}].confidence must be a number" + ) from exc + if not 0.0 <= confidence <= 1.0: + raise ValueError( + f"path_role_anchors[{role}].confidence must be between 0 and 1" + ) + reason = str( + anchor.get("reason") + or DEFAULT_PATH_ROLE_ANCHORS.get(str(role), {}).get( + "reason", "path role override" + ) + ) + merged[str(role)] = {"label": label, "confidence": confidence, "reason": reason} + return merged + + def _load_verifier_config(path: str | Path) -> dict: data = yaml.safe_load(Path(path).read_text(encoding="utf-8")) or {} if not isinstance(data, dict): diff --git a/tests/test_path_role_anchors.py b/tests/test_path_role_anchors.py new file mode 100644 index 0000000..b4b28a8 --- /dev/null +++ b/tests/test_path_role_anchors.py @@ -0,0 +1,179 @@ +"""Path-role classification (FR4) and configurable anchors (FR5). + +FR4 parametrizes the 6 path-roles plus edge cases that were previously only +spot-checked. 
FR5 proves yaml overrides are additive and never weaken the +default (byte-identical) prompt behaviour, preserving I3. +""" + +from __future__ import annotations + +import textwrap + +import pytest + +from security_scanner.core.evaluation.verifier_corpus import CORPUS_CASES +from security_scanner.core.finding.model import Finding, GitleaksFindingPayload +from security_scanner.llm.common.prompt import ( + DEFAULT_PATH_ROLE_ANCHORS, + _path_role, + build_redacted_prompt, + path_role_decision, +) +from security_scanner.runtime.verify_artifact import ( + VerifierConfigRequest, + resolve_verifier_config, +) + + +def _finding(file_path: str) -> Finding: + return Finding.create( + repo_full_name="synthetic-org/anchor-repo", + rule_id="synthetic-fake-token", + file_path=file_path, + line_start=3, + raw_secret="SCANNER_FAKE_SECRET_TOKEN_424242", + source_tool="gitleaks", + scan_run_id="scan_anchor", + rule_pack_version="secret-rules-0.1.0", + gitleaks=GitleaksFindingPayload( + rule_id="synthetic-fake-token", + file=file_path, + start_line=3, + secret="SCANNER_FAKE_SECRET_TOKEN_424242", + match="token=SCANNER_FAKE_SECRET_TOKEN_424242", + fingerprint="synthetic", + ), + ) + + +@pytest.mark.parametrize( + "file_path, expected_role", + [ + ("docs/setup.md", "documentation"), + ("README.rst", "documentation"), + ("notes/info.txt", "documentation"), + ("documentation/guide.adoc", "documentation"), # dir wins over unknown suffix + ("examples/quickstart.py", "example"), # example dir wins over .py source + ("samples/config.env", "example"), + ("test/fixtures/creds.json", "example"), # fixtures wins over .json config + ("tests/test_login.py", "test"), + ("src/test_helpers.py", "test"), # name startswith test_ + ("config/app.env", "configuration"), + ("settings/prod.yaml", "configuration"), + ("deploy/values.toml", "configuration"), + ("src/app/secrets.py", "source"), + ("internal/auth.go", "source"), + ("lib/client.rb", "source"), + ("data/blob.bin", "other"), + ("Makefile", 
"other"), + ], +) +def test_path_role_classification(file_path, expected_role): + assert _path_role(file_path) == expected_role + + +def test_every_corpus_case_classifies_to_its_declared_role(): + for case in CORPUS_CASES: + assert _path_role(case.file_path) == case.expected_role + + +@pytest.mark.parametrize( + "role, label, confidence", + [ + ("documentation", "false_positive", 0.80), + ("example", "false_positive", 0.80), + ("test", "false_positive", 0.80), + ("configuration", "true_positive", 0.80), + ("source", "true_positive", 0.80), + ("other", "needs_review", 0.61), + ("totally-unknown-role", "needs_review", 0.61), # falls back to other + ], +) +def test_default_anchor_decision(role, label, confidence): + got_label, got_conf, _ = path_role_decision(role) + assert got_label == label + assert got_conf == pytest.approx(confidence) + + +def test_default_anchors_keep_prompt_byte_identical(): + finding = _finding("docs/sample.md") + assert build_redacted_prompt(finding) == build_redacted_prompt( + finding, anchors=None + ) + assert build_redacted_prompt(finding) == build_redacted_prompt( + finding, anchors=DEFAULT_PATH_ROLE_ANCHORS + ) + + +def _write_config(tmp_path, body: str): + path = tmp_path / "verifier.yaml" + path.write_text(textwrap.dedent(body), encoding="utf-8") + return VerifierConfigRequest(config_path=path) + + +def test_yaml_anchor_override_is_merged_and_applied(tmp_path): + request = _write_config( + tmp_path, + """ + ollama: + host: http://ollama.test + model: test-model + verification: + path_role_anchors: + documentation: + label: true_positive + confidence: 0.95 + reason: org policy treats docs as real + """, + ) + + config = resolve_verifier_config(request, env_lookup=lambda _name: None) + + # Overridden role reflects the new anchor... + assert config.path_role_anchors["documentation"]["label"] == "true_positive" + # ...while unspecified roles keep their defaults (partial merge). 
+ assert config.path_role_anchors["configuration"]["label"] == "true_positive" + assert config.path_role_anchors["other"]["label"] == "needs_review" + + prompt = build_redacted_prompt( + _finding("docs/readme.md"), anchors=config.path_role_anchors + ) + assert "Current finding matched label: true_positive." in prompt + + +def test_no_yaml_override_resolves_to_none(tmp_path): + request = _write_config( + tmp_path, + """ + ollama: + host: http://ollama.test + model: test-model + """, + ) + config = resolve_verifier_config(request, env_lookup=lambda _name: None) + assert config.path_role_anchors is None + + +@pytest.mark.parametrize( + "anchor_body", + [ + "label: garbage\n confidence: 0.5", # invalid label + "label: true_positive\n confidence: 2.0", # out of range + "label: true_positive\n confidence: not-a-number", # non-numeric + ], +) +def test_invalid_yaml_anchor_raises(tmp_path, anchor_body): + request = _write_config( + tmp_path, + f""" + ollama: + host: http://ollama.test + model: test-model + verification: + path_role_anchors: + documentation: + {anchor_body} + """, + ) + with pytest.raises(ValueError): + resolve_verifier_config(request, env_lookup=lambda _name: None) diff --git a/tests/test_verifier_confidence_sweep.py b/tests/test_verifier_confidence_sweep.py new file mode 100644 index 0000000..6d0a7f4 --- /dev/null +++ b/tests/test_verifier_confidence_sweep.py @@ -0,0 +1,62 @@ +"""min_confidence calibration sweep over the synthetic corpus (FR6). + +The default 0.60 threshold had no corpus-level justification (HANDOFF §5 +MISSING #6). This sweep measures the precision/recall trade-off across the +threshold range and locks the calibration conclusion: + + - Recall is preserved at every threshold (no expected TP is ever cleared, + because TP anchors at 0.80 only downgrade to needs_review above 0.80 and a + needs_review verdict is still not cleared). 
+ - False-positive reduction is maximal on the plateau [0.0, 0.80] and collapses + to zero once the threshold passes the 0.80 anchor cliff. + - Therefore 0.60 sits safely inside the max-FP-reduction plateau, below the + 0.80 cliff, and just below the 0.61 `other` anchor — the chosen calibration. +""" + +from __future__ import annotations + +from security_scanner.core.evaluation import ( + HeuristicVerifierStrategy, + build_corpus_candidates, + build_evaluation_corpus, + run_corpus_delta, +) + +SWEEP = [0.50, 0.60, 0.70, 0.80, 0.81, 0.90] + + +def _fp_reduction_at(threshold: float) -> int: + corpus = build_evaluation_corpus() + candidates = build_corpus_candidates() + delta = run_corpus_delta( + corpus, candidates, HeuristicVerifierStrategy(min_confidence=threshold) + ) + assert delta.recall_preserved is True # recall preserved across the whole sweep + assert delta.after.false_negative_count == 0 + return delta.false_positive_reduction + + +def test_sweep_recall_preserved_and_plateau_then_cliff(): + reductions = {thr: _fp_reduction_at(thr) for thr in SWEEP} + + # Plateau: every threshold up to the 0.80 anchor clears the same 8 FPs. + for thr in (0.50, 0.60, 0.70, 0.80): + assert reductions[thr] == 8 + # Cliff: above 0.80 the anchor confidence no longer clears anything. + for thr in (0.81, 0.90): + assert reductions[thr] == 0 + + +def test_fp_reduction_is_monotonic_non_increasing(): + ordered = sorted(SWEEP) + reductions = [_fp_reduction_at(thr) for thr in ordered] + assert reductions == sorted(reductions, reverse=True) + # Load-bearing: a real cliff must exist (not a flat/all-zero sequence that + # would also satisfy the non-increasing check above). + assert reductions[0] > reductions[-1] + + +def test_default_threshold_is_inside_the_optimal_plateau(): + # 0.60 is the configured default (verifier.py); it must yield max reduction. 
+ plateau_max = max(_fp_reduction_at(thr) for thr in SWEEP) + assert _fp_reduction_at(0.60) == plateau_max diff --git a/tests/test_verifier_harness.py b/tests/test_verifier_harness.py new file mode 100644 index 0000000..0a89932 --- /dev/null +++ b/tests/test_verifier_harness.py @@ -0,0 +1,145 @@ +"""Infra-free verifier accuracy harness tests (FR1, FR3, FR8). + +Locks the deterministic before/after baseline so any anchor/threshold change is +measured, not asserted by inspection. No live model or network is used. +""" + +from __future__ import annotations + +import json + +import pytest + +from security_scanner.core.evaluation import ( + EvaluationThresholds, + HeuristicVerifierStrategy, + RecordedVerifierStrategy, + build_corpus_candidates, + build_evaluation_corpus, + build_ideal_responses, + render_verifier_delta_report, + run_corpus_delta, +) +from security_scanner.core.evaluation.verifier_corpus import ( + CORPUS_CASES, + FP, + TP, + default_harness_dir, + write_corpus, +) + + +def test_corpus_is_large_enough_and_role_diverse(): + corpus = build_evaluation_corpus() + candidates = build_corpus_candidates() + + assert len(candidates) >= 20 + assert len(corpus.expected_findings) == sum( + 1 for c in CORPUS_CASES if c.label == TP + ) + assert corpus.known_negative_count == sum(1 for c in CORPUS_CASES if c.label == FP) + # All six path-roles are represented. + assert {c.expected_role for c in CORPUS_CASES} == { + "documentation", + "example", + "test", + "configuration", + "source", + "other", + } + + +def test_heuristic_baseline_preserves_recall_and_reduces_false_positives(): + """FR3 gate: the path-role anchor must reduce FPs without losing recall.""" + corpus = build_evaluation_corpus() + candidates = build_corpus_candidates() + + delta = run_corpus_delta( + corpus, candidates, HeuristicVerifierStrategy(min_confidence=0.60) + ) + + # Meaningful invariants (I2/I5 spirit): no expected TP is ever cleared. 
+ assert delta.recall_preserved is True + assert delta.after.false_negative_count == 0 + assert delta.false_positive_reduction == 8 + # Locked baseline so a future anchor change is detected. + assert delta.before.precision == pytest.approx(8 / 22) + assert delta.after.precision == pytest.approx(8 / 14) + assert delta.after.recall == pytest.approx(1.0) + + +def test_recorded_ideal_baseline_passes_strict_gate(): + """FR8: a perfect recorded model clears every FP and keeps every TP.""" + corpus = build_evaluation_corpus() + candidates = build_corpus_candidates() + responses = build_ideal_responses(candidates)["responses"] + + delta = run_corpus_delta( + corpus, + candidates, + RecordedVerifierStrategy(responses, min_confidence=0.60), + EvaluationThresholds(false_negative_max=0, precision_min=0.90, recall_min=0.99), + ) + + assert delta.gate.passed is True + assert delta.after.precision == pytest.approx(1.0) + assert delta.false_positive_reduction == 14 + + +def test_recorded_strategy_fails_closed_for_unrecorded_finding(): + candidates = build_corpus_candidates() + strategy = RecordedVerifierStrategy({}, min_confidence=0.60) + + result = strategy.verify(candidates[0]) + + assert result.verdict == "NEEDS_REVIEW" + assert result.confidence == 0.0 + + +def test_recorded_strategy_from_file_roundtrip(tmp_path): + paths = write_corpus(tmp_path) + strategy = RecordedVerifierStrategy.from_file( + paths["recorded"], min_confidence=0.60 + ) + corpus = build_evaluation_corpus() + candidates = build_corpus_candidates() + + delta = run_corpus_delta(corpus, candidates, strategy) + + assert delta.after.precision == pytest.approx(1.0) + + +def test_committed_corpus_artifacts_are_fresh(tmp_path): + """Regenerating from CORPUS_CASES must match the committed artifacts.""" + regenerated = write_corpus(tmp_path) + committed_dir = default_harness_dir() + + for name, regen_path in regenerated.items(): + committed = (committed_dir / regen_path.name).read_text(encoding="utf-8") + assert 
committed == regen_path.read_text(encoding="utf-8"), ( + f"{regen_path.name} is stale; run " + "`python -m security_scanner.core.evaluation.verifier_corpus`" + ) + + +def test_corpus_and_report_are_public_safe(): + """I3/I6: no raw secret leaks into corpus artifacts or the rendered report.""" + candidates = build_corpus_candidates() + delta = run_corpus_delta( + build_evaluation_corpus(), candidates, HeuristicVerifierStrategy() + ) + report = render_verifier_delta_report(delta) + + committed_dir = default_harness_dir() + blobs = [report] + for name in ("candidates.jsonl", "expected.json", "recorded-ideal.json"): + blobs.append((committed_dir / name).read_text(encoding="utf-8")) + + for case in CORPUS_CASES: + for blob in blobs: + assert case.raw_secret not in blob + # Only salted hashes ever represent secrets in the candidate store. + candidates_text = (committed_dir / "candidates.jsonl").read_text(encoding="utf-8") + assert "SCANNER_FAKE_SECRET_TOKEN_" not in candidates_text + for line in candidates_text.splitlines(): + assert json.loads(line)["evidence"]["secretHash"].startswith("salted-sha256:") From e2f2c110f349de9ee587dbeaba2ad3f614d9f1c7 Mon Sep 17 00:00:00 2001 From: pureliture Date: Sat, 20 Jun 2026 08:34:26 +0900 Subject: [PATCH 2/2] refactor(verifier): address gemini-code-assist review comments (#45) - RecordedVerifierStrategy.from_file: fail-closed ValueError on non-mapping JSON (avoids later AttributeError) + parametrized type-validation test - VerifierStrategy Protocol typing for verify_candidates/run_corpus_delta - build_ideal_responses: single build_corpus_candidates() call (dedup) Co-Authored-By: Claude Opus 4.8 --- .../core/evaluation/verifier_corpus.py | 11 ++++------- .../core/evaluation/verifier_harness.py | 19 ++++++++++++++++--- tests/test_verifier_harness.py | 12 ++++++++++++ 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/src/security_scanner/core/evaluation/verifier_corpus.py 
b/src/security_scanner/core/evaluation/verifier_corpus.py index e58f083..e02330a 100644 --- a/src/security_scanner/core/evaluation/verifier_corpus.py +++ b/src/security_scanner/core/evaluation/verifier_corpus.py @@ -285,16 +285,13 @@ def build_ideal_responses(candidates: list[Finding] | None = None) -> dict: TP cases -> true_positive; FP cases -> false_positive. Confidence 0.95 so it clears any reasonable min_confidence threshold. """ + # CORPUS_CASES and build_corpus_candidates() share order, so a single build + # aligns each case to its finding (finding ids are deterministic). candidates = candidates or build_corpus_candidates() - by_id = {finding.finding_id: finding for finding in candidates} responses: dict[str, dict] = {} - for case, finding in zip(CORPUS_CASES, build_corpus_candidates()): - # finding ids are deterministic; align case label to its finding. - fid = finding.finding_id - if fid not in by_id: - continue + for case, finding in zip(CORPUS_CASES, candidates): label = "true_positive" if case.label == TP else "false_positive" - responses[fid] = { + responses[finding.finding_id] = { "label": label, "confidence": 0.95, "reason": case.reason or f"Synthetic {case.label} case.", diff --git a/src/security_scanner/core/evaluation/verifier_harness.py b/src/security_scanner/core/evaluation/verifier_harness.py index d3d50c5..29edad4 100644 --- a/src/security_scanner/core/evaluation/verifier_harness.py +++ b/src/security_scanner/core/evaluation/verifier_harness.py @@ -22,6 +22,7 @@ from collections.abc import Mapping from dataclasses import dataclass from pathlib import Path +from typing import Protocol from security_scanner.core.evaluation.metrics import ( EvaluationCorpus, @@ -38,6 +39,12 @@ ) +class VerifierStrategy(Protocol): + """Structural type for a candidate-to-result verifier strategy.""" + + def verify(self, finding: Finding) -> VerifierResult: ... 
+ + @dataclass(frozen=True) class HeuristicVerifierStrategy: """Deterministic path-role anchor strategy (no model, no network).""" @@ -78,11 +85,17 @@ def from_file( cls, path: str | Path, *, min_confidence: float = 0.60 ) -> "RecordedVerifierStrategy": data = json.loads(Path(path).read_text(encoding="utf-8")) - responses = data.get("responses", data) if isinstance(data, Mapping) else data + if not isinstance(data, Mapping): + raise ValueError("Recorded responses file must contain a JSON object.") + responses = data.get("responses", data) + if not isinstance(responses, Mapping): + raise ValueError("Recorded responses must be a mapping.") return cls(responses=responses, min_confidence=min_confidence) -def verify_candidates(candidates: list[Finding], strategy) -> list[Finding]: +def verify_candidates( + candidates: list[Finding], strategy: VerifierStrategy +) -> list[Finding]: """Apply *strategy* to each candidate, returning verified (triaged) findings.""" return [ apply_verifier_result(finding, strategy.verify(finding)) @@ -93,7 +106,7 @@ def verify_candidates(candidates: list[Finding], strategy) -> list[Finding]: def run_corpus_delta( corpus: EvaluationCorpus, candidates: list[Finding], - strategy, + strategy: VerifierStrategy, thresholds: EvaluationThresholds | None = None, ) -> VerifierDeltaResult: """Measure before/after verifier accuracy on a corpus (reuses metrics.py).""" diff --git a/tests/test_verifier_harness.py b/tests/test_verifier_harness.py index 0a89932..3c5ac39 100644 --- a/tests/test_verifier_harness.py +++ b/tests/test_verifier_harness.py @@ -143,3 +143,15 @@ def test_corpus_and_report_are_public_safe(): assert "SCANNER_FAKE_SECRET_TOKEN_" not in candidates_text for line in candidates_text.splitlines(): assert json.loads(line)["evidence"]["secretHash"].startswith("salted-sha256:") + + +@pytest.mark.parametrize( + "payload", + ["[]", "42", "1.5", "true", "null", '"a string"', '{"responses": [1, 2]}'], +) +def 
test_recorded_from_file_rejects_non_mapping(tmp_path, payload): + """from_file fails closed on any non-object JSON instead of AttributeError later.""" + path = tmp_path / "bad.json" + path.write_text(payload, encoding="utf-8") + with pytest.raises(ValueError): + RecordedVerifierStrategy.from_file(path)