From 4d8d5e77382876430fc24625e757257d23e035a5 Mon Sep 17 00:00:00 2001
From: pureliture
Date: Tue, 16 Jun 2026 22:21:01 +0900
Subject: [PATCH 1/3] feat(incremental): branch-aware residual + scan-worker daemon (#12)

Close issue #12 remaining gaps on top of the PR #15 queue/worker MVP:

- branch as occurrence: scan_worker tags Finding.repo.branch from
  ScanJob.ref_name; local_scan reads git HEAD context;
  ScanRunSummary.branch is recorded
- FINDING_OBSERVATION items project branch/commit top-level (queryable)
- per-branch residual derived within the REPO# partition (no new GSI):
  residual_by_branch / residual_for_repo, matched on
  commit==last_seen_sha so a commit at multiple ref tips is residual on
  every such branch
- scan-worker --daemon polling mode with SIGINT/SIGTERM graceful shutdown
- rule_pack_version change invalidates the commit ledger -> rescan
  (verified)

Design: status/disposition stays global (STATE#GLOBAL); branch never
enters finding identity (fingerprint); evaluate_gate is unchanged
(residual is report/observation visibility only).

Tests: 505 -> 532 (+27), incl. multi-agent-review fixes (shared-commit
residual, daemon dead-letter exit-2, local_scan e2e git context).

Co-Authored-By: Claude Opus 4.8 (1M context) --- src/security_scanner/cli/app.py | 72 ++++++++-- .../runtime/branch_residual.py | 136 ++++++++++++++++++ src/security_scanner/runtime/local_scan.py | 37 +++++ src/security_scanner/runtime/scan_worker.py | 74 +++++++++- .../storage/adapters/nosql_db/items.py | 5 + .../storage/adapters/nosql_db/store.py | 39 +++++ src/security_scanner/storage/base.py | 1 + tests/test_branch_residual.py | 121 ++++++++++++++++ tests/test_cli_scan_worker.py | 75 ++++++++++ tests/test_dynamodb_compatible_store.py | 65 +++++++++ tests/test_incremental_scan_storage.py | 44 ++++++ tests/test_local_scan_git_context.py | 63 ++++++++ tests/test_local_scan_runtime.py | 67 +++++++++ tests/test_nosql_db_adapter.py | 30 ++++ tests/test_scan_worker.py | 92 ++++++++++++ 15 files changed, 902 insertions(+), 19 deletions(-) create mode 100644 src/security_scanner/runtime/branch_residual.py create mode 100644 tests/test_branch_residual.py create mode 100644 tests/test_local_scan_git_context.py diff --git a/src/security_scanner/cli/app.py b/src/security_scanner/cli/app.py index 4128359..a80fadb 100644 --- a/src/security_scanner/cli/app.py +++ b/src/security_scanner/cli/app.py @@ -11,7 +11,9 @@ import argparse import os +import signal import sys +import threading from pathlib import Path from security_scanner.catalog.scan_target import ScanTarget @@ -70,9 +72,11 @@ utc_now_iso, ) from security_scanner.runtime.scan_worker import ( + ScanWorkerDaemonSummary, ScanWorkerRequest, ScanWorkerSummary, make_default_scanner, + run_scan_worker, run_scan_worker_once, ) from security_scanner.runtime.verify_artifact import ( @@ -407,9 +411,13 @@ def _render_discovery_summary(summary: IncrementalDiscoverySummary) -> None: def cmd_scan_worker(args: argparse.Namespace) -> int: - """Process queued incremental scan jobs once.""" - if not args.once: - print("error: scan-worker MVP requires --once", file=sys.stderr) + """Process queued incremental scan jobs once or as a polling 
daemon.""" + if not args.once and not args.daemon: + print("error: scan-worker requires --once or --daemon", file=sys.stderr) + return 2 + if args.once and args.daemon: + print("error: scan-worker --once and --daemon are mutually exclusive", + file=sys.stderr) return 2 if args.storage_backend != "dynamodb": print( @@ -420,16 +428,24 @@ def cmd_scan_worker(args: argparse.Namespace) -> int: try: store = _store_from_args(args) - summary = run_scan_worker_once( - ScanWorkerRequest( - store=store, - fetch_repo=fetch_or_clone, - scanner=make_default_scanner(), - max_jobs=args.max_jobs, - lease_seconds=args.lease_seconds, - worker_id=args.worker_id, - ) + request = ScanWorkerRequest( + store=store, + fetch_repo=fetch_or_clone, + scanner=make_default_scanner(), + max_jobs=args.max_jobs, + lease_seconds=args.lease_seconds, + worker_id=args.worker_id, ) + if args.daemon: + stop = _install_signal_shutdown() + daemon_summary = run_scan_worker( + request, + poll_interval_seconds=args.poll_interval, + should_continue=lambda: not stop.is_set(), + ) + _render_scan_worker_daemon_summary(daemon_summary) + return 2 if daemon_summary.has_permanent_failure else 0 + summary = run_scan_worker_once(request) except Exception as exc: # noqa: BLE001 - fatal storage/runtime error. 
print(f"error: scan-worker failed: {exc}", file=sys.stderr) return 1 @@ -438,6 +454,26 @@ def cmd_scan_worker(args: argparse.Namespace) -> int: return 2 if summary.has_permanent_failure else 0 +def _install_signal_shutdown() -> threading.Event: + """Return an Event set on SIGINT/SIGTERM for graceful daemon shutdown.""" + stop = threading.Event() + + def _handler(_signum, _frame): # noqa: ANN001 - signal handler signature + stop.set() + + for sig in (signal.SIGINT, signal.SIGTERM): + signal.signal(sig, _handler) + return stop + + +def _render_scan_worker_daemon_summary(summary: ScanWorkerDaemonSummary) -> None: + print(f"polls: {summary.polls}") + print(f"leased: {summary.leased}") + print(f"completed: {summary.completed}") + print(f"retryable: {summary.retryable}") + print(f"dead-lettered: {summary.dead_lettered}") + + def _render_scan_worker_summary(summary: ScanWorkerSummary) -> None: print(f"leased: {summary.leased}") print(f"completed: {summary.completed}") @@ -1091,6 +1127,18 @@ def build_parser() -> argparse.ArgumentParser: action="store_true", help="Process at most --max-jobs jobs and exit.", ) + scan_worker_parser.add_argument( + "--daemon", + action="store_true", + help="Poll the queue continuously until SIGINT/SIGTERM.", + ) + scan_worker_parser.add_argument( + "--poll-interval", + type=float, + default=5.0, + metavar="SECONDS", + help="Idle poll interval for --daemon (default: 5.0).", + ) scan_worker_parser.add_argument( "--max-jobs", type=int, diff --git a/src/security_scanner/runtime/branch_residual.py b/src/security_scanner/runtime/branch_residual.py new file mode 100644 index 0000000..f4a0b42 --- /dev/null +++ b/src/security_scanner/runtime/branch_residual.py @@ -0,0 +1,136 @@ +"""Per-branch residual computation for incremental scanning (issue #12). + +Design (grill L1/L2): finding status/disposition is GLOBAL; branch is an +occurrence dimension only. 
"Residual on branch B" is DERIVED, not stored: a +finding is residual on B when it appears in an observation at B's latest scanned +commit (``RefState.last_seen_sha``). No new GSI — callers pass observations read +within the ``REPO#`` partition. +""" + +from __future__ import annotations + +from collections.abc import Iterable, Mapping +from dataclasses import dataclass +from typing import Any, Protocol + +from security_scanner.core.finding.model import Finding +from security_scanner.storage.base import RefState + + +def branch_from_ref(ref_name: str | None) -> str | None: + """Derive a short branch name from a git ref. + + branch never enters finding identity (L1). Tags resolve to None since they + are not branches. + """ + if not ref_name: + return None + if ref_name.startswith("refs/heads/"): + return ref_name[len("refs/heads/") :] + if ref_name.startswith("refs/remotes/"): + rest = ref_name[len("refs/remotes/") :] + parts = rest.split("/", 1) + return parts[1] if len(parts) == 2 else parts[0] + if ref_name.startswith("refs/tags/"): + return None + return ref_name + + +def finding_with_context( + finding: Finding, *, commit: str | None, branch: str | None +) -> Finding: + """Tag a finding with scan-context commit/branch (occurrence, not identity). + + None values never clobber existing context; identity (finding_id) is + unaffected because branch/commit are not part of the fingerprint (L1). 
+ """ + if finding.repo.commit == commit and finding.repo.branch == branch: + return finding + data = finding.to_dict() + repo = dict(data["repo"]) + if commit is not None: + repo["commit"] = commit + if branch is not None: + repo["branch"] = branch + data["repo"] = repo + return Finding.from_dict(data) + + +@dataclass(frozen=True) +class BranchResidual: + """Findings still present on a branch at its latest scanned commit.""" + + branch: str + commit: str + finding_ids: list[str] + + +def residual_by_branch( + ref_states: Iterable[RefState], + observations: Iterable[Mapping[str, Any]], +) -> list[BranchResidual]: + """Compute residual findings per branch. + + Parameters + ---------- + ref_states: + Latest known ref states for one repository (``last_seen_sha`` per ref). + observations: + Observation records (top-level ``branch`` / ``commit`` / ``findingId``) + read within the repository partition. + + Returns one ``BranchResidual`` per ref that resolves to a branch (tags + skipped), ordered by branch name. A branch with no matching observation + yields an empty ``finding_ids`` list. + """ + obs_list = list(observations) + results: list[BranchResidual] = [] + for ref in ref_states: + branch = branch_from_ref(ref.ref_name) + if branch is None: + continue + seen: set[str] = set() + finding_ids: list[str] = [] + for obs in obs_list: + # Match on commit only: a commit is scanned once and its observation + # is tagged with the first ref's branch, but a commit reachable from + # several refs is residual on every ref whose tip is that commit + # (branch is occurrence, derived from the ref — not the obs label). 
+ if obs.get("commit") != ref.last_seen_sha: + continue + finding_id = obs.get("findingId") + if finding_id is None or finding_id in seen: + continue + seen.add(finding_id) + finding_ids.append(finding_id) + results.append( + BranchResidual( + branch=branch, + commit=ref.last_seen_sha, + finding_ids=sorted(finding_ids), + ) + ) + return sorted(results, key=lambda r: r.branch) + + +class _ResidualStore(Protocol): + def list_ref_states(self, repo_id: str) -> list[RefState]: ... + + def read_observations_for_repo(self, repo_id: str) -> list[Mapping[str, Any]]: ... + + +def residual_for_repo(store: _ResidualStore, repo_id: str) -> list[BranchResidual]: + """End-to-end per-branch residual for one repository. + + Reads ref states + observations within the repo partition and derives + residual. Status/disposition stays global (L1); this is a derived view. + + NOTE: meaningful only for incrementally-scanned repos, where REF_STATE rows + and observation ``gsi1pk`` are both keyed by the same ``repo_id`` (the + scan-worker path sets ``repo_full_name == repo_id``). For local_scan-only + repos there are no REF_STATE rows, so this returns an empty list. 
+ """ + return residual_by_branch( + store.list_ref_states(repo_id), + store.read_observations_for_repo(repo_id), + ) diff --git a/src/security_scanner/runtime/local_scan.py b/src/security_scanner/runtime/local_scan.py index 1be3db6..64dd48c 100644 --- a/src/security_scanner/runtime/local_scan.py +++ b/src/security_scanner/runtime/local_scan.py @@ -3,12 +3,14 @@ from __future__ import annotations import datetime as dt +import subprocess import uuid from dataclasses import dataclass, field from pathlib import Path from typing import Callable, Protocol from security_scanner.core.finding.model import Finding +from security_scanner.runtime.branch_residual import finding_with_context from security_scanner.scanners.gitleaks.scanner import GitleaksScanner from security_scanner.storage.adapters.nosql_db.transport import ( DynamoDbCompatibleConfig, @@ -102,6 +104,34 @@ def _default_scanner_factory(manifest: Manifest) -> GitleaksScanner: return GitleaksScanner(config_path=manifest.gitleaks_config) +def _git_output(root: Path, args: list[str]) -> str | None: + try: + result = subprocess.run( + ["git", "-C", str(root), *args], + capture_output=True, + text=True, + check=False, + ) + except OSError: + return None + if result.returncode != 0: + return None + return result.stdout.strip() or None + + +def _git_head_context(root: Path) -> tuple[str | None, str | None]: + """Return (branch, commit) for the checkout at *root*, tolerant of failures. + + branch is None for a detached HEAD or when git context is unavailable; both + are occurrence dimensions only (issue #12 L1). 
+ """ + branch = _git_output(root, ["rev-parse", "--abbrev-ref", "HEAD"]) + commit = _git_output(root, ["rev-parse", "HEAD"]) + if branch == "HEAD": # detached HEAD has no branch name + branch = None + return branch, commit + + def _store_for_request( request: LocalScanRequest, store_factory: StoreFactory, @@ -175,6 +205,12 @@ def run_local_scan( scan_run_id=scan_run_id, rule_pack_version=request.rule_pack_version, ) + branch, commit = _git_head_context(root) + if branch is not None or commit is not None: + findings = [ + finding_with_context(finding, commit=commit, branch=branch) + for finding in findings + ] if raw_evidence_store is not None: raw_evidence_store.record_all( findings, scan_run_id=scan_run_id, observed_at=scan_at_iso @@ -187,6 +223,7 @@ def run_local_scan( findings=findings, scan_run_id=scan_run_id, scan_at_iso=scan_at_iso, + branch=branch, ) ) scanned += 1 diff --git a/src/security_scanner/runtime/scan_worker.py b/src/security_scanner/runtime/scan_worker.py index c8fb1a7..e647aac 100644 --- a/src/security_scanner/runtime/scan_worker.py +++ b/src/security_scanner/runtime/scan_worker.py @@ -3,6 +3,7 @@ from __future__ import annotations import datetime as dt +import time import uuid from dataclasses import dataclass from pathlib import Path @@ -10,6 +11,10 @@ from security_scanner.core.finding.model import Finding from security_scanner.core.scan.options import ScanOptions +from security_scanner.runtime.branch_residual import ( + branch_from_ref, + finding_with_context, +) from security_scanner.scanners.gitleaks.scanner import GitleaksScanner from security_scanner.storage.base import ( IncrementalScanStore, @@ -118,7 +123,13 @@ def run_scan_worker_once(request: ScanWorkerRequest) -> ScanWorkerSummary: scan_run_id=scan_run_id, rule_pack_version=job.rule_pack_version, ) - findings = [_finding_with_commit(finding, job.commit_sha) for finding in findings] + branch = branch_from_ref(job.ref_name) + findings = [ + finding_with_context( + finding, 
commit=job.commit_sha, branch=branch + ) + for finding in findings + ] scanned_at = _now(request) request.store.complete_processed_job( job, @@ -152,6 +163,61 @@ def run_scan_worker_once(request: ScanWorkerRequest) -> ScanWorkerSummary: ) +@dataclass(frozen=True) +class ScanWorkerDaemonSummary: + """Aggregated summary across a daemon poll loop.""" + + polls: int = 0 + leased: int = 0 + completed: int = 0 + retryable: int = 0 + dead_lettered: int = 0 + + @property + def has_permanent_failure(self) -> bool: + return self.dead_lettered > 0 + + +def run_scan_worker( + request: ScanWorkerRequest, + *, + poll_interval_seconds: float, + max_polls: int | None = None, + sleep: Callable[[float], None] = time.sleep, + should_continue: Callable[[], bool] = lambda: True, +) -> ScanWorkerDaemonSummary: + """Poll the queue until shutdown, draining work then idling. + + Each poll runs one bounded ``run_scan_worker_once``. When a poll leases no + work the loop sleeps ``poll_interval_seconds`` before the next poll; when it + finds work it polls again immediately to drain the backlog. The loop stops + when ``should_continue()`` is False or ``max_polls`` is reached. Sleep and + shutdown are injected so the loop is deterministic under test. 
+ """ + polls = leased = completed = retryable = dead_lettered = 0 + + def _bounded() -> bool: + return max_polls is None or polls < max_polls + + while should_continue() and _bounded(): + summary = run_scan_worker_once(request) + polls += 1 + leased += summary.leased + completed += summary.completed + retryable += summary.retryable + dead_lettered += summary.dead_lettered + if summary.leased == 0 and should_continue() and _bounded(): + sleep(poll_interval_seconds) + + return ScanWorkerDaemonSummary( + polls=polls, + leased=leased, + completed=completed, + retryable=retryable, + dead_lettered=dead_lettered, + ) + + def make_default_scanner() -> GitleaksScanner: """Return the default commit scanner.""" return GitleaksScanner() @@ -181,12 +247,6 @@ def _ledger_for_job( ) -def _finding_with_commit(finding: Finding, commit_sha: str) -> Finding: - if finding.repo.commit == commit_sha: - return finding - data = finding.to_dict() - data["repo"] = {**data["repo"], "commit": commit_sha} - return Finding.from_dict(data) def _now(request: ScanWorkerRequest) -> dt.datetime: diff --git a/src/security_scanner/storage/adapters/nosql_db/items.py b/src/security_scanner/storage/adapters/nosql_db/items.py index 89251fd..58a6e01 100644 --- a/src/security_scanner/storage/adapters/nosql_db/items.py +++ b/src/security_scanner/storage/adapters/nosql_db/items.py @@ -563,6 +563,11 @@ def finding_to_items(finding: Finding) -> list[dict[str, Any]]: "file": finding.location.file_path, "startLine": finding.location.line_start, "fingerprint": finding.fingerprint, + # occurrence dimensions projected top-level for per-branch residual + # queries within the REPO# partition (issue #12 L2). None values + # are dropped by without_none() for back-compat with legacy findings. 
+ "branch": finding.repo.branch, + "commit": finding.repo.commit, "findingSnapshot": finding.to_dict(), } state_item = { diff --git a/src/security_scanner/storage/adapters/nosql_db/store.py b/src/security_scanner/storage/adapters/nosql_db/store.py index 9ad1b34..2166080 100644 --- a/src/security_scanner/storage/adapters/nosql_db/store.py +++ b/src/security_scanner/storage/adapters/nosql_db/store.py @@ -398,6 +398,7 @@ def write_scan_result(self, result: TargetScanResult) -> None: repo_key=result.target_name, scan_run_id=result.scan_run_id, scan_at_iso=result.scan_at_iso, + branch=result.branch, counts_total=len(findings), counts_by_label=counts_by_label, ) @@ -456,6 +457,44 @@ def read_for_scan_run(self, scan_run_id: str) -> list[Finding]: ) return merge_finding_states(findings, state_by_id) + def list_ref_states(self, repo_id: str) -> list[RefState]: + """Return all known ref states for one repository (issue #12).""" + items = query_all_pages( + self._table, + KeyConditionExpression="PK = :pk AND begins_with(SK, :sk_prefix)", + ExpressionAttributeValues={ + ":pk": f"REPO#{repo_id}", + ":sk_prefix": "REF#", + }, + ) + return [ + ref_state_from_item(item) + for item in items + if item.get("entityType") == "REF_STATE" + ] + + def read_observations_for_repo(self, repo_id: str) -> list[dict[str, Any]]: + """Return raw FINDING_OBSERVATION items for one repository. + + Used to derive per-branch residual within the REPO# partition + (issue #12 L2) without a dedicated branch index. + """ + # Observations use gsi1sk="RUN#...", so the begins_with prefix narrows the + # GSI1 partition read server-side; the entityType filter is belt-and-braces + # (FINDING/FINDING_STATE use "FINDING#", STATE_EVENT uses "STATE_EVENT#"). 
+ items = query_all_pages( + self._table, + IndexName=GSI1_NAME, + KeyConditionExpression="gsi1pk = :pk AND begins_with(gsi1sk, :sk_prefix)", + ExpressionAttributeValues={ + ":pk": f"REPO#{repo_id}", + ":sk_prefix": "RUN#", + }, + ) + return [ + item for item in items if item.get("entityType") == "FINDING_OBSERVATION" + ] + def read_finding_state(self, finding_id: str) -> dict[str, Any] | None: response = self._table.get_item( Key={"PK": f"FINDING#{finding_id}", "SK": f"STATE#{STATE_SCOPE_GLOBAL}"} diff --git a/src/security_scanner/storage/base.py b/src/security_scanner/storage/base.py index a78bb4c..79c10d1 100644 --- a/src/security_scanner/storage/base.py +++ b/src/security_scanner/storage/base.py @@ -22,6 +22,7 @@ class TargetScanResult: findings: Sequence[Finding] scan_run_id: str scan_at_iso: str + branch: str | None = None @dataclass(frozen=True) diff --git a/tests/test_branch_residual.py b/tests/test_branch_residual.py new file mode 100644 index 0000000..b48febc --- /dev/null +++ b/tests/test_branch_residual.py @@ -0,0 +1,121 @@ +"""Tests for per-branch residual computation (issue #12 L1/L2).""" + +from __future__ import annotations + +import datetime as dt + +from security_scanner.core.finding.model import Finding +from security_scanner.runtime.branch_residual import ( + BranchResidual, + branch_from_ref, + finding_with_context, + residual_by_branch, +) +from security_scanner.storage.base import RefState + + +def _finding(*, branch=None, commit=None) -> Finding: + return Finding.create( + repo_full_name="org/repo", + repo_branch=branch, + repo_commit=commit, + rule_id="generic-api-key", + file_path="src/config.py", + line_start=10, + raw_secret="synthetic-value", + source_tool="gitleaks", + scan_run_id="scan_x", + rule_pack_version="secret-rules-0.1.0", + ) + + +def test_finding_with_context_tags_commit_and_branch(): + tagged = finding_with_context(_finding(), commit="abc", branch="main") + + assert tagged.repo.commit == "abc" + assert tagged.repo.branch == 
"main" + + +def test_finding_with_context_none_does_not_clobber(): + base = _finding(branch="main", commit="abc") + + tagged = finding_with_context(base, commit=None, branch=None) + + assert tagged.repo.branch == "main" + assert tagged.repo.commit == "abc" + + +def test_finding_with_context_preserves_identity(): + base = _finding() + + tagged = finding_with_context(base, commit="abc", branch="main") + + assert tagged.finding_id == base.finding_id + + +NOW = dt.datetime(2026, 6, 16, tzinfo=dt.UTC) + + +def _ref(ref_name: str, sha: str) -> RefState: + return RefState( + repo_id="repo_x", + repo_url="https://example/r", + ref_name=ref_name, + last_seen_sha=sha, + updated_at=NOW, + ) + + +def test_branch_from_ref_variants(): + assert branch_from_ref("refs/heads/main") == "main" + assert branch_from_ref("refs/remotes/origin/main") == "main" + assert branch_from_ref("refs/remotes/origin/feat/x") == "feat/x" + assert branch_from_ref("refs/tags/v1") is None + assert branch_from_ref(None) is None + + +def test_residual_is_only_findings_at_latest_scanned_commit_per_branch(): + refs = [_ref("refs/heads/main", "S2"), _ref("refs/heads/feat", "F1")] + observations = [ + {"branch": "main", "commit": "S1", "findingId": "f_old"}, # stale -> excluded + {"branch": "main", "commit": "S2", "findingId": "f_a"}, + {"branch": "main", "commit": "S2", "findingId": "f_a"}, # dedup + {"branch": "feat", "commit": "F1", "findingId": "f_b"}, + {"branch": "feat", "commit": "F0", "findingId": "f_gone"}, # stale -> excluded + ] + + result = {r.branch: r for r in residual_by_branch(refs, observations)} + + assert result["main"] == BranchResidual( + branch="main", commit="S2", finding_ids=["f_a"] + ) + assert result["feat"].finding_ids == ["f_b"] + + +def test_residual_shared_commit_counts_for_every_ref_at_that_commit(): + # A commit is scanned once, so its observation is tagged with only the first + # ref's branch. 
Both refs pointing at that commit must still report it as + # residual (branch is occurrence, derived from the ref — not the obs label). + refs = [_ref("refs/heads/main", "C"), _ref("refs/heads/release", "C")] + observations = [{"branch": "main", "commit": "C", "findingId": "f_shared"}] + + result = {r.branch: r for r in residual_by_branch(refs, observations)} + + assert result["main"].finding_ids == ["f_shared"] + assert result["release"].finding_ids == ["f_shared"] + + +def test_residual_skips_tag_refs(): + refs = [_ref("refs/tags/v1", "T1")] + observations = [{"branch": None, "commit": "T1", "findingId": "x"}] + + assert residual_by_branch(refs, observations) == [] + + +def test_residual_branch_with_no_matching_observations_is_empty_list(): + refs = [_ref("refs/heads/main", "S9")] + observations = [{"branch": "main", "commit": "S1", "findingId": "f_old"}] + + result = residual_by_branch(refs, observations) + + assert result == [BranchResidual(branch="main", commit="S9", finding_ids=[])] diff --git a/tests/test_cli_scan_worker.py b/tests/test_cli_scan_worker.py index 9b5b676..217b525 100644 --- a/tests/test_cli_scan_worker.py +++ b/tests/test_cli_scan_worker.py @@ -190,3 +190,78 @@ def test_scan_worker_rejects_jsonl_backend(capsys): captured = capsys.readouterr() assert exit_code == 2 assert "dynamodb only" in captured.err + + +def test_scan_worker_daemon_wires_poll_interval_and_runs(monkeypatch, tmp_path, capsys): + from security_scanner.runtime.scan_worker import ScanWorkerDaemonSummary + + store = FakeWorkerStore() + scanner = FakeScanner() + _patch_worker(monkeypatch, store, scanner, tmp_path) + + captured_kwargs: dict = {} + + def _fake_daemon(request, **kwargs): + captured_kwargs.update(kwargs) + return ScanWorkerDaemonSummary(polls=3, leased=1, completed=1) + + monkeypatch.setattr("security_scanner.cli.app.run_scan_worker", _fake_daemon) + + exit_code = main( + [ + "scan-worker", + "--daemon", + "--poll-interval", + "0.5", + "--storage-backend", + 
"dynamodb", + ] + ) + + out = capsys.readouterr().out + assert exit_code == 0 + assert captured_kwargs["poll_interval_seconds"] == 0.5 + assert callable(captured_kwargs["should_continue"]) + assert "polls: 3" in out + assert "completed: 1" in out + + +def test_scan_worker_rejects_once_and_daemon_together(capsys): + exit_code = main(["scan-worker", "--once", "--daemon", "--storage-backend", "dynamodb"]) + captured = capsys.readouterr() + assert exit_code == 2 + assert "mutually exclusive" in captured.err + + +def test_scan_worker_daemon_exits_two_on_dead_letter(monkeypatch, tmp_path, capsys): + from security_scanner.runtime.scan_worker import ScanWorkerDaemonSummary + + store = FakeWorkerStore() + scanner = FakeScanner() + _patch_worker(monkeypatch, store, scanner, tmp_path) + monkeypatch.setattr( + "security_scanner.cli.app.run_scan_worker", + lambda request, **kwargs: ScanWorkerDaemonSummary(polls=1, dead_lettered=1), + ) + + exit_code = main(["scan-worker", "--daemon", "--storage-backend", "dynamodb"]) + + assert exit_code == 2 + assert "dead-lettered: 1" in capsys.readouterr().out + + +def test_install_signal_shutdown_sets_event_on_signal(): + import signal + + from security_scanner.cli.app import _install_signal_shutdown + + originals = {s: signal.getsignal(s) for s in (signal.SIGINT, signal.SIGTERM)} + try: + stop = _install_signal_shutdown() + assert not stop.is_set() + handler = signal.getsignal(signal.SIGTERM) + handler(signal.SIGTERM, None) # simulate delivery + assert stop.is_set() + finally: + for sig, original in originals.items(): + signal.signal(sig, original) diff --git a/tests/test_dynamodb_compatible_store.py b/tests/test_dynamodb_compatible_store.py index a57f322..310e188 100644 --- a/tests/test_dynamodb_compatible_store.py +++ b/tests/test_dynamodb_compatible_store.py @@ -6,9 +6,13 @@ from __future__ import annotations +import datetime as dt + import pytest from boto3.dynamodb.types import TypeDeserializer +from 
security_scanner.runtime.branch_residual import residual_for_repo + from security_scanner.core.finding.model import ( Finding, GitleaksFindingPayload, @@ -20,6 +24,7 @@ FindingReader, FindingStateEvent, FindingStore, + RefState, ScanResultWriter, StorageBootstrap, TargetScanResult, @@ -1287,3 +1292,63 @@ def test_factory_can_create_dynamodb_store(): ) assert isinstance(store, DynamoDbCompatibleFindingStore) + + +def test_write_scan_result_records_branch_on_scan_run_summary(): + from pathlib import Path + + from security_scanner.storage.base import TargetScanResult + + table = FakeDynamoTable() + store = DynamoDbCompatibleFindingStore( + DynamoDbCompatibleConfig(table_name="SecurityScannerLocal"), + resource=FakeDynamoResource(table), + client=FakeDynamoClient(), + ) + + store.write_scan_result( + TargetScanResult( + target_name="fake-org/fake-repo", + root=Path("/synthetic/fake-repo"), + enabled=True, + findings=[], + scan_run_id="scan_branchtest", + scan_at_iso="2026-06-16T00:00:00+00:00", + branch="main", + ) + ) + + runs = store.read_scan_runs_for_repo("fake-org/fake-repo") + assert len(runs) == 1 + assert runs[0].branch == "main" + + +def test_residual_for_repo_derives_per_branch_from_latest_commit(): + table = FakeDynamoTable() + store = DynamoDbCompatibleFindingStore( + DynamoDbCompatibleConfig(table_name="SecurityScannerLocal"), + resource=FakeDynamoResource(table), + client=FakeDynamoClient(), + ) + repo = "fake-org/fake-repo" + store.put_ref_state( + RefState( + repo_id=repo, + repo_url="https://example/r", + ref_name="refs/heads/main", + last_seen_sha="S2", + updated_at=dt.datetime(2026, 6, 16, tzinfo=dt.UTC), + ) + ) + residual = _make(repo_branch="main", repo_commit="S2", line_start=10) + stale = _make(repo_branch="main", repo_commit="S1", line_start=20) + for finding in (residual, stale): + for item in finding_to_items(finding): + table.put_item(Item=item) + + result = residual_for_repo(store, repo) + + assert len(result) == 1 + assert result[0].branch 
== "main" + assert result[0].commit == "S2" + assert result[0].finding_ids == [residual.finding_id] diff --git a/tests/test_incremental_scan_storage.py b/tests/test_incremental_scan_storage.py index 6cc8dc4..f74dd36 100644 --- a/tests/test_incremental_scan_storage.py +++ b/tests/test_incremental_scan_storage.py @@ -562,3 +562,47 @@ def test_ledger_present_leased_job_completes_without_rewriting_findings(): assert written_types == ["SCAN_JOB"] status = store.get_queue_status(now=NOW + dt.timedelta(minutes=10)) assert status.job_counts_by_status == {"completed": 1} + + +def _job_with_rule_pack(rule_pack: str) -> ScanJob: + base = _make_job() + return ScanJob( + **{ + **base.__dict__, + "rule_pack_version": rule_pack, + "job_id": scan_job_id_for( + repo_id=REPO_ID, + commit_sha=base.commit_sha, + scanner_name=SCANNER_NAME, + scanner_version=SCANNER_VERSION, + rule_pack_version=rule_pack, + scanner_config_hash=SCANNER_CONFIG_HASH, + ), + } + ) + + +def test_rule_pack_version_change_invalidates_ledger_and_triggers_rescan(): + # issue #12 criterion 3: same commit is NOT re-scanned for the same + # scanner/rule/config tuple, but a changed rule_pack_version is a new tuple + # and MUST re-scan. + store, table = _make_store() + v1 = _make_job() # rule_pack = RULE_PACK_VERSION (v1) + v2 = _job_with_rule_pack("secret-rules-0.2.0") + + # rule_pack is part of the content-addressed job id and ledger key. + assert v2.job_id != v1.job_id + assert v2.ledger_key != v1.ledger_key + + # v1 already completed -> its ledger exists. + table.put_item(Item=scan_ledger_entry_to_item(_make_ledger(v1))) + + # same commit at v1 is skipped (already scanned); v2 is a fresh tuple -> rescan. + assert store.enqueue_commit_scan_job(v1) is False + assert store.enqueue_commit_scan_job(v2) is True + # v2 enqueue is then idempotent. 
+ assert store.enqueue_commit_scan_job(v2) is False + + scan_jobs = [item for item in table.items if item["entityType"] == "SCAN_JOB"] + assert len(scan_jobs) == 1 + assert scan_jobs[0]["rulePackVersion"] == "secret-rules-0.2.0" diff --git a/tests/test_local_scan_git_context.py b/tests/test_local_scan_git_context.py new file mode 100644 index 0000000..78384ef --- /dev/null +++ b/tests/test_local_scan_git_context.py @@ -0,0 +1,63 @@ +"""Hermetic tests for local_scan git HEAD context (issue #12 M5).""" + +from __future__ import annotations + +import subprocess +from pathlib import Path + +from security_scanner.runtime.local_scan import _git_head_context + + +def _git(root: Path, *args: str) -> None: + subprocess.run( + ["git", "-C", str(root), *args], + check=True, + capture_output=True, + text=True, + ) + + +def _init_repo(root: Path) -> str: + root.mkdir(parents=True, exist_ok=True) + _git(root, "init", "-q") + _git(root, "config", "user.email", "t@example.com") + _git(root, "config", "user.name", "t") + _git(root, "checkout", "-q", "-b", "main") + (root / "f.txt").write_text("x\n", encoding="utf-8") + _git(root, "add", "f.txt") + _git(root, "commit", "-q", "-m", "init") + head = subprocess.run( + ["git", "-C", str(root), "rev-parse", "HEAD"], + check=True, + capture_output=True, + text=True, + ) + return head.stdout.strip() + + +def test_git_head_context_returns_branch_and_commit(tmp_path): + root = tmp_path / "repo" + sha = _init_repo(root) + + branch, commit = _git_head_context(root) + + assert branch == "main" + assert commit == sha + + +def test_git_head_context_detached_head_has_no_branch(tmp_path): + root = tmp_path / "repo" + sha = _init_repo(root) + _git(root, "checkout", "-q", sha) # detach + + branch, commit = _git_head_context(root) + + assert branch is None + assert commit == sha + + +def test_git_head_context_non_git_dir_is_tolerant(tmp_path): + branch, commit = _git_head_context(tmp_path / "not-a-repo") + + assert branch is None + assert commit is 
None diff --git a/tests/test_local_scan_runtime.py b/tests/test_local_scan_runtime.py index c2f5e4a..659782a 100644 --- a/tests/test_local_scan_runtime.py +++ b/tests/test_local_scan_runtime.py @@ -3,6 +3,7 @@ from __future__ import annotations import json +import subprocess from pathlib import Path from security_scanner.core.finding.model import Finding, GitleaksFindingPayload @@ -355,3 +356,69 @@ def test_run_local_scan_in_memory_manifest_wins_over_manifest_path(tmp_path): assert result.target_results[0].target_name == "demo-org/demo-repo" assert scanner.calls[0]["repo_full_name"] == "demo-org/demo-repo" + + +def _init_git_repo(root: Path) -> str: + root.mkdir(parents=True, exist_ok=True) + + def _git(*args: str) -> None: + subprocess.run( + ["git", "-C", str(root), *args], + check=True, + capture_output=True, + text=True, + ) + + _git("init", "-q") + _git("config", "user.email", "t@example.com") + _git("config", "user.name", "t") + _git("checkout", "-q", "-b", "main") + (root / "f.txt").write_text("x\n", encoding="utf-8") + _git("add", "f.txt") + _git("commit", "-q", "-m", "init") + head = subprocess.run( + ["git", "-C", str(root), "rev-parse", "HEAD"], + check=True, + capture_output=True, + text=True, + ) + return head.stdout.strip() + + +def test_run_local_scan_tags_findings_with_git_branch_and_records_summary(tmp_path): + repo = tmp_path / "repo" + sha = _init_git_repo(repo) + manifest = tmp_path / "targets.yaml" + _write_manifest(manifest, repo) + finding = Finding.create( + repo_full_name="demo-org/demo-repo", + rule_id="generic-api-key", + file_path="src/config.py", + line_start=10, + raw_secret="synthetic-test-secret", + source_tool="gitleaks", + scan_run_id="scan_test0001", + rule_pack_version="secret-rules-0.1.0", + ) + store = FakeStore() + scanner = FakeScanner([finding]) + + run_local_scan( + LocalScanRequest( + manifest_path=manifest, + output_destination="findings.jsonl", + storage_backend="jsonl", + ), + workspace=FakeWorkspace(root=repo), + 
scanner_factory=lambda _manifest: scanner, + store=store, + scan_run_id_factory=lambda: "scan_test0001", + now_factory=lambda: "2026-05-24T00:00:00+00:00", + ) + + result = store.scan_results[0] + assert result.branch == "main" + assert result.findings[0].repo.branch == "main" + assert result.findings[0].repo.commit == sha + # identity must remain stable after tagging (branch/commit are occurrence) + assert result.findings[0].finding_id == finding.finding_id diff --git a/tests/test_nosql_db_adapter.py b/tests/test_nosql_db_adapter.py index cb403cd..418b6f4 100644 --- a/tests/test_nosql_db_adapter.py +++ b/tests/test_nosql_db_adapter.py @@ -60,6 +60,36 @@ def test_items_exposes_finding_item_mapping(): assert items[2]["SK"] == "STATE#GLOBAL" +def test_observation_item_projects_branch_and_commit_top_level(): + finding = Finding.create( + repo_full_name="fake-org/fake-repo", + repo_branch="main", + repo_commit="c" * 40, + rule_id="generic-api-key", + file_path="src/config.py", + line_start=10, + raw_secret=FAKE_SECRET, + source_tool="gitleaks", + scan_run_id=SCAN_RUN_ID, + rule_pack_version=RULE_PACK, + ) + + observation = finding_to_items(finding)[1] + + assert observation["entityType"] == "FINDING_OBSERVATION" + # top-level (queryable), not only inside findingSnapshot (issue #12 L2) + assert observation["branch"] == "main" + assert observation["commit"] == "c" * 40 + + +def test_observation_item_omits_branch_commit_when_absent(): + observation = finding_to_items(_finding())[1] + + # without_none drops absent occurrence dimensions (back-compat) + assert "branch" not in observation + assert "commit" not in observation + + def test_access_query_all_pages_honors_limit(): class FakeTable: def __init__(self) -> None: diff --git a/tests/test_scan_worker.py b/tests/test_scan_worker.py index e0bf197..91fa497 100644 --- a/tests/test_scan_worker.py +++ b/tests/test_scan_worker.py @@ -9,6 +9,7 @@ from security_scanner.core.scan.options import ScanOptions from 
security_scanner.runtime.scan_worker import ( ScanWorkerRequest, + run_scan_worker, run_scan_worker_once, ) from security_scanner.storage.base import ScanJob, ScanLedgerEntry, ScanLedgerKey @@ -208,6 +209,41 @@ def test_one_pending_job_is_scanned_and_completed_with_commit_log_opts(): assert store.repo_release_calls == [(REPO_ID, "worker-a")] +def test_completed_finding_is_tagged_with_branch_and_commit_from_job(): + finding = _finding(commit=None) + store = FakeWorkerStore([_job()]) # ref_name="refs/remotes/origin/main" + scanner = FakeScanner(findings=[finding]) + + run_scan_worker_once(_request(store, scanner)) + + _, findings, _ = store.completed[0] + assert findings[0].repo.commit == COMMIT_SHA + assert findings[0].repo.branch == "main" + + +def test_branch_tag_preserves_stable_finding_identity(): + # branch/commit are occurrence, not identity: tagging must not change finding_id. + untagged = _finding(commit=None) + store = FakeWorkerStore([_job()]) + scanner = FakeScanner(findings=[_finding(commit=None)]) + + run_scan_worker_once(_request(store, scanner)) + + _, findings, _ = store.completed[0] + assert findings[0].finding_id == untagged.finding_id + + +def test_branch_derivation_from_nested_feature_ref(): + job = ScanJob(**{**_job().__dict__, "ref_name": "refs/heads/feat/x"}) + store = FakeWorkerStore([job]) + scanner = FakeScanner(findings=[_finding(commit=None)]) + + run_scan_worker_once(_request(store, scanner)) + + _, findings, _ = store.completed[0] + assert findings[0].repo.branch == "feat/x" + + def test_repo_lease_failure_returns_job_to_pending_without_scanner_or_attempt(): store = FakeWorkerStore([_job()]) store.repo_lease_available = False @@ -294,3 +330,59 @@ def test_ledger_present_job_completes_without_fetching_or_scanning(): assert fetch_calls == [] assert store.repo_lease_calls == [] assert store.completed[0][2].job_id == job.job_id + + +def test_daemon_drains_work_then_sleeps_when_idle(): + store = FakeWorkerStore([_job()]) + scanner = 
FakeScanner(findings=[_finding(commit=None)]) + sleeps: list[float] = [] + + summary = run_scan_worker( + _request(store, scanner), + poll_interval_seconds=5.0, + max_polls=3, + sleep=sleeps.append, + ) + + # poll 1 leases+completes (no sleep), poll 2 idle (sleeps), poll 3 idle final + # (no sleep after the last bounded poll). + assert summary.polls == 3 + assert summary.completed == 1 + assert summary.leased == 1 + assert sleeps == [5.0] + + +def test_daemon_stops_when_should_continue_returns_false(): + store = FakeWorkerStore([_job(), _job()]) + scanner = FakeScanner(findings=[_finding(commit=None)]) + calls = {"n": 0} + + def should_continue() -> bool: + calls["n"] += 1 + return calls["n"] <= 1 # allow exactly one poll + + summary = run_scan_worker( + _request(store, scanner), + poll_interval_seconds=1.0, + sleep=lambda _s: None, + should_continue=should_continue, + ) + + assert summary.polls == 1 + + +def test_daemon_does_not_sleep_after_final_bounded_poll(): + store = FakeWorkerStore() # always idle + scanner = FakeScanner() + sleeps: list[float] = [] + + summary = run_scan_worker( + _request(store, scanner), + poll_interval_seconds=3.0, + max_polls=2, + sleep=sleeps.append, + ) + + # idle each poll -> sleeps between polls but NOT after the last bounded poll. 
+ assert summary.polls == 2 + assert sleeps == [3.0] From dd392714ab5518d7fff68f2da885c70dc2c5e920 Mon Sep 17 00:00:00 2001 From: pureliture Date: Tue, 16 Jun 2026 22:29:40 +0900 Subject: [PATCH 2/3] review: address PR #22 feedback - residual_by_branch: index observations by commit (O(N+M) instead of O(N*M)) - finding_with_context: short-circuit when commit and branch are both None - _install_signal_shutdown: skip signal.signal off the main thread (avoids ValueError when not on main thread) - _ResidualStore Protocol: docstring stubs instead of `...` (CodeQL: statement has no effect) Co-Authored-By: Claude Opus 4.8 (1M context) --- src/security_scanner/cli/app.py | 9 ++++- .../runtime/branch_residual.py | 40 ++++++++++--------- 2 files changed, 29 insertions(+), 20 deletions(-) diff --git a/src/security_scanner/cli/app.py b/src/security_scanner/cli/app.py index a80fadb..f6f1aeb 100644 --- a/src/security_scanner/cli/app.py +++ b/src/security_scanner/cli/app.py @@ -455,8 +455,15 @@ def cmd_scan_worker(args: argparse.Namespace) -> int: def _install_signal_shutdown() -> threading.Event: - """Return an Event set on SIGINT/SIGTERM for graceful daemon shutdown.""" + """Return an Event set on SIGINT/SIGTERM for graceful daemon shutdown. + + signal.signal() may only be called from the main thread; off the main thread + (e.g. embedded in a worker thread or some test runners) we skip registration + and return an Event that the caller can still set manually. 
+ """ stop = threading.Event() + if threading.current_thread() is not threading.main_thread(): + return stop def _handler(_signum, _frame): # noqa: ANN001 - signal handler signature stop.set() diff --git a/src/security_scanner/runtime/branch_residual.py b/src/security_scanner/runtime/branch_residual.py index f4a0b42..f365b1a 100644 --- a/src/security_scanner/runtime/branch_residual.py +++ b/src/security_scanner/runtime/branch_residual.py @@ -44,6 +44,8 @@ def finding_with_context( None values never clobber existing context; identity (finding_id) is unaffected because branch/commit are not part of the fingerprint (L1). """ + if commit is None and branch is None: + return finding if finding.repo.commit == commit and finding.repo.branch == branch: return finding data = finding.to_dict() @@ -83,40 +85,40 @@ def residual_by_branch( skipped), ordered by branch name. A branch with no matching observation yields an empty ``finding_ids`` list. """ - obs_list = list(observations) + # Index observations by commit once (O(M)). Matching is on commit only: a + # commit is scanned once and its observation carries the first ref's branch, + # but a commit reachable from several refs is residual on every ref whose tip + # is that commit (branch is occurrence, derived from the ref). 
+ finding_ids_by_commit: dict[str, list[str]] = {} + for obs in observations: + commit = obs.get("commit") + finding_id = obs.get("findingId") + if commit is None or finding_id is None: + continue + finding_ids_by_commit.setdefault(commit, []).append(finding_id) + results: list[BranchResidual] = [] - for ref in ref_states: + for ref in ref_states: # O(N) branch = branch_from_ref(ref.ref_name) if branch is None: continue - seen: set[str] = set() - finding_ids: list[str] = [] - for obs in obs_list: - # Match on commit only: a commit is scanned once and its observation - # is tagged with the first ref's branch, but a commit reachable from - # several refs is residual on every ref whose tip is that commit - # (branch is occurrence, derived from the ref — not the obs label). - if obs.get("commit") != ref.last_seen_sha: - continue - finding_id = obs.get("findingId") - if finding_id is None or finding_id in seen: - continue - seen.add(finding_id) - finding_ids.append(finding_id) + ids = finding_ids_by_commit.get(ref.last_seen_sha, []) results.append( BranchResidual( branch=branch, commit=ref.last_seen_sha, - finding_ids=sorted(finding_ids), + finding_ids=sorted(set(ids)), ) ) return sorted(results, key=lambda r: r.branch) class _ResidualStore(Protocol): - def list_ref_states(self, repo_id: str) -> list[RefState]: ... + def list_ref_states(self, repo_id: str) -> list[RefState]: + """Return ref states for the repository.""" - def read_observations_for_repo(self, repo_id: str) -> list[Mapping[str, Any]]: ... 
+ def read_observations_for_repo(self, repo_id: str) -> list[Mapping[str, Any]]: + """Return observation records for the repository.""" def residual_for_repo(store: _ResidualStore, repo_id: str) -> list[BranchResidual]: From 91faa5b8c55d625505ee1194bb5f7f649a7f98a2 Mon Sep 17 00:00:00 2001 From: pureliture Date: Tue, 16 Jun 2026 22:40:01 +0900 Subject: [PATCH 3/3] feat(cli): add `residual` subcommand for per-branch residual (#12) Surfaces per-branch residual findings (residual_for_repo) via a read-only dynamodb-backed CLI, closing issue #12 criterion 6 visibility. The report generator operates on a single scan run and lacks REF_STATE/cross-ref context, so residual gets its own command (mirrors queue-status). GSI sharding for hot-partition at cloud scale is split to #23. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/security_scanner/cli/app.py | 51 ++++++++++++++++++++++++++++++++ tests/test_cli_residual.py | 52 +++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 tests/test_cli_residual.py diff --git a/src/security_scanner/cli/app.py b/src/security_scanner/cli/app.py index f6f1aeb..1fe50bb 100644 --- a/src/security_scanner/cli/app.py +++ b/src/security_scanner/cli/app.py @@ -71,6 +71,10 @@ run_scan_all, utc_now_iso, ) +from security_scanner.runtime.branch_residual import ( + BranchResidual, + residual_for_repo, +) from security_scanner.runtime.scan_worker import ( ScanWorkerDaemonSummary, ScanWorkerRequest, @@ -508,6 +512,40 @@ def cmd_queue_status(args: argparse.Namespace) -> int: return 0 +def cmd_residual(args: argparse.Namespace) -> int: + """Show per-branch residual findings for one repository (issue #12).""" + if args.storage_backend != "dynamodb": + print( + "error: residual supports --storage-backend dynamodb only", + file=sys.stderr, + ) + return 2 + + try: + store = _store_from_args(args) + residuals = residual_for_repo(store, args.repo) + except Exception as exc: # noqa: BLE001 - fatal storage/runtime error. 
+ print(f"error: residual failed: {exc}", file=sys.stderr) + return 1 + + print(_render_residual(args.repo, residuals), end="") + return 0 + + +def _render_residual(repo: str, residuals: list[BranchResidual]) -> str: + lines = [f"repo: {repo}"] + if not residuals: + lines.append(" (no branch residual — repo not incrementally scanned?)") + for residual in residuals: + lines.append( + f" branch {residual.branch} @ {residual.commit}: " + f"{len(residual.finding_ids)} residual finding(s)" + ) + for finding_id in residual.finding_ids: + lines.append(f" - {finding_id}") + return "\n".join(lines) + "\n" + + def cmd_doctor(args: argparse.Namespace) -> int: """Check local runtime dependencies and optional private SCM auth.""" result = run_doctor( @@ -1176,6 +1214,19 @@ def build_parser() -> argparse.ArgumentParser: _add_incremental_storage_args(queue_status_parser) queue_status_parser.set_defaults(func=cmd_queue_status) + residual_parser = subparsers.add_parser( + "residual", + help="Show per-branch residual findings for a repository.", + ) + residual_parser.add_argument( + "--repo", + required=True, + metavar="REPO_ID", + help="Repository id (incrementally-scanned repo_id) to report residual for.", + ) + _add_incremental_storage_args(residual_parser) + residual_parser.set_defaults(func=cmd_residual) + doctor_parser = subparsers.add_parser( "doctor", help="Check local binaries and optional private SCM auth.", diff --git a/tests/test_cli_residual.py b/tests/test_cli_residual.py new file mode 100644 index 0000000..7549087 --- /dev/null +++ b/tests/test_cli_residual.py @@ -0,0 +1,52 @@ +"""CLI tests for the residual subcommand (issue #12).""" + +from __future__ import annotations + +import datetime as dt + +from security_scanner.cli import main +from security_scanner.storage.base import RefState + +NOW = dt.datetime(2026, 6, 16, tzinfo=dt.UTC) + + +class FakeResidualStore: + def list_ref_states(self, repo_id: str) -> list[RefState]: + return [ + RefState( + repo_id=repo_id, + 
repo_url="https://example/r", + ref_name="refs/heads/main", + last_seen_sha="S2", + updated_at=NOW, + ) + ] + + def read_observations_for_repo(self, repo_id: str) -> list[dict]: + return [ + {"branch": "main", "commit": "S2", "findingId": "finding_residual"}, + {"branch": "main", "commit": "S1", "findingId": "finding_stale"}, + ] + + +def test_residual_cli_renders_per_branch_residual(monkeypatch, capsys): + monkeypatch.setattr( + "security_scanner.cli.app.create_finding_store", + lambda backend, **kwargs: FakeResidualStore(), + ) + + exit_code = main(["residual", "--repo", "repo_x", "--storage-backend", "dynamodb"]) + + out = capsys.readouterr().out + assert exit_code == 0 + assert "repo: repo_x" in out + assert "branch main @ S2: 1 residual finding(s)" in out + assert "finding_residual" in out + assert "finding_stale" not in out # stale commit is not residual + + +def test_residual_cli_rejects_jsonl_backend(capsys): + exit_code = main(["residual", "--repo", "repo_x", "--storage-backend", "jsonl"]) + + assert exit_code == 2 + assert "dynamodb only" in capsys.readouterr().err