-
Notifications
You must be signed in to change notification settings - Fork 0
feat(incremental): branch-aware residual + scan-worker daemon (#12) #22
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,138 @@ | ||
| """Per-branch residual computation for incremental scanning (issue #12). | ||
|
|
||
| Design (grill L1/L2): finding status/disposition is GLOBAL; branch is an | ||
| occurrence dimension only. "Residual on branch B" is DERIVED, not stored: a | ||
| finding is residual on B when it appears in an observation at B's latest scanned | ||
| commit (``RefState.last_seen_sha``). No new GSI — callers pass observations read | ||
| within the ``REPO#<repo>`` partition. | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| from collections.abc import Iterable, Mapping | ||
| from dataclasses import dataclass | ||
| from typing import Any, Protocol | ||
|
|
||
| from security_scanner.core.finding.model import Finding | ||
| from security_scanner.storage.base import RefState | ||
|
|
||
|
|
||
def branch_from_ref(ref_name: str | None) -> str | None:
    """Derive a short branch name from a git ref.

    Branch never enters finding identity (L1); the value returned here is
    only used as an occurrence dimension.

    Parameters
    ----------
    ref_name:
        A full ref (``refs/heads/<branch>``, ``refs/remotes/<remote>/<branch>``,
        ``refs/tags/<tag>``) or any other string, which is passed through
        unchanged. May be ``None`` or empty.

    Returns
    -------
    The branch name, or ``None`` when ``ref_name`` is missing, is a tag, or
    carries an empty branch name (e.g. ``refs/heads/``). Callers test the
    result with ``is None``, so an empty string must never be returned.
    """
    if not ref_name:
        return None
    if ref_name.startswith("refs/tags/"):
        # Tags are not branches.
        return None

    heads_prefix = "refs/heads/"
    remotes_prefix = "refs/remotes/"
    if ref_name.startswith(heads_prefix):
        branch = ref_name[len(heads_prefix):]
    elif ref_name.startswith(remotes_prefix):
        # Drop the remote segment: "origin/feature/x" -> "feature/x". When
        # there is no "/" after the remote, fall back to that lone segment.
        first, slash, remainder = ref_name[len(remotes_prefix):].partition("/")
        branch = remainder if slash else first
    else:
        # Not a recognised refs/ namespace: treat the input as the name itself.
        branch = ref_name

    # Normalise an empty name to None so it is skipped like any non-branch ref.
    return branch or None
|
|
||
|
|
||
def finding_with_context(
    finding: Finding, *, commit: str | None, branch: str | None
) -> Finding:
    """Tag a finding with scan-context commit/branch (occurrence, not identity).

    ``None`` values never clobber existing context. Per the design note (L1),
    branch/commit are not part of the fingerprint, so identity (finding_id)
    is expected to be unaffected.

    Parameters
    ----------
    finding:
        The finding to tag. Its ``repo.commit`` / ``repo.branch`` are compared
        with the requested values.
    commit:
        Commit to record, or ``None`` to keep the finding's current commit.
    branch:
        Branch to record, or ``None`` to keep the finding's current branch.

    Returns
    -------
    ``finding`` itself when no non-None argument differs from what it already
    carries; otherwise a new ``Finding`` rebuilt through
    ``to_dict()`` / ``Finding.from_dict()`` with the updated ``repo`` fields.
    """
    # Only a non-None argument that differs from the current value is a real
    # change. Comparing against the raw arguments would treat "commit=None,
    # branch already equal" as a change and force a pointless round trip.
    commit_changes = commit is not None and finding.repo.commit != commit
    branch_changes = branch is not None and finding.repo.branch != branch
    if not (commit_changes or branch_changes):
        return finding

    data = finding.to_dict()
    # Copy the nested mapping so the dict returned by to_dict() is not mutated.
    repo = dict(data["repo"])
    if commit is not None:
        repo["commit"] = commit
    if branch is not None:
        repo["branch"] = branch
    data["repo"] = repo
    return Finding.from_dict(data)
|
pureliture marked this conversation as resolved.
|
||
|
|
||
|
|
||
@dataclass(frozen=True)
class BranchResidual:
    """Residual findings for one branch, taken at its latest scanned commit.

    Instances are frozen: fields cannot be reassigned after construction.
    """

    # Short branch name (the ref with its refs/... prefix removed).
    branch: str
    # Commit the residual was evaluated at.
    commit: str
    # Ids of the findings observed at ``commit``.
    finding_ids: list[str]
|
|
||
|
|
||
def residual_by_branch(
    ref_states: Iterable[RefState],
    observations: Iterable[Mapping[str, Any]],
) -> list[BranchResidual]:
    """Derive the residual findings of every branch from its tip commit.

    Parameters
    ----------
    ref_states:
        Latest known ref states for one repository; each contributes its
        ``ref_name`` and ``last_seen_sha``.
    observations:
        Observation records read within the repository partition. Only the
        top-level ``commit`` and ``findingId`` keys are consulted; records
        missing either one are ignored.

    Returns
    -------
    One ``BranchResidual`` per ref that resolves to a branch (refs for which
    ``branch_from_ref`` gives ``None``, such as tags, are skipped), ordered by
    branch name. ``finding_ids`` is de-duplicated and sorted, and is empty
    when no observation matches the ref's ``last_seen_sha``.
    """
    # Pass 1: group finding ids under the commit they were observed at.
    # The observation's own branch is deliberately not used: a commit is
    # scanned once and tagged with the first ref's branch, yet it is residual
    # on every ref whose tip is that commit.
    ids_per_commit: dict[str, list[str]] = {}
    for record in observations:
        sha = record.get("commit")
        fid = record.get("findingId")
        if sha is not None and fid is not None:
            ids_per_commit.setdefault(sha, []).append(fid)

    # Pass 2: one lookup per ref, keyed by the ref's latest scanned commit.
    residuals: list[BranchResidual] = []
    for state in ref_states:
        name = branch_from_ref(state.ref_name)
        if name is not None:
            tip = state.last_seen_sha
            unique_ids = set(ids_per_commit.get(tip, []))
            residuals.append(
                BranchResidual(
                    branch=name,
                    commit=tip,
                    finding_ids=sorted(unique_ids),
                )
            )

    # Stable sort: refs that resolve to the same branch keep their input order.
    residuals.sort(key=lambda item: item.branch)
    return residuals
|
|
||
|
|
||
class _ResidualStore(Protocol):
    """Structural type for the two storage reads used by ``residual_for_repo``."""

    def list_ref_states(self, repo_id: str) -> list[RefState]:
        """Return ref states for the repository."""
        ...

    def read_observations_for_repo(self, repo_id: str) -> list[Mapping[str, Any]]:
        """Return observation records for the repository."""
        ...
|
|
||
|
|
||
def residual_for_repo(store: _ResidualStore, repo_id: str) -> list[BranchResidual]:
    """Compute the per-branch residual of one repository straight from storage.

    Loads the repository's ref states and then its observations from
    ``store`` and hands both to ``residual_by_branch``. Nothing is written
    back: status/disposition stays global (L1) and the result is a derived
    view.

    NOTE: meaningful only for incrementally-scanned repos, where REF_STATE
    rows and observation ``gsi1pk`` are both keyed by the same ``repo_id``
    (the scan-worker path sets ``repo_full_name == repo_id``). For
    local_scan-only repos there are no REF_STATE rows, so this returns an
    empty list.
    """
    # Ref states are read first, then observations.
    ref_states = store.list_ref_states(repo_id)
    observations = store.read_observations_for_repo(repo_id)
    return residual_by_branch(ref_states, observations)
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.