diff --git a/docs/views/source-scan-results-nosql-schema.md b/docs/views/source-scan-results-nosql-schema.md index 821ecc8..d18b578 100644 --- a/docs/views/source-scan-results-nosql-schema.md +++ b/docs/views/source-scan-results-nosql-schema.md @@ -20,9 +20,35 @@ Schema는 데이터 모양보다 질문에서 출발합니다. | --- | --- | --- | | `REPO_META` | 대상 repository의 현재 정보와 최근 스캔 요약 | 대상 목록 | | `SCAN_RUN` | 한 번의 스캔 실행 요약 | 실행 이력 | -| `FINDING` | 특정 실행에서 관측된 finding | 상세 검토 | +| `FINDING` | dedup된 finding identity | identity lookup | +| `FINDING_OBSERVATION` | 특정 scan run에서 관측된 finding snapshot | 상세 검토 | | `FINDING_STATE` | dedup된 finding의 lifecycle/triage 상태 | 재검토와 gate | -| `EVAL_RUN` | synthetic corpus 평가 결과 | 품질 추적 | + +## CORE item shape + +현재 구현 범위는 CORE row만 다룹니다. + +| Entity | PK | SK | 핵심 내용 | +| --- | --- | --- | --- | +| `REPO_META` | `REPO#` | `META` | repository metadata와 최근 스캔 요약 | +| `SCAN_RUN` | `REPO#` | `SCAN_RUN##` | scan run summary와 artifact pointer | +| `FINDING` | `FINDING#` | `META` | repo, rule, source tool, location, fingerprint 같은 identity field | +| `FINDING_OBSERVATION` | `RUN#` | `OBS##` | run-scoped finding snapshot | +| `FINDING_STATE` | `FINDING#` | `STATE#GLOBAL` | status와 triage lifecycle state | + +`FINDING` identity row에는 scanner evidence snapshot이나 triage state를 넣지 않습니다. +Scan run별 evidence snapshot은 `FINDING_OBSERVATION`에 두고, runtime read는 +observation snapshot에 `FINDING_STATE`를 overlay해서 `Finding`을 복원합니다. + +`occurrenceKey`는 redacted canonical observation identity의 deterministic hash입니다. +재료는 `repo`, `ruleId`, `sourceTool`, `file`, `startLine`, `fingerprint`를 기본으로 +하고, redacted fallback으로 `secretHash`, `matchHash`를 사용할 수 있습니다. Raw +secret이나 raw match 문자열은 occurrence key material에 넣지 않습니다. + +`FINDING_STATE`는 현재 `GLOBAL` scope만 사용합니다. Scan write는 state row가 없을 +때만 default state를 만들고, 이미 존재하는 manual triage verdict/verifier/reason을 +blind overwrite하지 않습니다. Observation write는 state와 분리되어 idempotent하게 +처리합니다. ## 조회 기준 @@ -30,8 +56,8 @@ Schema는 데이터 모양보다 질문에서 출발합니다. | --- | --- | | 최근 대상 목록 | repo list index를 page 단위로 조회 | | 대상별 스캔 이력 | 대상 partition에서 scan run만 조회 | -| 특정 실행의 finding | scan run partition에서 finding 조회 | -| finding 상태 | finding별 state item 조회 | +| 특정 실행의 finding | scan run partition에서 `OBS#` item 조회 | +| finding 상태 | finding별 `STATE#GLOBAL` item 조회 | | report/gate 판단 | finding snapshot에 lifecycle state를 merge한 뒤 계산 | ## 안전 규칙 @@ -42,6 +68,16 @@ Schema는 데이터 모양보다 질문에서 출발합니다. - 실제 외부 export, 비공개 finding, DB dump는 이 저장소 밖에 둡니다. - TTL, streams, transaction, 운영 DynamoDB behavior는 현재 기본 요구사항이 아닙니다. +## 현재 non-goals + +다음 row/table은 CORE schema split 범위가 아닙니다. + +- `FindingFingerprintMap` +- `ScanRunQueryRows` +- `PatternQueryRows` +- standalone Artifacts table/item +- TTL, streams, Lambda, production DynamoDB behavior + ## 로컬 실행 환경 Dynalite는 로컬 검증 후보입니다. DynamoDB Local과 LocalStack은 parity 또는 adapter integration을 확인할 때 검토할 수 있지만, 현재 기본 운영 결정은 아닙니다. diff --git a/src/security_scanner/storage/adapters/nosql_db/access.py b/src/security_scanner/storage/adapters/nosql_db/access.py index 695f89c..1fd628d 100644 --- a/src/security_scanner/storage/adapters/nosql_db/access.py +++ b/src/security_scanner/storage/adapters/nosql_db/access.py @@ -16,10 +16,15 @@ def items_to_findings(items: Iterable[dict[str, Any]]) -> list[Finding]: - """Return Finding objects from run-scoped finding items.""" + """Return Finding objects from observation items.""" findings: list[Finding] = [] for item in items: - if item.get("entityType") == "FINDING" and "finding" in item: + if ( + item.get("entityType") == "FINDING_OBSERVATION" + and "findingSnapshot" in item + ): + findings.append(Finding.from_dict(item["findingSnapshot"])) + elif item.get("entityType") == "FINDING" and "finding" in item: findings.append(Finding.from_dict(item["finding"])) return findings diff --git a/src/security_scanner/storage/adapters/nosql_db/items.py b/src/security_scanner/storage/adapters/nosql_db/items.py index 5de5637..03fe3cf 100644 --- a/src/security_scanner/storage/adapters/nosql_db/items.py +++ b/src/security_scanner/storage/adapters/nosql_db/items.py @@ -3,6 +3,8 @@ from __future__ import annotations import datetime as dt +import hashlib +import json from collections import Counter from dataclasses import dataclass, field from typing import Any, Iterable @@ -161,37 +163,92 @@ def scan_target_to_item(target: ScanTarget) -> dict[str, Any]: def scan_target_from_item(item: dict[str, Any]) -> ScanTarget: """Reconstruct a scan target from a table item.""" - return ScanTarget(url=item["url"], name=item["name"], enabled=bool(item.get("enabled", True))) + return ScanTarget( + url=item["url"], + name=item["name"], + enabled=bool(item.get("enabled", True)), + ) + + +STATE_SCOPE_GLOBAL = "GLOBAL" + + +def occurrence_key_for_finding(finding: Finding) -> str: + """Return a deterministic redacted observation occurrence key.""" + material: dict[str, Any] = { + "repo": finding.repo.full_name, + "ruleId": finding.rule_id, + "sourceTool": finding.source_tool, + "file": finding.location.file_path, + "startLine": finding.location.line_start, + "fingerprint": finding.fingerprint, + } + if not finding.fingerprint: + material["secretHash"] = finding.evidence.secret_hash + if finding.gitleaks and finding.gitleaks.match: + material["matchHash"] = hashlib.sha256( + finding.gitleaks.match.encode("utf-8") + ).hexdigest() + encoded = json.dumps(without_none(material), sort_keys=True, separators=(",", ":")) + digest = hashlib.sha256(encoded.encode("utf-8")).hexdigest()[:32] + return f"occ_{digest}" def finding_to_items(finding: Finding) -> list[dict[str, Any]]: - """Map one Finding into run-scoped and lifecycle state table items.""" + """Map one Finding into identity, observation, and lifecycle state items.""" now = now_iso() finding_id = finding.finding_id repo = finding.repo.full_name run_id = finding.scan.scan_run_id rule_id = finding.rule_id + occurrence_key = occurrence_key_for_finding(finding) - run_sort = f"FINDING#{repo}#{finding_id}" - run_item = { - "PK": f"RUN#{run_id}", - "SK": run_sort, + identity_item = { + "PK": f"FINDING#{finding_id}", + "SK": "META", "entityType": "FINDING", "gsi1pk": f"REPO#{repo}", - "gsi1sk": f"RUN#{run_id}#{run_sort}", + "gsi1sk": f"FINDING#{finding_id}", + "gsi2pk": f"RULE#{rule_id}", + "gsi2sk": f"FINDING#{finding_id}", + "createdAt": now, + "updatedAt": now, + "findingId": finding_id, + "repo": repo, + "ruleId": rule_id, + "sourceTool": finding.source_tool, + "sourceToolVersion": finding.source_tool_version, + "category": finding.category, + "severity": finding.severity, + "confidence": finding.confidence, + "file": finding.location.file_path, + "startLine": finding.location.line_start, + "fingerprint": finding.fingerprint, + } + observation_item = { + "PK": f"RUN#{run_id}", + "SK": f"OBS#{finding_id}#{occurrence_key}", + "entityType": "FINDING_OBSERVATION", + "gsi1pk": f"REPO#{repo}", + "gsi1sk": f"RUN#{run_id}#OBS#{finding_id}#{occurrence_key}", "gsi2pk": f"RULE#{rule_id}", - "gsi2sk": f"RUN#{run_id}#{repo}#{finding_id}", + "gsi2sk": f"RUN#{run_id}#{repo}#{finding_id}#{occurrence_key}", "createdAt": now, "updatedAt": now, "findingId": finding_id, + "scanRunId": run_id, + "occurrenceKey": occurrence_key, "repo": repo, "ruleId": rule_id, + "sourceTool": finding.source_tool, + "file": finding.location.file_path, + "startLine": finding.location.line_start, "fingerprint": finding.fingerprint, - "finding": finding.to_dict(), + "findingSnapshot": finding.to_dict(), } state_item = { "PK": f"FINDING#{finding_id}", - "SK": "STATE", + "SK": f"STATE#{STATE_SCOPE_GLOBAL}", "entityType": "FINDING_STATE", "gsi1pk": f"REPO#{repo}", "gsi1sk": f"FINDING#{finding_id}", @@ -200,13 +257,18 @@ def finding_to_items(finding: Finding) -> list[dict[str, Any]]: "createdAt": now, "updatedAt": now, "findingId": finding_id, + "stateScopeKey": STATE_SCOPE_GLOBAL, "repo": repo, "ruleId": rule_id, "fingerprint": finding.fingerprint, "status": finding.status, "triage": finding.triage.to_dict(), } - return [run_item, state_item] + return [ + without_none(identity_item), + without_none(observation_item), + without_none(state_item), + ] def scan_date(scan_at_iso: str) -> str: diff --git a/src/security_scanner/storage/adapters/nosql_db/store.py b/src/security_scanner/storage/adapters/nosql_db/store.py index df6405d..0c4a68d 100644 --- a/src/security_scanner/storage/adapters/nosql_db/store.py +++ b/src/security_scanner/storage/adapters/nosql_db/store.py @@ -18,6 +18,7 @@ ) from security_scanner.storage.adapters.nosql_db.items import ( RepoMetadata, + STATE_SCOPE_GLOBAL, ScanRunSummary, counts_by_category, finding_to_items, @@ -79,7 +80,10 @@ def ensure_table(self) -> None: def append(self, finding: Finding) -> None: for item in finding_to_items(finding): - self._table.put_item(Item=item) + if item.get("entityType") == "FINDING_STATE": + self._put_state_item_if_absent(item) + else: + self._table.put_item(Item=item) def extend(self, findings: Iterable[Finding]) -> None: for finding in findings: @@ -183,15 +187,13 @@ def read_for_scan_run(self, scan_run_id: str) -> list[Finding]: KeyConditionExpression="PK = :pk AND begins_with(SK, :sk_prefix)", ExpressionAttributeValues={ ":pk": f"RUN#{scan_run_id}", - ":sk_prefix": "FINDING#", + ":sk_prefix": "OBS#", }, ) findings = items_to_findings(items) - state_by_id = { - finding.finding_id: state - for finding in findings - if (state := self.read_finding_state(finding.finding_id)) is not None - } + state_by_id = self._batch_read_finding_states( + finding.finding_id for finding in findings + ) return merge_finding_states(findings, state_by_id) def read_finding_state(self, finding_id: str) -> dict[str, Any] | None: @@ -199,7 +201,7 @@ def read_finding_state(self, finding_id: str) -> dict[str, Any] | None: KeyConditionExpression="PK = :pk AND begins_with(SK, :sk_prefix)", ExpressionAttributeValues={ ":pk": f"FINDING#{finding_id}", - ":sk_prefix": "STATE", + ":sk_prefix": "STATE#", }, Limit=1, ) @@ -210,7 +212,7 @@ def read_all(self) -> list[Finding]: finding_items = scan_all_pages( self._table, FilterExpression="entityType = :entity_type", - ExpressionAttributeValues={":entity_type": "FINDING"}, + ExpressionAttributeValues={":entity_type": "FINDING_OBSERVATION"}, ) state_items = scan_all_pages( self._table, @@ -227,3 +229,52 @@ def clear(self) -> None: "DynamoDbCompatibleFindingStore refuses destructive clear(); " "delete/recreate the local table explicitly instead" ) + + def _put_state_item_if_absent(self, item: dict[str, Any]) -> None: + """Create lifecycle state only when manual triage has no row yet.""" + try: + self._table.put_item( + Item=item, + ConditionExpression="attribute_not_exists(PK) AND attribute_not_exists(SK)", + ) + except Exception as exc: + if _is_conditional_check_failure(exc): + return + raise + + def _batch_read_finding_states( + self, + finding_ids: Iterable[str], + ) -> dict[str, dict[str, Any]]: + """Fetch lifecycle state rows without one query per finding.""" + unique_finding_ids = list(dict.fromkeys(finding_ids)) + state_by_id: dict[str, dict[str, Any]] = {} + for start in range(0, len(unique_finding_ids), 100): + keys = [ + { + "PK": f"FINDING#{finding_id}", + "SK": f"STATE#{STATE_SCOPE_GLOBAL}", + } + for finding_id in unique_finding_ids[start : start + 100] + ] + request_items = {self.config.table_name: {"Keys": keys}} + while request_items: + response = self._resource.batch_get_item(RequestItems=request_items) + for item in response.get("Responses", {}).get( + self.config.table_name, + [], + ): + if item.get("entityType") == "FINDING_STATE": + state_by_id[item["findingId"]] = item + request_items = response.get("UnprocessedKeys", {}) + return state_by_id + + +def _is_conditional_check_failure(exc: Exception) -> bool: + """Return True for DynamoDB conditional-write conflicts.""" + response = getattr(exc, "response", None) + if isinstance(response, dict): + error = response.get("Error", {}) + if error.get("Code") == "ConditionalCheckFailedException": + return True + return exc.__class__.__name__ == "ConditionalCheckFailedException" diff --git a/tests/test_dynamodb_compatible_store.py b/tests/test_dynamodb_compatible_store.py index e3dc42b..9ce1c46 100644 --- a/tests/test_dynamodb_compatible_store.py +++ b/tests/test_dynamodb_compatible_store.py @@ -56,14 +56,24 @@ def _make(**overrides) -> Finding: class FakeDynamoTable: + class ConditionalCheckFailedException(Exception): + pass + def __init__(self, items: list[dict] | None = None) -> None: self.put_calls: list[dict] = [] + self.put_call_kwargs: list[dict] = [] self.query_calls: list[dict] = [] self.scan_calls: list[dict] = [] self.wait_calls = 0 self.items = items or [] - def put_item(self, *, Item: dict) -> dict: + def put_item(self, *, Item: dict, **kwargs) -> dict: + self.put_call_kwargs.append({"Item": Item, **kwargs}) + if kwargs.get("ConditionExpression") and any( + item.get("PK") == Item["PK"] and item.get("SK") == Item["SK"] + for item in self.items + ): + raise self.ConditionalCheckFailedException("conditional check failed") self.put_calls.append(Item) self.items.append(Item) return {"ResponseMetadata": {"HTTPStatusCode": 200}} @@ -132,11 +142,25 @@ class FakeDynamoResource: def __init__(self, table: FakeDynamoTable) -> None: self.table = table self.table_names: list[str] = [] + self.batch_get_calls: list[dict] = [] def Table(self, table_name: str) -> FakeDynamoTable: # noqa: N802 - boto3 API self.table_names.append(table_name) return self.table + def batch_get_item(self, *, RequestItems: dict) -> dict: # noqa: N802 - boto3 API + self.batch_get_calls.append(RequestItems) + responses = {} + for table_name, request in RequestItems.items(): + keys = request["Keys"] + responses[table_name] = [ + item + for key in keys + for item in self.table.items + if item.get("PK") == key["PK"] and item.get("SK") == key["SK"] + ] + return {"Responses": responses} + class FakeDynamoClient: class exceptions: @@ -176,7 +200,7 @@ def test_build_table_schema_matches_single_table_keys_and_indexes(): } == {"PK", "SK", "gsi1pk", "gsi1sk", "gsi2pk", "gsi2sk"} -def test_finding_to_items_writes_run_item_and_state_item_with_gitleaks_payload(): +def test_finding_to_items_writes_core_identity_observation_and_state_items(): finding = _make( triage_verdict=Verdict.TRUE_POSITIVE.value, gitleaks=GitleaksFindingPayload( @@ -191,18 +215,56 @@ def test_finding_to_items_writes_run_item_and_state_item_with_gitleaks_payload() items = finding_to_items(finding) - assert [item["entityType"] for item in items] == ["FINDING", "FINDING_STATE"] - run_item = items[0] - state_item = items[1] - assert run_item["PK"] == f"RUN#{SCAN_RUN_ID}" - assert run_item["SK"] == f"FINDING#{finding.repo.full_name}#{finding.finding_id}" - assert run_item["gsi1pk"] == f"REPO#{finding.repo.full_name}" - assert run_item["gsi2pk"] == f"RULE#{finding.rule_id}" - assert run_item["finding"]["findingId"] == finding.finding_id - assert run_item["finding"]["gitleaks"]["secret"] == FAKE_SECRET - assert run_item["finding"]["gitleaks"]["match"] == f"token={FAKE_SECRET}" + assert [item["entityType"] for item in items] == [ + "FINDING", + "FINDING_OBSERVATION", + "FINDING_STATE", + ] + identity_item = items[0] + observation_item = items[1] + state_item = items[2] + assert identity_item["PK"] == f"FINDING#{finding.finding_id}" + assert identity_item["SK"] == "META" + assert identity_item["gsi1pk"] == f"REPO#{finding.repo.full_name}" + assert identity_item["gsi2pk"] == f"RULE#{finding.rule_id}" + assert identity_item["sourceTool"] == "gitleaks" + assert "sourceToolVersion" not in identity_item + assert "finding" not in identity_item + assert "findingSnapshot" not in identity_item + assert "triage" not in identity_item + assert FAKE_SECRET not in str(identity_item) + assert observation_item["PK"] == f"RUN#{SCAN_RUN_ID}" + assert observation_item["SK"].startswith(f"OBS#{finding.finding_id}#occ_") + assert observation_item["entityType"] == "FINDING_OBSERVATION" + assert observation_item["findingSnapshot"]["findingId"] == finding.finding_id + assert observation_item["findingSnapshot"]["gitleaks"]["secret"] == FAKE_SECRET + assert observation_item["findingSnapshot"]["gitleaks"]["match"] == ( + f"token={FAKE_SECRET}" + ) + assert FAKE_SECRET not in observation_item["occurrenceKey"] + assert f"token={FAKE_SECRET}" not in observation_item["occurrenceKey"] + repeat_observation_item = finding_to_items(finding)[1] + assert repeat_observation_item["occurrenceKey"] == observation_item["occurrenceKey"] + same_fingerprint_data = finding.to_dict() + same_fingerprint_data["evidence"]["secretHash"] = "salted-sha256:different" + same_fingerprint_data["gitleaks"]["match"] = "token=AKIAFAKEEXAMPLE111111" + same_fingerprint_observation = finding_to_items( + Finding.from_dict(same_fingerprint_data) + )[1] + assert same_fingerprint_observation["occurrenceKey"] == observation_item[ + "occurrenceKey" + ] assert state_item["PK"] == f"FINDING#{finding.finding_id}" - assert state_item["SK"] == "STATE" + assert state_item["SK"] == "STATE#GLOBAL" + assert state_item["stateScopeKey"] == "GLOBAL" + assert {item["entityType"] for item in items}.isdisjoint( + { + "FindingFingerprintMap", + "ScanRunQueryRows", + "PatternQueryRows", + "ARTIFACT", + } + ) def test_repo_metadata_to_item_keeps_runtime_metadata_and_repo_list_index(): @@ -329,14 +391,15 @@ def test_write_scan_result_persists_findings_metadata_and_summary(tmp_path): assert [item["entityType"] for item in table.put_calls] == [ "FINDING", + "FINDING_OBSERVATION", "FINDING_STATE", "REPO_META", "SCAN_RUN", ] - assert table.put_calls[2]["repoKey"] == "fake-org/fake-repo" - assert table.put_calls[2]["latestCountsTotal"] == 1 - assert table.put_calls[3]["scanRunId"] == SCAN_RUN_ID - assert table.put_calls[3]["countsTotal"] == 1 + assert table.put_calls[3]["repoKey"] == "fake-org/fake-repo" + assert table.put_calls[3]["latestCountsTotal"] == 1 + assert table.put_calls[4]["scanRunId"] == SCAN_RUN_ID + assert table.put_calls[4]["countsTotal"] == 1 def test_append_puts_finding_and_state_items(): @@ -351,8 +414,51 @@ def test_append_puts_finding_and_state_items(): assert [item["entityType"] for item in table.put_calls] == [ "FINDING", + "FINDING_OBSERVATION", "FINDING_STATE", ] + assert table.put_call_kwargs[2]["ConditionExpression"] == ( + "attribute_not_exists(PK) AND attribute_not_exists(SK)" + ) + + +def test_append_does_not_overwrite_existing_manual_finding_state(): + original = _make(triage_verdict=Verdict.NEEDS_REVIEW.value) + _, _, existing_state = finding_to_items(original) + existing_state = { + **existing_state, + "status": Status.FALSE_POSITIVE.value, + "triage": { + "verdict": Verdict.FALSE_POSITIVE.value, + "verifier": "synthetic-reviewer", + "reason": "manual triage is protected", + }, + } + table = FakeDynamoTable([existing_state]) + store = DynamoDbCompatibleFindingStore( + DynamoDbCompatibleConfig(table_name="SecurityScannerLocal"), + resource=FakeDynamoResource(table), + client=FakeDynamoClient(), + ) + scan_finding = _make( + triage_verdict=Verdict.TRUE_POSITIVE.value, + triage_verifier="synthetic-scan", + triage_reason="scan default must not overwrite manual state", + ) + + store.append(scan_finding) + + state_items = [ + item + for item in table.items + if item.get("PK") == f"FINDING#{scan_finding.finding_id}" + and item.get("SK") == "STATE#GLOBAL" + ] + assert state_items == [existing_state] + assert [item["entityType"] for item in table.put_calls] == [ + "FINDING", + "FINDING_OBSERVATION", + ] def test_store_puts_repo_metadata_and_scan_run_summary(): @@ -611,13 +717,16 @@ def test_read_for_scan_run_roundtrips_only_findings_for_that_run(): finding = _make() other_run = _make(scan_run_id="scan_other1", line_start=11) table = FakeDynamoTable(finding_to_items(finding) + finding_to_items(other_run)) + resource = FakeDynamoResource(table) store = DynamoDbCompatibleFindingStore( DynamoDbCompatibleConfig(table_name="SecurityScannerLocal"), - resource=FakeDynamoResource(table), + resource=resource, client=FakeDynamoClient(), ) assert store.read_for_scan_run(SCAN_RUN_ID) == [finding] + assert len(table.query_calls) == 1 + assert len(resource.batch_get_calls) == 1 def test_read_for_scan_run_paginates_query_results(): @@ -630,13 +739,13 @@ def __init__(self) -> None: self.run_query_calls: list[dict] = [] self.run_pages = [ { - "Items": [finding_to_items(first)[0]], + "Items": [finding_to_items(first)[1]], "LastEvaluatedKey": { "PK": "RUN#scan_m2test1", - "SK": "FINDING#page-break", + "SK": "OBS#page-break", }, }, - {"Items": [finding_to_items(second)[0]]}, + {"Items": [finding_to_items(second)[1]]}, ] def query(self, **kwargs) -> dict: @@ -658,14 +767,41 @@ def query(self, **kwargs) -> dict: return page table = PaginatedRunAndStateTable() + resource = FakeDynamoResource(table) store = DynamoDbCompatibleFindingStore( DynamoDbCompatibleConfig(table_name="SecurityScannerLocal"), - resource=FakeDynamoResource(table), + resource=resource, client=FakeDynamoClient(), ) assert store.read_for_scan_run(SCAN_RUN_ID) == [first, second] assert len(table.run_query_calls) == 2 + assert len(resource.batch_get_calls) == 1 + + +def test_read_for_scan_run_batches_state_lookup_in_chunks_of_100(): + findings = [_make(line_start=line_start) for line_start in range(10, 111)] + table = FakeDynamoTable( + [item for finding in findings for item in finding_to_items(finding)] + ) + resource = FakeDynamoResource(table) + store = DynamoDbCompatibleFindingStore( + DynamoDbCompatibleConfig(table_name="SecurityScannerLocal"), + resource=resource, + client=FakeDynamoClient(), + ) + + read_findings = store.read_for_scan_run(SCAN_RUN_ID) + + assert {finding.finding_id for finding in read_findings} == { + finding.finding_id for finding in findings + } + assert len(table.query_calls) == 1 + assert len(resource.batch_get_calls) == 2 + assert [ + len(call["SecurityScannerLocal"]["Keys"]) + for call in resource.batch_get_calls + ] == [100, 1] def test_read_all_scans_finding_items_only(): @@ -701,7 +837,7 @@ def test_read_all_merges_lifecycle_state_into_finding_snapshot(): fingerprint="gitleaks-fp-001", ), ) - run_item, state_item = finding_to_items(finding) + _, observation_item, state_item = finding_to_items(finding) state_item = { **state_item, "status": Status.RESOLVED.value, @@ -711,7 +847,7 @@ def test_read_all_merges_lifecycle_state_into_finding_snapshot(): "reason": "synthetic test fixture", }, } - table = FakeDynamoTable([run_item, state_item]) + table = FakeDynamoTable([observation_item, state_item]) store = DynamoDbCompatibleFindingStore( DynamoDbCompatibleConfig(table_name="SecurityScannerLocal"), resource=FakeDynamoResource(table), @@ -732,7 +868,7 @@ def test_read_all_merges_lifecycle_state_into_finding_snapshot(): def test_read_all_paginates_scan_results(): first = _make(line_start=10) second = _make(line_start=11) - page_key = {"PK": "RUN#scan_m2test1", "SK": "FINDING#page-break"} + page_key = {"PK": "RUN#scan_m2test1", "SK": "OBS#page-break"} class PaginatedScanTable(FakeDynamoTable): def scan(self, **kwargs) -> dict: @@ -740,14 +876,14 @@ def scan(self, **kwargs) -> dict: entity_type = kwargs["ExpressionAttributeValues"][":entity_type"] if entity_type == "FINDING_STATE": return {"Items": []} - assert entity_type == "FINDING" + assert entity_type == "FINDING_OBSERVATION" if "ExclusiveStartKey" not in kwargs: return { - "Items": [finding_to_items(first)[0]], + "Items": [finding_to_items(first)[1]], "LastEvaluatedKey": page_key, } assert kwargs["ExclusiveStartKey"] == page_key - return {"Items": [finding_to_items(second)[0]]} + return {"Items": [finding_to_items(second)[1]]} table = PaginatedScanTable() store = DynamoDbCompatibleFindingStore( @@ -815,7 +951,8 @@ def test_read_finding_state_returns_lifecycle_state_item(): assert state is not None assert state["entityType"] == "FINDING_STATE" assert state["PK"] == f"FINDING#{finding.finding_id}" - assert state["SK"] == "STATE" + assert state["SK"] == "STATE#GLOBAL" + assert state["stateScopeKey"] == "GLOBAL" assert state["status"] == finding.status assert state["triage"]["verdict"] == Verdict.TRUE_POSITIVE.value diff --git a/tests/test_nosql_db_adapter.py b/tests/test_nosql_db_adapter.py index 9f621f9..cb403cd 100644 --- a/tests/test_nosql_db_adapter.py +++ b/tests/test_nosql_db_adapter.py @@ -49,8 +49,15 @@ def test_transport_exposes_config_and_table_schema(): def test_items_exposes_finding_item_mapping(): items = finding_to_items(_finding()) - assert [item["entityType"] for item in items] == ["FINDING", "FINDING_STATE"] - assert items[0]["PK"] == f"RUN#{SCAN_RUN_ID}" + assert [item["entityType"] for item in items] == [ + "FINDING", + "FINDING_OBSERVATION", + "FINDING_STATE", + ] + assert items[0]["SK"] == "META" + assert items[1]["PK"] == f"RUN#{SCAN_RUN_ID}" + assert items[1]["SK"].startswith(f"OBS#{items[0]['findingId']}#occ_") + assert items[2]["SK"] == "STATE#GLOBAL" def test_access_query_all_pages_honors_limit():