diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..1c80c9f --- /dev/null +++ b/.dockerignore @@ -0,0 +1,51 @@ +.git +.gitignore +.dockerignore + +private +targets.local.yaml +targets.local.yml +*.local.yaml +*.local.yml + +.env +.env.* +*.pem +*.key +*.pfx +credentials +credentials.json +*.tfvars + +.venv +venv +.uv +__pycache__ +.pytest_cache +.mypy_cache +.ruff_cache +coverage +.coverage +htmlcov +dist +build +*.egg-info + +findings +reports +graphify-out +graphify* +*graphify* +.scanner +workspaces +*.log +*.db +*.sqlite +*.sarif +gitleaks-report.json +*.report.json + +.DS_Store +.idea +.vscode +.worktrees diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..68203df --- /dev/null +++ b/Dockerfile @@ -0,0 +1,53 @@ +FROM python:3.13-slim + +ARG GH_VERSION=2.94.0 +ARG GLAB_VERSION=1.102.0 +ARG GITLEAKS_VERSION=8.24.0 +ARG TARGETARCH + +ENV PYTHONUNBUFFERED=1 \ + UV_SYSTEM_PYTHON=1 + +WORKDIR /app + +RUN apt-get update \ + && apt-get install -y --no-install-recommends ca-certificates curl git \ + && rm -rf /var/lib/apt/lists/* + +RUN set -eux; \ + arch="${TARGETARCH:-$(dpkg --print-architecture)}"; \ + case "$arch" in \ + amd64) gh_arch="amd64"; glab_arch="amd64"; gitleaks_arch="x64" ;; \ + arm64) gh_arch="arm64"; glab_arch="arm64"; gitleaks_arch="arm64" ;; \ + *) echo "unsupported gitleaks architecture: $arch" >&2; exit 1 ;; \ + esac; \ + curl -fsSL \ + "https://github.com/cli/cli/releases/download/v${GH_VERSION}/gh_${GH_VERSION}_linux_${gh_arch}.tar.gz" \ + -o /tmp/gh.tar.gz; \ + tar -xzf /tmp/gh.tar.gz -C /tmp; \ + mv "/tmp/gh_${GH_VERSION}_linux_${gh_arch}/bin/gh" /usr/local/bin/gh; \ + rm -rf /tmp/gh.tar.gz "/tmp/gh_${GH_VERSION}_linux_${gh_arch}"; \ + gh --version; \ + curl -fsSL \ + "https://gitlab.com/gitlab-org/cli/-/releases/v${GLAB_VERSION}/downloads/glab_${GLAB_VERSION}_linux_${glab_arch}.tar.gz" \ + -o /tmp/glab.tar.gz; \ + tar -xzf /tmp/glab.tar.gz -C /tmp bin/glab; \ + mv /tmp/bin/glab 
/usr/local/bin/glab; \ + rm -rf /tmp/glab.tar.gz /tmp/bin; \ + glab --version; \ + curl -fsSL \ + "https://github.com/gitleaks/gitleaks/releases/download/v${GITLEAKS_VERSION}/gitleaks_${GITLEAKS_VERSION}_linux_${gitleaks_arch}.tar.gz" \ + -o /tmp/gitleaks.tar.gz; \ + tar -xzf /tmp/gitleaks.tar.gz -C /usr/local/bin gitleaks; \ + chmod +x /usr/local/bin/gitleaks; \ + rm /tmp/gitleaks.tar.gz; \ + gitleaks version + +COPY pyproject.toml uv.lock README.md ./ +COPY src ./src + +RUN pip install --no-cache-dir uv \ + && uv pip install --system . + +ENTRYPOINT ["security-scanner"] +CMD ["--help"] diff --git a/README.md b/README.md index f338682..3633dbe 100644 --- a/README.md +++ b/README.md @@ -73,10 +73,36 @@ uv run security-scanner gate --findings private/findings.jsonl --max 0 DynamoDB-compatible backend는 로컬에서 조회 패턴을 검증하기 위한 저장소입니다. 관리형 저장소 연동은 현재 지원 범위가 아닙니다. -로컬 DB는 저장소에 포함된 `docker-compose.yml`의 Dynalite 컨테이너로 띄웁니다. `http://localhost:4567`에서 응답합니다(로컬 검증 전용, 컨테이너를 내리면 데이터 소멸). +로컬 DB는 저장소에 포함된 `docker-compose.yml`의 DynamoDB Local 컨테이너로 +띄웁니다. Host에서는 `http://localhost:4567`에서 응답하고, 데이터는 named +Compose volume에 유지됩니다. ```bash -docker compose up -d dynalite +docker compose up -d dynamodb-local +``` + +Host의 `4567` 포트가 이미 사용 중이면 `SECURITY_SCANNER_DYNAMO_HOST_PORT`로 +바꿔 띄울 수 있습니다. Worker 컨테이너는 compose 내부 endpoint를 사용하므로 +그대로 동작합니다. + +```bash +SECURITY_SCANNER_DYNAMO_HOST_PORT=14567 docker compose up -d dynamodb-local +``` + +새 PC에서 public HTTPS repo 하나를 바로 검증하려면 Docker 경로를 사용할 수 있습니다. + +```bash +SECURITY_SCANNER_QUICKSTART_TARGET=https://github.com// \ + docker compose up --build --abort-on-container-exit --exit-code-from worker worker +``` + +커스텀 GitLab 도메인은 URL만으로 provider를 판별할 수 없으므로 provider hint를 +함께 지정합니다. 
+ +```bash +SECURITY_SCANNER_QUICKSTART_TARGET=https://source.example.test// \ +SECURITY_SCANNER_SCM_PROVIDER=gitlab \ + docker compose up --build --abort-on-container-exit --exit-code-from worker worker ``` ```bash @@ -101,7 +127,7 @@ uv run security-scanner gate \ 스캔을 실행하면 `Scan run ID`가 출력됩니다. 특정 실행 결과만 보고 싶으면 그 값을 `--scan-run-id`로 넘깁니다. 저장소 전체를 대상으로 판단할 때만 생략합니다. -카탈로그(`add-target`)에 등록한 여러 저장소를 한 번에 스캔하는 `scan-all` 흐름은 [시작하기 가이드의 "주기 스캔 로컬 테스트"](docs/views/getting-started.md#주기-스캔-로컬-테스트-dynalite--scan-all) 절을 참고합니다. +카탈로그(`add-target`)에 등록한 여러 저장소를 한 번에 스캔하는 `scan-all` 흐름은 [시작하기 가이드의 "주기 스캔 로컬 테스트"](docs/views/getting-started.md#주기-스캔-로컬-테스트-dynamodb-local--scan-all) 절을 참고합니다. Schema와 조회 기준은 [소스 스캔 결과 NoSQL Schema](docs/views/source-scan-results-nosql-schema.md)에 정리되어 있습니다. diff --git a/deploy/systemd/README.md b/deploy/systemd/README.md index aa15e42..caf0fe9 100644 --- a/deploy/systemd/README.md +++ b/deploy/systemd/README.md @@ -38,7 +38,7 @@ On the target Ubuntu host: - `gh` and `glab` CLIs installed and reachable on `PATH`. Required for GitHub/GitLab clone/fetch (spec §6). - A reachable DynamoDB-compatible backend on `http://localhost:4567`. For a - single host, run the Dynalite container shipped in the repo's + single host, run the DynamoDB Local container shipped in the repo's `docker-compose.yml` (see "Start the local DB" below). This requires Docker with the Compose v2 plugin. 
- A non-root service user (`scanner` by default) owning: @@ -54,11 +54,11 @@ sudo install -d -o scanner -g scanner /var/log/security-scanner sudo install -d -o scanner -g scanner /var/cache/security-scanner ``` -**Start the local DB.** From the project tree, bring up Dynalite and create the -table (and its query index) once: +**Start the local DB.** From the project tree, bring up DynamoDB Local and +create the table (and its query index) once: ```bash -docker compose up -d dynalite +docker compose up -d dynamodb-local uv run security-scanner init-storage \ --storage-backend dynamodb \ @@ -66,11 +66,37 @@ uv run security-scanner init-storage \ --dynamodb-table security_scanner_local_dev ``` -Dynalite here is for single-host, local-only use; its data is in-memory and is -lost if the container is removed. Register scan targets with `add-target` -before the first scheduled run — see the +If host port `4567` is already in use on a test box, set +`SECURITY_SCANNER_DYNAMO_HOST_PORT` to a free port (for example `14567`) for the compose command. The +worker service still talks to DynamoDB Local through the compose network. + +DynamoDB Local here is for single-host, local-only use; its data is persisted +in the named Compose volume. Register scan targets with `add-target` before +the first scheduled run — see the [getting-started guide](../../docs/views/getting-started.md). Managed DynamoDB -and DynamoDB Local are out of scope for now. +is out of scope for now. + +**Incremental worker local proof.** The repository's Docker Compose file also +contains a `worker` service. It is for local verification only, not a production deployment target. 
+ +```bash +SECURITY_SCANNER_QUICKSTART_TARGET=https://github.com// \ + docker compose up --abort-on-container-exit --exit-code-from worker worker +``` + +For custom GitLab domains, add the provider hint: + +```bash +SECURITY_SCANNER_QUICKSTART_TARGET=https://source.example.test// \ +SECURITY_SCANNER_SCM_PROVIDER=gitlab \ + docker compose up --abort-on-container-exit --exit-code-from worker worker +``` + +That command exercises `security-scanner quickstart` against the local DynamoDB +Local service, creates a current-tip queue job, and runs the worker. Keep +credentials out of compose files and inject them through the host environment +or the normal service manager only when you intentionally test private +repository access. --- @@ -83,8 +109,8 @@ For private repos, pass SCM tokens to the service. Two options. ```bash sudo install -d -m 700 /etc/security-scanner sudo tee /etc/security-scanner/scm.env >/dev/null <<'EOF' -GH_TOKEN=ghp_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx -GITLAB_TOKEN=glpat-xxxxxxxxxxxxxxxxxxxx +GH_TOKEN= +GITLAB_TOKEN= EOF sudo chmod 600 /etc/security-scanner/scm.env sudo chown scanner:scanner /etc/security-scanner/scm.env @@ -132,14 +158,14 @@ the user manager reads without a desktop keyring). Prereqs (as the scanning user): the project checked out at `~/security-scanner` with `uv sync` run; `uv`/`git`/`gitleaks`/`gh`/`docker` on `PATH`; the local -Dynalite DB reachable (the unit's `ExecStartPre` brings it up). +DynamoDB Local DB reachable (the unit's `ExecStartPre` brings it up). ```bash # 1. Authenticate gh (token never leaves the host). gh auth login # GitHub.com → HTTPS → paste a read-scoped token -# 2. Bootstrap the catalog table once (Dynalite must be up). -docker compose up -d dynalite +# 2. Bootstrap the catalog table once (DynamoDB Local must be up). 
+docker compose up -d dynamodb-local export SECURITY_SCANNER_STORAGE_BACKEND=dynamodb uv run security-scanner init-storage diff --git a/deploy/systemd/user/security-scanner-scan-all.service b/deploy/systemd/user/security-scanner-scan-all.service index a01500c..99efa1a 100644 --- a/deploy/systemd/user/security-scanner-scan-all.service +++ b/deploy/systemd/user/security-scanner-scan-all.service @@ -9,7 +9,7 @@ Type=oneshot # home directory, so this unit is portable across operators without edits. WorkingDirectory=%h/security-scanner -# DynamoDB-compatible backend (Dynalite via the repo's docker-compose.yml). +# DynamoDB-compatible backend (DynamoDB Local via the repo's docker-compose.yml). Environment=SECURITY_SCANNER_STORAGE_BACKEND=dynamodb Environment=SECURITY_SCANNER_DYNAMO_ENDPOINT=http://localhost:4567 Environment=SECURITY_SCANNER_DYNAMO_TABLE=security_scanner_local_dev @@ -20,8 +20,8 @@ Environment=SECURITY_SCANNER_DYNAMO_TABLE=security_scanner_local_dev # EnvironmentFile=-%h/.config/security-scanner/scm.env # Ensure the local DB container is up before scanning (no-op if already running). -# Drop this line if you manage Dynalite separately. -ExecStartPre=-/usr/bin/docker compose up -d dynalite +# Drop this line if you manage DynamoDB Local separately. +ExecStartPre=-/usr/bin/docker compose up -d dynamodb-local # Adjust the uv path if it lives elsewhere (`command -v uv`). ExecStart=%h/.local/bin/uv run security-scanner scan-all \ diff --git a/docker-compose.yml b/docker-compose.yml index bbcce26..58b9e95 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,33 +1,58 @@ -# Local-only DynamoDB-compatible backend for security-scanner. +# Local-only DynamoDB Local backend and turnkey worker for security-scanner. # -# Starts Dynalite on http://localhost:4567 — the default endpoint used by the -# `dynamodb` storage backend (see README "로컬 NoSQL 저장소" and -# docs/views/getting-started.md "주기 스캔 로컬 테스트"). 
+# Host CLI endpoint: http://localhost:4567 +# Container endpoint: http://dynamodb-local:8000 # -# Scope: local verification of the `scan-all` catalog/query path ONLY. -# Not production. Data is in-memory and is lost when the container stops. -# Managed DynamoDB / DynamoDB Local are intentionally out of scope for now. +# Scope: local verification of scan-all and incremental queue paths. +# Not production. Data is persisted in the named local Compose volume. services: - dynalite: - image: node:20-alpine - container_name: security-scanner-dynalite - # In-memory store (no --path) → pure-JS memdown, no native build needed. - command: ["npx", "--yes", "dynalite@3", "--port", "4567"] + dynamodb-local: + image: amazon/dynamodb-local:2.6.1 + user: root + working_dir: /home/dynamodblocal + command: + - "-jar" + - "DynamoDBLocal.jar" + - "-sharedDb" + - "-dbPath" + - "./data" ports: - - "4567:4567" + - "${SECURITY_SCANNER_DYNAMO_HOST_PORT:-4567}:8000" volumes: - # Cache the npx download so restarts don't re-fetch dynalite. - - dynalite-npm:/root/.npm - healthcheck: - test: - - "CMD" - - "node" - - "-e" - - "require('net').connect(4567,'127.0.0.1').on('connect',()=>process.exit(0)).on('error',()=>process.exit(1))" - interval: 5s - timeout: 3s - retries: 5 + - dynamodb-local-data:/home/dynamodblocal/data restart: unless-stopped + worker: + build: . 
+ depends_on: + - dynamodb-local + environment: + SECURITY_SCANNER_STORAGE_BACKEND: dynamodb + SECURITY_SCANNER_DYNAMO_ENDPOINT: http://dynamodb-local:8000 + SECURITY_SCANNER_DYNAMO_TABLE: SecurityScannerLocal + SECURITY_SCANNER_QUICKSTART_TARGET: ${SECURITY_SCANNER_QUICKSTART_TARGET:-} + SECURITY_SCANNER_QUICKSTART_NAME: ${SECURITY_SCANNER_QUICKSTART_NAME:-quickstart-target} + SECURITY_SCANNER_SCM_PROVIDER: ${SECURITY_SCANNER_SCM_PROVIDER:-auto} + volumes: + - repo-cache:/root/.cache/security-scanner/repos + entrypoint: + - /bin/sh + - -lc + command: + - | + test -n "$$SECURITY_SCANNER_QUICKSTART_TARGET" || { + echo "Set SECURITY_SCANNER_QUICKSTART_TARGET to a public HTTPS repo URL." >&2 + exit 2 + } + security-scanner quickstart "$$SECURITY_SCANNER_QUICKSTART_TARGET" \ + --name "$$SECURITY_SCANNER_QUICKSTART_NAME" \ + --storage-backend dynamodb \ + --dynamodb-endpoint-url "$$SECURITY_SCANNER_DYNAMO_ENDPOINT" \ + --dynamodb-table "$$SECURITY_SCANNER_DYNAMO_TABLE" \ + --scm-provider "$$SECURITY_SCANNER_SCM_PROVIDER" \ + --storage-wait-seconds 60 \ + --max-jobs 10 + volumes: - dynalite-npm: + dynamodb-local-data: + repo-cache: diff --git a/docs/views/getting-started.md b/docs/views/getting-started.md index e00c950..fe08547 100644 --- a/docs/views/getting-started.md +++ b/docs/views/getting-started.md @@ -206,13 +206,13 @@ Ollama 응답 실패, timeout, 낮은 confidence는 모두 review-needed로 남 | finding이 0인데 분명 secret이 있을 것 같음 | 워킹트리에는 없고 과거 커밋에만 존재 | `include_history: true`로 다시 실행 | | 결과 파일이 매번 덮어써짐 | `scan`은 JSONL 백엔드에서 매 실행 시 store를 초기화 | 이력을 남기려면 DynamoDB-compatible 백엔드 사용, raw 값만 누적하려면 `--raw-evidence` | -## 주기 스캔 로컬 테스트 (Dynalite + scan-all) +## 주기 스캔 로컬 테스트 (DynamoDB Local + scan-all) `scan-all`은 카탈로그(DB)에 등록한 여러 저장소를 한 번에 스캔하는 명령으로, DynamoDB-compatible 백엔드가 필요합니다. 로컬에서는 저장소에 포함된 -`docker-compose.yml`의 Dynalite 컨테이너로 검증할 수 있습니다. +`docker-compose.yml`의 DynamoDB Local 컨테이너로 검증할 수 있습니다. -> 로컬 검증 전용입니다. Dynalite 데이터는 컨테이너를 내리면 사라집니다. +> 로컬 검증 전용입니다. 
DynamoDB Local 데이터는 named Compose volume에 유지됩니다. 사전 준비: `docker`(compose v2), `git`, `gitleaks`, `uv`, 그리고 대상에 맞는 `gh`(GitHub) / `glab`(GitLab) CLI. 사설 저장소는 `GH_TOKEN` / `GITLAB_TOKEN` 또는 `gh auth login`. @@ -226,27 +226,23 @@ curl -LsSf https://astral.sh/uv/install.sh | sh # https://github.com/gitleaks/gitleaks/releases # gh (GitHub CLI) # https://github.com/cli/cli/releases 또는 패키지 매니저(apt / brew) +# glab (GitLab CLI) +# https://gitlab.com/gitlab-org/cli/-/releases 또는 패키지 매니저 # docker + Compose v2 plugin은 호스트에 설치하고, 사용자를 docker 그룹에 추가(재로그인) ``` 그다음 이 저장소를 받아 의존성을 설치합니다. 이후 모든 명령은 저장소 루트에서 실행합니다. ```bash -gh repo clone source-security-dev/security-scanner +git clone https://github.com//.git security-scanner cd security-scanner -# 이 기능이 아직 main에 머지되지 않았다면(PR 리뷰 중) 기능 브랜치를 체크아웃하세요. -# 예) PR #8 브랜치: -# git fetch origin worktree-periodic-gitleaks-scan -# git checkout worktree-periodic-gitleaks-scan uv sync ``` -> main에 머지된 뒤에는 위 체크아웃 단계가 필요 없습니다. - -1. 로컬 DB 기동. `http://localhost:4567`에서 Dynalite가 응답합니다. +1. 로컬 DB 기동. `http://localhost:4567`에서 DynamoDB Local이 응답합니다. ```bash - docker compose up -d dynalite + docker compose up -d dynamodb-local ``` 2. 테이블 생성(최초 1회). 목록 조회용 보조 인덱스가 함께 만들어집니다. @@ -288,8 +284,97 @@ uv sync docker compose down ``` +### Incremental queue MVP 로컬 확인 + +Incremental queue MVP는 `discover-updates`가 commit 단위 `SCAN_JOB`을 만들고 +`scan-worker --once`가 bounded batch로 처리하는 흐름입니다. 아래 절차는 DynamoDB +Local 기반 로컬 검증 전용입니다. Production scheduler, managed DynamoDB, 실 저장소 +운영 절차는 이 문서 범위가 아닙니다. + +새 PC에서 가장 짧은 경로는 `quickstart`입니다. Public HTTPS repo는 `gh`/`glab` +인증이 없어도 `git clone` fallback을 사용합니다. Private repo는 먼저 `doctor +--private`로 인증 상태를 확인하세요. 
+ +```bash +docker compose up -d dynamodb-local + +uv run security-scanner doctor \ + --target-url https://github.com// + +uv run security-scanner quickstart https://github.com// \ + --storage-backend dynamodb \ + --dynamodb-endpoint-url http://localhost:4567 \ + --dynamodb-table SecurityScannerLocal +``` + +Host의 `4567` 포트가 이미 사용 중이면 compose 실행 때만 host port를 바꿉니다. +Worker는 compose 내부의 `http://dynamodb-local:8000`을 사용하므로 별도 변경이 +필요 없습니다. + +```bash +SECURITY_SCANNER_DYNAMO_HOST_PORT=14567 docker compose up -d dynamodb-local +``` + +커스텀 GitLab 도메인은 URL host만으로 provider를 확정할 수 없습니다. 이때는 +`--scm-provider gitlab`을 명시합니다. + +```bash +uv run security-scanner quickstart https://source.example.test// \ + --scm-provider gitlab \ + --storage-backend dynamodb \ + --dynamodb-endpoint-url http://localhost:4567 \ + --dynamodb-table SecurityScannerLocal +``` + +Private repo는 token 값을 출력하지 않는 preflight를 먼저 돌립니다. + +```bash +uv run security-scanner doctor \ + --target-url https://github.com// \ + --private +``` + +수동으로 단계를 나누면 다음 순서입니다. + +```bash +uv run security-scanner init-storage \ + --storage-backend dynamodb \ + --dynamodb-endpoint-url http://localhost:4567 \ + --dynamodb-table SecurityScannerLocal + +uv run security-scanner add-target https://github.com// \ + --storage-backend dynamodb \ + --dynamodb-endpoint-url http://localhost:4567 \ + --dynamodb-table SecurityScannerLocal + +uv run security-scanner discover-updates --initialize \ + --storage-backend dynamodb \ + --dynamodb-endpoint-url http://localhost:4567 \ + --dynamodb-table SecurityScannerLocal +``` + +`quickstart`는 fresh DB에서도 현재 ref tip을 하나의 `SCAN_JOB`으로 enqueue한 뒤 +worker를 실행합니다. Compose로 같은 흐름을 한 번에 실행할 수도 있습니다. + +```bash +SECURITY_SCANNER_QUICKSTART_TARGET=https://github.com// \ + docker compose up --build --abort-on-container-exit --exit-code-from worker worker +``` + +커스텀 GitLab 도메인은 compose에서도 같은 provider hint를 env로 넘깁니다. 
+ +```bash +SECURITY_SCANNER_QUICKSTART_TARGET=https://source.example.test// \ +SECURITY_SCANNER_SCM_PROVIDER=gitlab \ + docker compose up --build --abort-on-container-exit --exit-code-from worker worker +``` + +이 Compose worker도 로컬 검증 전용입니다. 이미지에는 `git`, `gh`, `glab`, +`gitleaks`가 포함되며, compose 파일에는 credentials가 포함되지 않습니다. + 플래그 대신 환경변수로도 지정할 수 있습니다: -`SECURITY_SCANNER_DYNAMO_ENDPOINT`, `SECURITY_SCANNER_DYNAMO_TABLE`. +`SECURITY_SCANNER_DYNAMO_ENDPOINT`, `SECURITY_SCANNER_DYNAMO_TABLE`, +`SECURITY_SCANNER_SCM_PROVIDER`. 우분투 호스트에 systemd로 주기 실행을 거는 방법은 [systemd 배포 가이드](../../deploy/systemd/README.md)를 참고합니다. diff --git a/docs/workbench/README.md b/docs/workbench/README.md index 2e078e4..b0dd4ea 100644 --- a/docs/workbench/README.md +++ b/docs/workbench/README.md @@ -9,6 +9,7 @@ product documentation. | `specs/` | Approved or draft implementation specs. | | `plans/` | Execution plans, task breakdowns, and verification checklists. | | `adrs/` | Architecture decision records. | +| `agentic-workflows/` | Goal-based agent implementation loops and review gates. | | `context/` | Current-state notes, review notes, and legacy source material. | Additional maintainer guidance: diff --git a/docs/workbench/adrs/ADR-20260612-incremental-scan-queue-worker.md b/docs/workbench/adrs/ADR-20260612-incremental-scan-queue-worker.md new file mode 100644 index 0000000..2ddf717 --- /dev/null +++ b/docs/workbench/adrs/ADR-20260612-incremental-scan-queue-worker.md @@ -0,0 +1,220 @@ +# ADR-20260612: Incremental Scan Queue Worker for Branch-Aware Secret Scanning + +**Status:** Proposed +**Date:** 2026-06-12 +**Deciders:** security-scanner maintainers +**Related Issue:** #12 + +## Context + +`scan-all` now provides a useful periodic batch path: it reads enabled `SCAN_TARGET` +rows, fetches each repository, builds an in-memory manifest, and reuses +`run_local_scan()`. That path is intentionally simple and should remain the full +baseline and fallback mode. 
+ +The next scaling problem is different. A small number of target repositories can be +handled by repeated `scan-all`, but a larger catalog with many branches needs to avoid +rescanning the same commits every cycle. A repository may be large, have many refs, +and receive new commits on only a small subset of branches. The scanner should answer +"what changed since the last fetch?" and "has this commit already been scanned with +this scanner/rule/config version?" before spending Gitleaks runtime. + +Current constraints: + +- Phase 1 remains local secret detection on filesystem checkouts. +- Gitleaks remains the primary scanner. +- Existing `scan`, `scan-all`, `SCAN_TARGET`, `FINDING`, `FINDING_OBSERVATION`, + `FINDING_STATE`, and `SCAN_RUN` semantics must continue working. +- The local NoSQL backend is a single-table DynamoDB-compatible store with `GSI1` + and `GSI2` already reserved. +- The public repository must not contain real repo names, real findings, private + paths, real tokens, internal endpoints, or operational secrets. +- Webhook/on-demand fetch and managed queue services are intentionally out of the + MVP. + +Gitleaks supports `git` mode and `--log-opts`, which can constrain the underlying +`git log -p` traversal to a commit range. The project should pin and smoke-test the +Gitleaks version used by the MVP before relying on exact range semantics in runtime +proofs. + +## Decision + +Add an incremental scan path alongside `scan-all`. + +The MVP introduces four durable concepts in the existing NoSQL store: + +1. **`REF_STATE`** records the last observed SHA for each repository ref. +2. **`SCAN_JOB`** records one commit scan work item discovered from ref updates. + `commitRange` may be stored as discovery provenance, but MVP worker execution + and ledger writes are commit-level. +3. **`SCAN_LEDGER`** records completed scans for a commit under a scanner/rule/config + version tuple. +4. 
**`REPO_LEASE`** limits concurrent worker execution for the same repository + workspace. + +The MVP adds these CLI entry points: + +```text +security-scanner discover-updates --initialize +security-scanner discover-updates --enqueue +security-scanner scan-worker --once --max-jobs N +security-scanner queue-status +``` + +`discover-updates` owns fetch and ref-delta discovery. `scan-worker` owns queue +polling, leases, Gitleaks execution, finding persistence, and ledger writes. + +New incremental rows use a canonical repository identity: + +```text +repoId = "repo_" + sha256(normalized ScanTarget.url)[:24] +``` + +`repoUrl` stores the normalized URL and display names remain presentation +metadata. This avoids coupling queue identity to mutable target names while +leaving the existing `scan-all` behavior unchanged. + +`scan-all` remains: + +- the first-run/full-baseline path, +- a manual fallback when queue processing is unavailable, +- the periodic sanity sweep path after rule/config changes. + +The MVP is forward-only: it starts tracking new ref updates after +`discover-updates --initialize`. Historical branch fan-out aggregation remains a +later phase unless covered by an explicit full baseline run. + +## Options Considered + +### Option 1: Keep `scan-all` as the only periodic path + +| Dimension | Assessment | +| --- | --- | +| Complexity | Low | +| Operational fit | Good for small catalogs | +| Scale fit | Weak when runtime exceeds cadence | +| Failure isolation | Batch-level | + +**Pros** +- No new entity types or worker lifecycle. +- Reuses the current tested path. +- Easy to explain and operate. + +**Cons** +- Fetch and scan stay on one critical path. +- No backlog visibility. +- No commit-level skip. +- One long run can make the next scheduled run useless. 
+ +### Option 2: Add worker parallelism inside `scan-all` + +| Dimension | Assessment | +| --- | --- | +| Complexity | Medium | +| Operational fit | Moderate | +| Scale fit | Partial | +| Failure isolation | Per target, but still one batch | + +**Pros** +- Keeps one command and one schedule. +- Can improve throughput for small-to-medium catalogs. + +**Cons** +- Still fetches and scans as one batch. +- Does not create durable backlog or retry semantics. +- Does not solve commit-level idempotency. +- Makes `scan-all` deeper and harder to reason about. + +### Option 3: Incremental queue worker with commit ledger (Selected) + +| Dimension | Assessment | +| --- | --- | +| Complexity | Medium-high | +| Operational fit | Strong for growing catalogs | +| Scale fit | Stronger incremental behavior | +| Failure isolation | Per job with leases and retry | + +**Pros** +- Separates discovery from execution. +- Makes backlog, retry, and worker progress observable. +- Skips already-scanned commits for the same scanner/rule/config tuple. +- Keeps `scan-all` simple and available as fallback. + +**Cons** +- Adds several entity types and lease semantics. +- Requires careful idempotency tests. +- Requires a bounded Gitleaks commit-scan proof. +- Introduces eventual consistency between ref state, jobs, ledger, and findings. + +### Option 4: External managed queue + +| Dimension | Assessment | +| --- | --- | +| Complexity | High for current phase | +| Operational fit | Premature | +| Scale fit | Strong later | +| Cost | Higher | + +**Pros** +- Stronger queue primitives, visibility, and scaling if the project moves to a + managed runtime later. + +**Cons** +- Violates current local-first MVP constraints. +- Adds cloud credentials and operational surface too early. +- Harder to keep public docs synthetic and portable. + +## Trade-off Analysis + +Option 3 is the smallest architecture that changes the scaling behavior instead of +only making the batch faster. 
The important shift is not "more threads"; it is a +durable feedback loop: + +```text +fetch refs -> derive commit jobs -> lease job -> scan commit -> write ledger -> skip next time +``` + +This accepts more state in exchange for lower repeated work and better agentic +implementation loops. Because `scan-all` remains unchanged, the risk is bounded: the +incremental path can mature without breaking the current batch path. + +The MVP deliberately avoids branch-level identity redesign as a first step. Branch +and commit metadata should be recorded on observations and job/ledger rows first. +Finding identity and cross-branch aggregation can be deepened after the queue path +is proven with synthetic repositories. + +## Consequences + +- New storage capabilities are needed beyond the current `FindingStore` interface. +- The NoSQL adapter becomes the first implementation of the queue/ledger store. +- JSONL storage remains unsupported for incremental queue execution in the MVP. +- `GitleaksRunner` needs a `ScanOptions.git_log_opts` or equivalent bounded git + log option. +- `Finding.repo.commit` can be populated for worker-produced findings. Branch/ref + metadata may initially live in scan job and observation metadata. +- `scan-all` should not grow worker semantics. It remains a simple baseline path. +- Docker Compose can later run several `scan-worker` containers, but the core + MVP first proves the queue/ledger loop with `--once`. Compose is a follow-up + runtime proof unless explicitly selected for the same milestone. + +## Implementation Notes + +- Use synthetic repositories in tests. Do not commit real findings or private repo + paths. +- Use deterministic job IDs and conditional writes for enqueue, job lease + acquisition, and repo lease acquisition. +- Do not enqueue patch blobs. Queue rows store metadata only. +- Do not mark a job completed until findings and the scan ledger have been + written in a retry-safe order. 
+- Treat `rule_pack_version` and `scanner_config_hash` as ledger invalidators. +- Use bounded leases. Expired leases are reclaimable. +- Prefer one worker process handling one repository workspace at a time. + +## Related Documents + +- Spec: `docs/workbench/specs/2026-06-12-incremental-scan-queue-worker-mvp.md` +- Agentic workflow: `docs/workbench/agentic-workflows/2026-06-12-incremental-scan-queue-worker-goal-plan.md` +- The agentic workflow is the implementation plan artifact for this issue; no + `docs/workbench/plans/` artifact is required. +- Prior ADR: `docs/workbench/adrs/ADR-20260531-periodic-multi-repo-scan-catalog.md` +- Issue: #12 diff --git a/docs/workbench/agentic-workflows/2026-06-12-incremental-scan-queue-worker-goal-plan.md b/docs/workbench/agentic-workflows/2026-06-12-incremental-scan-queue-worker-goal-plan.md new file mode 100644 index 0000000..81237cd --- /dev/null +++ b/docs/workbench/agentic-workflows/2026-06-12-incremental-scan-queue-worker-goal-plan.md @@ -0,0 +1,286 @@ +# Agentic Workflow: Incremental Scan Queue Worker MVP + +**Status:** Draft +**Date:** 2026-06-12 +**ADR:** `docs/workbench/adrs/ADR-20260612-incremental-scan-queue-worker.md` +**Spec:** `docs/workbench/specs/2026-06-12-incremental-scan-queue-worker-mvp.md` +**Issue:** #12 + +This is a Goal-based implementation plan. It intentionally does not live under +`docs/workbench/plans/`. For issue #12, this document is the implementation plan +artifact. + +## Operating Rules + +- Work in isolated `codex/` branches and dedicated worktrees. +- Keep `main`/detached checkout read-only. +- Preserve existing `scan`, `scan-all`, catalog, report, gate, evaluate, and verifier + behavior unless the goal explicitly changes them. +- Use `uv` for Python commands. +- Keep committed tests public-safe and synthetic. +- Do not commit private repo names, real findings, real paths, hostnames, endpoints, + credentials, or scan outputs. 
+- Every implementation loop ends with tests and a short reviewer-facing summary. +- If a loop changes architecture, update the ADR/spec in the same branch. + +## Goal Graph + +```text +G0: Incremental scan MVP accepted +| ++-- G1: Queue/ledger model exists +| +-- G1.1: dataclasses and item mappers +| +-- G1.2: NoSQL adapter operations +| +-- G1.3: idempotency and lease tests +| ++-- G2: Discovery command exists +| +-- G2.1: fetch/list refs adapter +| +-- G2.2: initialize mode writes REF_STATE only +| +-- G2.3: enqueue mode writes SCAN_JOB and advances REF_STATE +| ++-- G3: Worker command exists +| +-- G3.1: lease pending job +| +-- G3.2: acquire repo lease +| +-- G3.3: run bounded Gitleaks scan +| +-- G3.4: persist findings, ledger, completion +| ++-- G4: Operator visibility exists +| +-- G4.1: queue-status +| +-- G4.2: dead-letter and expired lease counts +| +Follow-up outside core G0: + +G5: Runtime packaging proof exists ++-- G5.1: Dockerfile or documented worker command ++-- G5.2: compose-compatible worker service ++-- G5.3: local synthetic smoke +``` + +## Agentic Loop Template + +Each loop uses the same pattern: + +```text +1. Goal + State the exact goal and acceptance checks. + +2. Read + Inspect the current code, tests, ADR/spec, and issue. + +3. Slice + Choose the smallest implementation slice that proves new behavior. + +4. Patch + Implement production code and tests together. + +5. Verify + Run targeted tests first, then broader tests if shared behavior changed. + +6. Review + Run an independent agent review for architecture, correctness, and public-safety. + +7. Reflect + Update ADR/spec only when the implementation changes the decision or contract. + +8. Gate + Merge only when acceptance checks are proven by current evidence. +``` + +## Loop 1: Queue/Ledger Storage Model + +**Goal:** add durable `REF_STATE`, `SCAN_JOB`, `SCAN_LEDGER`, and `REPO_LEASE` item +models plus NoSQL adapter operations. 
+ +**Primary files:** + +- `src/security_scanner/storage/adapters/nosql_db/items.py` +- `src/security_scanner/storage/adapters/nosql_db/access.py` +- `src/security_scanner/storage/adapters/nosql_db/store.py` +- `tests/test_incremental_scan_storage.py` + +**Acceptance checks:** + +- item mapper round-trips for all four entity types. +- deterministic `repoId` and `jobId` behavior is covered. +- enqueue idempotency prevents duplicate active jobs for the same commit tuple. +- job lease acquisition is conditional. +- expired job lease can be reclaimed. +- repo lease acquisition is conditional. +- `complete_processed_job` writes findings, `SCAN_LEDGER`, and job completion in + a retry-safe order. + +**Suggested agent split:** + +- Worker A owns dataclasses and item mappers. +- Worker B owns store methods and fake table support. +- Reviewer C checks single-table key shape, idempotency, and test coverage. + +## Loop 2: Git Discovery and `discover-updates` + +**Goal:** add a discovery command that mutates only repository cache and queue +state; it does not run scanners or persist findings. + +**Primary files:** + +- `src/security_scanner/targets/fetcher.py` +- `src/security_scanner/runtime/incremental_discovery.py` +- `src/security_scanner/cli/app.py` +- `tests/test_incremental_discovery.py` +- `tests/test_cli_discover_updates.py` + +**Acceptance checks:** + +- `--initialize` writes ref state and enqueues zero jobs. +- `--enqueue` detects `old_sha..new_sha` and enqueues one job per new unscanned + commit. +- missing ref state in enqueue mode is treated as observation, not historical backfill. +- ledger rows skip enqueue. +- non-fast-forward refs are reported and do not advance `REF_STATE`. +- fetch failure is isolated per target. + +**Suggested agent split:** + +- Worker A owns git command adapter and unit tests. +- Worker B owns CLI command and store integration tests. +- Reviewer C checks edge cases: deleted ref, non-fast-forward, missing git binary. 
+ +## Loop 3: Gitleaks Bounded Commit Scan Support + +**Goal:** allow the scanner runner to pass bounded `--log-opts` in git mode. + +**Primary files:** + +- `src/security_scanner/core/scan/options.py` +- `src/security_scanner/scanners/gitleaks/runner.py` +- `tests/test_gitleaks_runner.py` +- `tests/test_gitleaks_scanner.py` + +**Acceptance checks:** + +- `ScanOptions.git_log_opts` exists. +- `GitleaksRunner` emits `--log-opts` only for git mode. +- dir mode rejects or ignores `git_log_opts` explicitly. +- existing runner tests still pass. + +**Suggested agent split:** + +- Worker A owns runner/options changes. +- Reviewer B checks Gitleaks command compatibility against current official docs and + local tests. + +## Loop 4: `scan-worker --once` + +**Goal:** process queued jobs, persist findings, write ledger, and mark jobs completed. + +**Primary files:** + +- `src/security_scanner/runtime/scan_worker.py` +- `src/security_scanner/cli/app.py` +- `src/security_scanner/runtime/local_scan.py` +- `tests/test_scan_worker.py` +- `tests/test_cli_scan_worker.py` + +**Acceptance checks:** + +- empty queue exits `0`. +- one pending job is leased and completed. +- repo lease prevents duplicate processing. +- scanner failure returns the job to `pending`, increments attempts, and records + `lastError` plus `nextAttemptAt`. +- repo lease acquisition failure returns the job to `pending` without incrementing + attempts. +- attempts exhausted moves job to `dead_letter`. +- completed job writes ledger and prevents re-enqueue. +- crash-recovery tests cover findings-before-ledger and ledger-before-completed + boundaries. + +**Suggested agent split:** + +- Worker A owns runtime worker function. +- Worker B owns CLI wiring and output. +- Reviewer C checks failure modes and idempotency. + +## Loop 5: `queue-status` and Operator Proof + +**Goal:** expose backlog state and create a repeatable synthetic proof. 
+ +**Primary files:** + +- `src/security_scanner/runtime/queue_status.py` +- `src/security_scanner/cli/app.py` +- `tests/test_queue_status.py` +- `tests/test_incremental_scan_smoke.py` +- `docs/views/getting-started.md` + +**Acceptance checks:** + +- status groups jobs by status. +- expired leases are counted. +- synthetic smoke proves initialize -> enqueue -> worker -> ledger skip. +- public docs mention queue MVP without real repo details. + +**Suggested agent split:** + +- Worker A owns status runtime and CLI. +- Worker B owns synthetic smoke and docs. +- Reviewer C checks public safety and operator clarity. + +## Follow-up Loop 6: Container Runtime MVP + +**Goal:** make the worker runnable through Docker Compose without calling it production. + +**Primary files:** + +- `Dockerfile` +- `docker-compose.yml` +- `deploy/systemd/README.md` +- `docs/views/getting-started.md` + +**Acceptance checks:** + +- existing Dynalite service remains. +- worker command can run `scan-worker --once`. +- docs clearly say Compose is local verification only. +- no credentials are committed. + +**Suggested agent split:** + +- Worker A owns Dockerfile/Compose changes. +- Reviewer B checks secret handling, volume paths, and public-safe docs. + +## Review Gates + +Run these review roles before merging each loop: + +1. **Architecture reviewer** + - Does the implementation preserve `scan-all` as fallback? + - Are new interfaces deep enough, or did queue logic leak into CLI? + - Are storage keys queryable for the acceptance checks? + +2. **Correctness reviewer** + - Are leases conditional? + - Can a crash cause permanent stuck work? + - Are retries idempotent? + - Can ledger writes be repeated safely? + +3. **Public-safety reviewer** + - Are all fixtures synthetic? + - Are logs/docs free of real repo names, endpoints, tokens, and findings? + - Are examples safe to publish? 
+ +## Completion Definition + +G0 is complete only when: + +- `discover-updates --initialize` is proven with a synthetic repo. +- `discover-updates --enqueue` creates at least one durable job for a new synthetic + commit. +- `scan-worker --once` processes that job and writes findings plus `SCAN_LEDGER`. +- repeating enqueue after completion creates zero duplicate jobs. +- two-worker lease behavior is covered by tests. +- `queue-status` reports meaningful counts. +- `scan-all` tests still pass. +- docs and examples remain public-safe. + +G5 container runtime proof is a follow-up goal unless explicitly selected for the +same milestone. diff --git a/docs/workbench/specs/2026-06-12-incremental-scan-queue-worker-mvp.md b/docs/workbench/specs/2026-06-12-incremental-scan-queue-worker-mvp.md new file mode 100644 index 0000000..6436d01 --- /dev/null +++ b/docs/workbench/specs/2026-06-12-incremental-scan-queue-worker-mvp.md @@ -0,0 +1,540 @@ +# Spec: Incremental Scan Queue Worker MVP + +**Status:** Draft +**Date:** 2026-06-12 +**ADR:** `docs/workbench/adrs/ADR-20260612-incremental-scan-queue-worker.md` +**Issue:** #12 + +## 1. Scope + +This spec defines the first incremental scan path for `security-scanner`. + +The MVP must: + +1. discover ref updates for enabled `SCAN_TARGET` rows, +2. enqueue one commit scan job per newly observed unscanned commit, +3. let workers lease and process jobs, +4. persist findings through the existing scan result path where possible, +5. write commit-level scan ledger rows, and +6. avoid enqueueing or processing the same commit twice for the same + scanner/rule/config tuple. + +The MVP must not remove or rewrite `scan-all`. + +## 2. Non-goals + +- Webhook or push-event ingestion. +- Managed queue service. +- Autoscaling policy. +- Stale branch pruning policy. +- Full cross-branch finding identity redesign. +- GitLab owner sync. +- JSONL backend support for queue/ledger execution. 
+- Real private repository fixtures or real findings in committed tests. + +## 3. Glossary + +| Term | Meaning | +| --- | --- | +| Target | Enabled `SCAN_TARGET` row. | +| Ref | Git ref selected for incremental scanning, usually `refs/remotes/origin/<branch>`. | +| Ref state | Last observed SHA for a target/ref pair. | +| Scan job | Durable work item for one commit. | +| Scan ledger | Durable record that a commit was scanned with a scanner/rule/config tuple. | +| Repo lease | Short-lived lock preventing two workers from mutating the same repo workspace. | +| Worker | CLI process/container that polls and processes scan jobs. | + +## 4. High-level Flow + +```text +operator/systemd + -> security-scanner discover-updates --initialize + -> list enabled SCAN_TARGET + -> fetch repo cache + -> list refs + -> write REF_STATE only + +operator/systemd + -> security-scanner discover-updates --enqueue + -> list enabled SCAN_TARGET + -> fetch repo cache + -> compare refs with REF_STATE + -> derive new commits + -> skip commits already present in SCAN_LEDGER + -> enqueue one SCAN_JOB per unscanned commit + -> advance REF_STATE after enqueue succeeds + +worker loop + -> security-scanner scan-worker --once --max-jobs N + -> lease pending SCAN_JOB + -> acquire REPO_LEASE + -> run bounded single-commit Gitleaks scan + -> persist findings + -> write SCAN_LEDGER + -> mark SCAN_JOB completed +``` + +## 5. CLI Contracts + +### 5.1 `discover-updates` + +```text +security-scanner discover-updates (--initialize | --enqueue) + [--max-targets N] + [--ref-pattern PATTERN ...] + [--storage-backend dynamodb] + [--dynamodb-endpoint-url URL] + [--dynamodb-table NAME] +``` + +Rules: + +- Exactly one of `--initialize` or `--enqueue` is required. +- Only enabled `SCAN_TARGET` rows are considered. +- `--initialize` fetches and writes `REF_STATE` but enqueues no jobs. +- `--enqueue` fetches, compares refs, enqueues jobs for new commits, and updates + `REF_STATE`.
+- Fetch failures are isolated per target and return exit `2` if any target failed. +- Catalog or storage fatal errors return exit `1`. +- Success returns exit `0`. + +Output summary: + +```text +targets: +fetch ok: +fetch failed: +refs observed: +jobs enqueued: +ledger skipped: +skipped non-fast-forward: +``` + +### 5.2 `scan-worker` + +```text +security-scanner scan-worker + [--once] + [--max-jobs N] + [--lease-seconds N] + [--worker-id ID] + [--storage-backend dynamodb] + [--dynamodb-endpoint-url URL] + [--dynamodb-table NAME] +``` + +Rules: + +- MVP supports `--once`; long-running mode can be a later extension. +- `--max-jobs` defaults to `1`. +- Worker leases one pending job at a time. +- Worker must acquire the repository lease before touching a checkout. +- If repository lease acquisition fails, the worker returns the job to `pending` + without incrementing `attempts` and writes no findings or ledger row. +- If no job is available, exit `0`. +- If any processed job fails permanently in this invocation, exit `2`. +- Fatal storage/runtime error returns exit `1`. + +Output summary: + +```text +leased: +completed: +retryable: +dead-lettered: +``` + +### 5.3 `queue-status` + +```text +security-scanner queue-status + [--storage-backend dynamodb] + [--dynamodb-endpoint-url URL] + [--dynamodb-table NAME] +``` + +Returns counts grouped by `SCAN_JOB.status` and expired lease count. + +## 6. Data Model + +All rows live in the existing single-table NoSQL store. + +### 6.0 Repository Identity + +Incremental queue rows use a canonical repository identity derived from the +enabled `ScanTarget.url`: + +```text +repoId = "repo_" + sha256(normalized ScanTarget.url)[:24] +repoUrl = +repoDisplayName = +``` + +Normalization must remove non-semantic differences that would otherwise produce +duplicate queue identity, such as trailing slashes. It must not expand or store +private host aliases in committed examples. 
+ +All new `REF_STATE`, `SCAN_JOB`, `SCAN_LEDGER`, and `REPO_LEASE` rows use +`repoId`. Existing `scan-all` can keep current target naming behavior until the +incremental path is migrated into shared reporting. + +### 6.1 `REF_STATE` + +Purpose: remember last observed SHA for a target/ref. + +```text +PK = REPO# +SK = REF# +entityType = REF_STATE +gsi1pk = REF_STATE#ALL +gsi1sk = ## +repoId +repoUrl +refName +lastSeenSha +updatedAt +``` + +### 6.2 `SCAN_JOB` + +Purpose: durable queue item for one commit scan. `commitRange` is optional +discovery provenance and must not change the one-job-to-one-ledger-row +contract. + +```text +PK = SCAN_JOB# +SK = META +entityType = SCAN_JOB +gsi1pk = SCAN_JOB_STATUS# +gsi1sk = ### +gsi2pk = REPO# +gsi2sk = JOB### +jobId +repoId +repoUrl +refName +oldSha +newSha +commitSha +commitRange # provenance only, for example oldSha..newSha +scannerName +scannerVersion +rulePackVersion +scannerConfigHash +priority +status +attempts +maxAttempts +workerId +leaseUntil +nextAttemptAt +createdAt +updatedAt +lastError +``` + +Allowed statuses: + +| Status | Meaning | +| --- | --- | +| `pending` | Ready to lease. | +| `leased` | Worker owns it until `leaseUntil`. | +| `completed` | Findings and ledger write finished. | +| `dead_letter` | Attempts exhausted or invalid job. | + +Retryable failures are represented as `pending` rows with incremented +`attempts`, `nextAttemptAt`, and `lastError`. `dead_letter` is the only terminal +failure state. `lease_next_scan_job` only leases `pending` jobs whose +`nextAttemptAt <= now`. 
+ +MVP deterministic job ID and dedupe key: + +```text +jobId = "scan_job_" + sha256( + repoId + "\0" + + commitSha + "\0" + + scannerName + "\0" + + scannerVersion + "\0" + + rulePackVersion + "\0" + + scannerConfigHash +)[:32] + +dedupeKey = repoId + commitSha + scannerName + scannerVersion + rulePackVersion + scannerConfigHash +``` + +If a matching `SCAN_LEDGER` row exists, `discover-updates --enqueue` does not create +a job for that commit. If the deterministic `SCAN_JOB` row already exists, +enqueue is a clean no-op. + +### 6.3 `SCAN_LEDGER` + +Purpose: prove one commit was scanned under a scanner/rule/config tuple. + +```text +PK = SCAN_LEDGER## +SK = ### +entityType = SCAN_LEDGER +gsi1pk = REPO# +gsi1sk = LEDGER### +repoId +commitSha +scannerName +scannerVersion +rulePackVersion +scannerConfigHash +scanRunId +jobId +scannedAt +findingCount +``` + +### 6.4 `REPO_LEASE` + +Purpose: prevent concurrent mutation of one local repository cache/workspace. + +```text +PK = REPO_LEASE# +SK = META +entityType = REPO_LEASE +gsi1pk = REPO_LEASE#ALL +gsi1sk = # +repoId +workerId +leaseUntil +updatedAt +``` + +Lease acquisition succeeds only when the row is absent or `leaseUntil` is in the +past. Releasing a lease is best-effort and only the owning `workerId` may release it. + +## 7. Storage Interface + +Add a queue/ledger storage capability rather than expanding the generic +`FindingStore` protocol. The runtime-facing interface should expose durable +domain operations, not low-level conditional-write ordering. + +Candidate protocol: + +```python +class IncrementalScanStore(Protocol): + def list_scan_targets(self) -> list[ScanTarget]: ... + def get_ref_state(self, repo_id: str, ref_name: str) -> RefState | None: ... + def put_ref_state(self, state: RefState) -> None: ... + def has_scan_ledger(self, key: ScanLedgerKey) -> bool: ... + def enqueue_commit_scan_job(self, job: ScanJob) -> bool: ... 
+ def lease_next_scan_job( + self, + worker_id: str, + lease_seconds: int, + now: datetime, + ) -> ScanJob | None: ... + def complete_processed_job( + self, + job: ScanJob, + findings: Sequence[Finding], + ledger: ScanLedgerEntry, + ) -> None: ... + def record_retryable_failure( + self, + job_id: str, + error: str, + next_attempt_at: datetime, + ) -> None: ... + def move_job_to_dead_letter(self, job_id: str, error: str) -> None: ... + def return_job_to_pending(self, job_id: str, reason: str) -> None: ... + def acquire_repo_lease(self, repo_id: str, worker_id: str, lease_seconds: int) -> bool: ... + def release_repo_lease(self, repo_id: str, worker_id: str) -> None: ... + def get_queue_status(self, now: datetime) -> QueueStatus: ... +``` + +`complete_processed_job` is the critical crash-recovery seam. Its adapter +implementation owns the ordered, idempotent write sequence described in +[11. Idempotency and Retry](#11-idempotency-and-retry). + +The NoSQL adapter is the MVP implementation. JSONL should raise a clear unsupported +error if selected for queue commands. + +## 8. Git Discovery Rules + +MVP discovery can use the existing checkout cache path, but the design should not +depend on the currently checked-out branch. Runtime code calls a `GitDiscovery` +adapter so tests can use a fake adapter and CLI code does not shell out directly. + +Required git operations: + +```text +git -C <repo_path> fetch --all --prune +git -C <repo_path> for-each-ref --format=%(refname) %(objectname) refs/remotes/origin +git -C <repo_path> rev-list <old_sha>..<new_sha> +git -C <repo_path> merge-base --is-ancestor <old_sha> <new_sha> +``` + +Candidate adapter: + +```python +class GitDiscovery(Protocol): + def fetch(self, repo_path: Path) -> None: ... + def list_remote_refs(self, repo_path: Path, patterns: Sequence[str]) -> list[GitRef]: ... + def is_ancestor(self, repo_path: Path, old_sha: str, new_sha: str) -> bool: ... + def list_new_commits(self, repo_path: Path, old_sha: str, new_sha: str) -> list[str]: ...
+``` + +Rules: + +- Only remote refs under configured patterns are scanned. +- If no `REF_STATE` exists for a ref in `--enqueue`, treat it like initial + observation and write state without enqueueing historical jobs unless a later + flag explicitly enables backfill. +- If a ref is deleted, mark or omit it in later status. MVP does not prune ledger. +- If `old_sha` is not an ancestor of `new_sha`, do not enqueue unbounded history + and do not advance `REF_STATE`. Report `skipped_non_fast_forward` and return + exit `2`. An operator can run `scan-all` plus an explicit initialize/reset + step later. + +## 9. Gitleaks Bounded Commit Scan + +Add a scan option that lets the worker call Gitleaks with a bounded git log +selector. The MVP worker scans one commit per job. + +Candidate: + +```python +ScanOptions( + include_history=True, + git_log_opts="<commitSha>^!", +) +``` + +`GitleaksRunner.build_command()` adds: + +```text +--log-opts <git_log_opts> +``` + +only in `git` mode. + +For MVP commit jobs, use: + +```text +--log-opts "<commitSha>^!" +``` + +Range scanning such as `--log-opts "<oldSha>..<newSha>"` is deferred until the +ledger model can represent multi-commit job completion without ambiguity. + +The exact Gitleaks version must be pinned or smoke-tested before runtime proof. The +MVP test suite should use a fake runner for unit tests and one synthetic integration +test where Gitleaks is available. + +## 10. Finding and Observation Metadata + +Worker-produced findings should include: + +- `repo.commit = commitSha` when the scan job is commit-specific. +- `gitleaks.commit` when emitted by Gitleaks. +- `scan.scan_run_id = "scan_run_" + jobId` or another deterministic value derived + from the job ID. + +MVP may keep branch/ref metadata on `SCAN_JOB` and `SCAN_LEDGER` only. A later phase +can add observation-level ref metadata if report/gate needs branch fan-out directly. + +## 11.
Idempotency and Retry + +Idempotency requirements: + +- `discover-updates --enqueue` must not enqueue a job if ledger says the commit is + already scanned for the same scanner/rule/config tuple. +- Enqueue must use the deterministic `jobId` formula from `SCAN_JOB` and a + conditional write to avoid duplicate jobs for the same tuple. +- `scan-worker` must use conditional updates to lease a pending/expired job. +- `complete_processed_job` must write findings, write `SCAN_LEDGER`, then mark + the job `completed`. Ledger-before-findings is forbidden. +- If the process crashes after findings are written but before ledger, retry + rewrites the same deterministic observations and then writes ledger. +- If the process crashes after ledger is written but before the job is marked + `completed`, the next worker detects ledger and marks the job `completed` + without rerunning Gitleaks. +- Completed jobs write exactly one `SCAN_LEDGER` row for their `commitSha`. + +Retry rules: + +- `attempts` increments on each failed processing attempt. +- Failure below `maxAttempts` returns job to `pending` with `nextAttemptAt` and + `lastError`. +- Attempts exhausted moves to `dead_letter`. +- Expired `leased` jobs are eligible for lease by another worker. +- Repository lease acquisition failure returns the job to `pending` without + incrementing `attempts`. + +## 12. Runtime and Docker Follow-up + +The core MVP can run via CLI: + +```bash +uv run security-scanner discover-updates --initialize +uv run security-scanner discover-updates --enqueue +uv run security-scanner scan-worker --once --max-jobs 10 +``` + +Docker Compose is a follow-up runtime proof after the queue/ledger loop passes +synthetic tests: + +```text +services: + dynalite: + ... + scanner-worker: + build: . + command: ["uv", "run", "security-scanner", "scan-worker", "--once", "--max-jobs", "10"] +``` + +Do not remove the existing Dynalite service. Do not treat Compose as production. 
+Do not block the core queue MVP on Compose unless the milestone explicitly +selects runtime packaging as part of the same goal. + +## 13. Tests + +Minimum test set: + +- repo identity tests for deterministic `repoId` normalization. +- item mapper tests for `REF_STATE`, `SCAN_JOB`, `SCAN_LEDGER`, `REPO_LEASE`. +- storage tests for: + - enqueue idempotency, + - pending job lease, + - expired lease reclaim, + - completed job ledger write, + - retryable failure returns to `pending`, + - ledger-present leased job is completed without rerunning scanner, + - repo lease acquire/release. +- CLI tests for: + - `discover-updates --initialize` creates ref state but no job, + - `discover-updates --enqueue` creates jobs for new commits, + - non-fast-forward refs are skipped without advancing `REF_STATE`, + - ledger skip, + - `scan-worker --once` no-op with empty queue, + - repo lease acquisition failure returns job to `pending`, + - `queue-status` grouped counts. +- Git discovery adapter tests with a fake adapter. +- Gitleaks runner unit test for `--log-opts`. +- synthetic integration test: + - create local git repo, + - initialize ref state, + - add fake secret commit, + - enqueue one job, + - process one job with fake or real Gitleaks runner, + - assert ledger prevents a second enqueue. + +## 14. Acceptance Criteria + +- Existing `scan-all` tests still pass. +- New queue commands reject unsupported storage backends clearly. +- `discover-updates --initialize` is a safe no-job baseline. +- `discover-updates --enqueue` is idempotent against ledger. +- Two workers cannot lease the same job at the same time. +- Two workers cannot process the same repo workspace at the same time. +- Completed jobs write `SCAN_LEDGER`. +- Re-running discovery after completion does not enqueue duplicate work. +- `queue-status` reports status-grouped job counts and expired repo lease counts. +- Worker completion is retry-safe across findings/ledger/job-completion crash + boundaries. 
+- Tests use only synthetic repositories and fake secrets. diff --git a/src/security_scanner/cli/app.py b/src/security_scanner/cli/app.py index 9a9ffbc..32336f6 100644 --- a/src/security_scanner/cli/app.py +++ b/src/security_scanner/cli/app.py @@ -10,18 +10,14 @@ from __future__ import annotations import argparse -import contextlib -import datetime as _dt -import fcntl import os import sys from pathlib import Path from security_scanner.catalog.scan_target import ScanTarget -from security_scanner.core.scan.options import ScanOptions -from security_scanner.targets.fetcher import FetchError, fetch_or_clone +from security_scanner.targets.fetcher import fetch_or_clone from security_scanner.targets.repo_lister import list_owner_repos, SyncError -from security_scanner.targets.manifest import Manifest, ManifestError, Target +from security_scanner.targets.manifest import ManifestError from security_scanner.targets.url_normalize import normalize_url from security_scanner.baseline.ghas_csv import ( compare_with_findings, @@ -37,19 +33,47 @@ render_evaluation_report, render_verifier_delta_report, ) +from security_scanner.runtime.doctor import render_doctor_result, run_doctor from security_scanner.runtime.finding_query import FindingQueryRequest, read_findings +from security_scanner.runtime.incremental_discovery import ( + DEFAULT_REF_PATTERNS, + DISCOVERY_MODE_ENQUEUE, + DISCOVERY_MODE_INITIALIZE, + DiscoveryScannerConfig, + IncrementalDiscoveryRequest, + IncrementalDiscoverySummary, + SubprocessGitDiscovery, + run_incremental_discovery, +) from security_scanner.runtime.local_scan import ( LocalScanRequest, RULE_PACK_VERSION, run_local_scan, ) -from security_scanner.runtime.notification_log import ( - DEFAULT_NOTIFICATION_LOG_PATH, - fatal_error_record, - finding_record, - lock_contention_record, - summary_record, - write_record, +from security_scanner.runtime.notification_log import DEFAULT_NOTIFICATION_LOG_PATH +from security_scanner.runtime.queue_status import ( + 
QueueStatusRequest, + read_queue_status, + render_queue_status, +) +from security_scanner.runtime.quickstart import ( + QuickstartRequest, + QuickstartSummary, + render_quickstart_summary, + run_quickstart, +) +from security_scanner.runtime.scan_all import ( + DEFAULT_SCAN_ALL_LOCK_PATH, + ScanAllRequest, + ScanAllResult, + run_scan_all, + utc_now_iso, +) +from security_scanner.runtime.scan_worker import ( + ScanWorkerRequest, + ScanWorkerSummary, + make_default_scanner, + run_scan_worker_once, ) from security_scanner.runtime.verify_artifact import ( VerifierConfigRequest, @@ -67,6 +91,11 @@ from security_scanner.storage.base import StorageBootstrap +DISCOVERY_SCANNER_NAME = "gitleaks" +DISCOVERY_SCANNER_VERSION = "unknown" +DISCOVERY_SCANNER_CONFIG_HASH = "default" + + def _dynamodb_config_from_args(args: argparse.Namespace) -> DynamoDbCompatibleConfig: env_config = DynamoDbCompatibleConfig.from_env() return DynamoDbCompatibleConfig( @@ -153,6 +182,36 @@ def _add_storage_args(parser: argparse.ArgumentParser, *, include_jsonl_path: st ) +def _add_incremental_storage_args(parser: argparse.ArgumentParser) -> None: + parser.add_argument( + "--storage-backend", + choices=["jsonl", "dynamodb"], + default="dynamodb", + help="Queue storage backend (default: dynamodb; jsonl is unsupported).", + ) + parser.add_argument( + "--dynamodb-endpoint-url", + metavar="URL", + default=None, + help="DynamoDB-compatible endpoint URL " + "(default: SECURITY_SCANNER_DYNAMO_ENDPOINT or http://localhost:4567).", + ) + parser.add_argument( + "--dynamodb-table", + metavar="NAME", + default=None, + help="DynamoDB-compatible table name " + "(default: SECURITY_SCANNER_DYNAMO_TABLE or SecurityScannerLocal).", + ) + parser.add_argument( + "--dynamodb-region", + metavar="REGION", + default=None, + help="DynamoDB-compatible region " + "(default: SECURITY_SCANNER_AWS_REGION or us-west-2).", + ) + + def _add_query_args(parser: argparse.ArgumentParser) -> None: parser.add_argument( "--scan-run-id", @@ 
-296,6 +355,196 @@ def cmd_init_storage(args: argparse.Namespace) -> int: return 0 +def cmd_discover_updates(args: argparse.Namespace) -> int: + """Discover changed refs and optionally enqueue incremental scan jobs.""" + if args.storage_backend != "dynamodb": + print( + "error: discover-updates supports --storage-backend dynamodb only", + file=sys.stderr, + ) + return 2 + + try: + store = _store_from_args(args) + summary = run_incremental_discovery( + IncrementalDiscoveryRequest( + mode=( + DISCOVERY_MODE_INITIALIZE + if args.initialize + else DISCOVERY_MODE_ENQUEUE + ), + store=store, + fetch_repo=fetch_or_clone, + git=SubprocessGitDiscovery(), + scanner=DiscoveryScannerConfig( + scanner_name=DISCOVERY_SCANNER_NAME, + scanner_version=DISCOVERY_SCANNER_VERSION, + rule_pack_version=RULE_PACK_VERSION, + scanner_config_hash=DISCOVERY_SCANNER_CONFIG_HASH, + ), + max_targets=args.max_targets, + ref_patterns=tuple(args.ref_pattern or DEFAULT_REF_PATTERNS), + ) + ) + except Exception as exc: # noqa: BLE001 - fatal catalog/storage/runtime error. 
+ print(f"error: discovery failed: {exc}", file=sys.stderr) + return 1 + + _render_discovery_summary(summary) + return 2 if summary.has_partial_failure else 0 + + +def _render_discovery_summary(summary: IncrementalDiscoverySummary) -> None: + print(f"targets: {summary.targets}") + print(f"fetch ok: {summary.fetch_ok}") + print(f"fetch failed: {summary.fetch_failed_count}") + for failure in summary.fetch_failed: + print(f" fetch failed: {failure.target.url} ({failure.error})") + print(f"refs observed: {summary.refs_observed}") + print(f"jobs enqueued: {summary.jobs_enqueued}") + print(f"ledger skipped: {summary.ledger_skipped}") + print(f"skipped non-fast-forward: {summary.skipped_non_fast_forward}") + + +def cmd_scan_worker(args: argparse.Namespace) -> int: + """Process queued incremental scan jobs once.""" + if not args.once: + print("error: scan-worker MVP requires --once", file=sys.stderr) + return 2 + if args.storage_backend != "dynamodb": + print( + "error: scan-worker supports --storage-backend dynamodb only", + file=sys.stderr, + ) + return 2 + + try: + store = _store_from_args(args) + summary = run_scan_worker_once( + ScanWorkerRequest( + store=store, + fetch_repo=fetch_or_clone, + scanner=make_default_scanner(), + max_jobs=args.max_jobs, + lease_seconds=args.lease_seconds, + worker_id=args.worker_id, + ) + ) + except Exception as exc: # noqa: BLE001 - fatal storage/runtime error. 
+ print(f"error: scan-worker failed: {exc}", file=sys.stderr) + return 1 + + _render_scan_worker_summary(summary) + return 2 if summary.has_permanent_failure else 0 + + +def _render_scan_worker_summary(summary: ScanWorkerSummary) -> None: + print(f"leased: {summary.leased}") + print(f"completed: {summary.completed}") + print(f"retryable: {summary.retryable}") + print(f"dead-lettered: {summary.dead_lettered}") + + +def cmd_queue_status(args: argparse.Namespace) -> int: + """Read incremental queue status counts.""" + if args.storage_backend != "dynamodb": + print( + "error: queue-status supports --storage-backend dynamodb only", + file=sys.stderr, + ) + return 2 + + try: + store = _store_from_args(args) + status = read_queue_status(QueueStatusRequest(store=store)) + except Exception as exc: # noqa: BLE001 - fatal storage/runtime error. + print(f"error: queue-status failed: {exc}", file=sys.stderr) + return 1 + + print(render_queue_status(status), end="") + return 0 + + +def cmd_doctor(args: argparse.Namespace) -> int: + """Check local runtime dependencies and optional private SCM auth.""" + result = run_doctor( + target_url=args.target_url, + private=args.private, + scm_provider=args.scm_provider, + ) + print(render_doctor_result(result), end="") + return 0 if result.ok else 1 + + +def cmd_quickstart(args: argparse.Namespace) -> int: + """Bootstrap a target and process the current commit through the queue.""" + if args.storage_backend != "dynamodb": + print( + "error: quickstart supports --storage-backend dynamodb only", + file=sys.stderr, + ) + return 2 + + doctor_result = run_doctor( + target_url=args.url, + private=args.private, + scm_provider=args.scm_provider, + ) + if not doctor_result.ok: + print(render_doctor_result(doctor_result), end="", file=sys.stderr) + return 1 + + try: + store = _store_from_args(args) + + def quickstart_fetch(url: str) -> Path: + return fetch_or_clone( + url, + allow_git_fallback=not args.private, + scm_provider=args.scm_provider, + ) 
+ + def worker_request_factory(max_jobs: int) -> ScanWorkerRequest: + return ScanWorkerRequest( + store=store, + fetch_repo=quickstart_fetch, + scanner=make_default_scanner(), + max_jobs=max_jobs, + lease_seconds=args.lease_seconds, + worker_id=args.worker_id, + ) + + summary = run_quickstart( + QuickstartRequest( + target_url=args.url, + target_name=args.name, + store=store, + fetch_repo=quickstart_fetch, + git=SubprocessGitDiscovery(), + scanner_config=DiscoveryScannerConfig( + scanner_name=DISCOVERY_SCANNER_NAME, + scanner_version=DISCOVERY_SCANNER_VERSION, + rule_pack_version=RULE_PACK_VERSION, + scanner_config_hash=DISCOVERY_SCANNER_CONFIG_HASH, + ), + worker_request_factory=worker_request_factory, + run_worker=not args.no_worker, + max_jobs=args.max_jobs, + storage_wait_seconds=args.storage_wait_seconds, + ) + ) + except Exception as exc: # noqa: BLE001 - quickstart should surface actionable failure. + print(f"error: quickstart failed: {exc}", file=sys.stderr) + return 1 + + _render_quickstart_summary(summary) + return 0 + + +def _render_quickstart_summary(summary: QuickstartSummary) -> None: + print(render_quickstart_summary(summary), end="") + + def _default_target_name(url: str) -> str: """Derive a default catalog name from a normalized URL's last two path segments.""" path = url.rsplit("://", 1)[-1] @@ -544,244 +793,62 @@ def cmd_sync(args: argparse.Namespace) -> int: # --------------------------------------------------------------------------- -SCAN_ALL_LOCK_PATH: Path = Path.home() / ".cache" / "security-scanner" / ".scan-all.lock" +SCAN_ALL_LOCK_PATH: Path = DEFAULT_SCAN_ALL_LOCK_PATH -class _LockContention(RuntimeError): - """Raised when the scan-all lock is held by another process.""" - - -@contextlib.contextmanager -def _scan_all_lock(path: Path): - """Acquire a non-blocking flock on ``path``. Releases on exit. - - Raises ``_LockContention`` if another process holds the lock. 
- """ - path.parent.mkdir(parents=True, exist_ok=True) - fd = open(path, "w") - try: - try: - fcntl.flock(fd.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB) - except BlockingIOError as exc: - fd.close() - raise _LockContention() from exc - try: - yield - finally: - try: - fcntl.flock(fd.fileno(), fcntl.LOCK_UN) - except OSError: - # best-effort unlock; fd close가 어차피 락을 해제한다 - pass - finally: - if not fd.closed: - fd.close() - - -def _utc_now_iso() -> str: - """ISO-8601 UTC timestamp without microseconds. Module-level for monkeypatch.""" - return _dt.datetime.now(_dt.timezone.utc).replace(microsecond=0).isoformat() - - -def cmd_scan_all(args: argparse.Namespace, *, now_factory=_utc_now_iso) -> int: +def cmd_scan_all(args: argparse.Namespace, *, now_factory=utc_now_iso) -> int: """Scan every registered SCAN_TARGET. See spec §3 and notification-log spec.""" - log_path: Path = Path(args.notification_log) - started_at = now_factory() - - # 1. Acquire concurrency lock. - try: - lock_cm = _scan_all_lock(SCAN_ALL_LOCK_PATH) - lock_cm.__enter__() - except _LockContention: - print("another scan-all is running", file=sys.stderr) - event_at = now_factory() - write_record( - log_path, - lock_contention_record( - event_at=event_at, - lock_path=str(SCAN_ALL_LOCK_PATH), - ), - ) - return 3 - except OSError as exc: - print(f"error: failed to acquire lock: {exc}", file=sys.stderr) - write_record( - log_path, - fatal_error_record( - event_at=now_factory(), - scan_run_id=None, - error=f"Lock acquisition failed: {exc}", - stage="lock_acquisition", - ), - ) - return 1 - - try: - # 2. Catalog lookup. Failures here are fatal (exit 1). 
- store = _store_from_args(args) - try: - targets = store.list_scan_targets() - except Exception as exc: # noqa: BLE001 — spec §8 says fatal -> exit 1 - print(f"error: catalog lookup failed: {exc}", file=sys.stderr) - write_record( - log_path, - fatal_error_record( - event_at=now_factory(), - scan_run_id=None, - error=str(exc), - stage="catalog_lookup", - ), - ) - return 1 - - # Only enabled targets are scanned; disabled rows are reported, not fetched. - all_targets = targets - targets = [t for t in all_targets if t.enabled] - skipped_disabled = [t.url for t in all_targets if not t.enabled] - - # 3. No enabled targets short-circuit. - if not targets: - print("no targets to scan") - completed_at = now_factory() - write_record( - log_path, - summary_record( - event_at=completed_at, - scan_run_id=None, - started_at=started_at, - completed_at=completed_at, - exit_code=0, - registered_count=len(all_targets), - fetched_ok=[], - fetch_failed=[], - scanned_ok=[], - scanned_failed=[], - findings_total=0, - skipped_disabled=skipped_disabled, - ), - ) - return 0 - - # 4. Fetch each repo. Per-repo failures are isolated (spec §3.2). - fetched: list[tuple[ScanTarget, Path]] = [] - fetch_failed: list[tuple[ScanTarget, str]] = [] - for target in targets: - try: - cache_path = fetch_or_clone(target.url) - except FetchError as exc: - fetch_failed.append((target, str(exc))) - continue - except Exception as exc: # noqa: BLE001 - # Spec §3.2: one repo's fetch failure must not block the rest. - # Treat any unexpected per-repo error as a fetch failure. - fetch_failed.append((target, f"unexpected error: {exc}")) - continue - fetched.append((target, cache_path)) - - # 5. Build in-memory manifest and invoke run_local_scan. 
- scan_failure_count = 0 - result = None - if fetched: - manifest = Manifest( - version=1, - targets=[ - Target(name=t.name, path=str(cache), enabled=True) - for t, cache in fetched - ], - scan=ScanOptions(), - gitleaks_config=None, - ) - destination = ( + result = run_scan_all( + ScanAllRequest( + store_factory=lambda: _store_from_args(args), + storage_backend=args.storage_backend, + output_destination=( args.output if args.storage_backend == "jsonl" else args.storage_backend - ) - request = LocalScanRequest( - manifest_path="", - output_destination=destination, - storage_backend=args.storage_backend, - dynamodb_config=( - None - if args.storage_backend == "jsonl" - else _dynamodb_config_from_args(args) - ), - rule_pack_version=RULE_PACK_VERSION, - manifest=manifest, - ) - result = run_local_scan(request) - scan_failure_count = sum( - 1 for tr in result.target_results if tr.status != "scanned" - ) - - # 6. Summary. - print(f"registered: {len(all_targets)}") - print(f"skipped (disabled): {len(skipped_disabled)}") - print(f"fetched ok: {len(fetched)}") - print(f"fetch failed: {len(fetch_failed)}") - for target, reason in fetch_failed: - print(f" fetch failed: {target.url} ({reason})") - if result is not None: - print(f"scanned ok: {result.scanned}") - print(f"scanned failed: {scan_failure_count}") - for tr in result.target_results: - if tr.status != "scanned": - print(f" scan failed: {tr.target_name} ({tr.error})") - print(f"total findings: {result.total_findings}") - else: - print("scanned ok: 0") - print("scanned failed: 0") - print("total findings: 0") - - # 7. Exit code (spec §8). - exit_code = 2 if (fetch_failed or scan_failure_count) else 0 - - # 8. Notification log — finding records first, then summary. 
def _render_scan_all_result(result: ScanAllResult) -> None:
    """Print the operator-facing report for one scan-all run.

    Lock contention, lock failure and catalog failure are reported on stderr;
    everything else goes to stdout. Nothing is returned: the caller takes the
    process exit code from ``result.exit_code``.
    """
    status = result.status

    # Early exits: runs that never reached the fetch/scan stage.
    if status == "lock_contention":
        message = result.error or "another scan-all is running"
        print(message, file=sys.stderr)
        return
    if status == "lock_failure" or status == "catalog_failure":
        print(f"error: {result.error}", file=sys.stderr)
        return
    if status == "no_targets":
        print("no targets to scan")
        return

    # Catalog and fetch counters.
    print(f"registered: {result.registered_count}")
    print(f"skipped (disabled): {len(result.skipped_disabled)}")
    print(f"fetched ok: {len(result.fetched)}")
    print(f"fetch failed: {len(result.fetch_failed)}")
    for fetch_failure in result.fetch_failed:
        print(f" fetch failed: {fetch_failure.target.url} ({fetch_failure.error})")

    outcome = result.scan_result
    if outcome is None:
        # No scan result is attached, so every scan counter is reported as zero.
        print("scanned ok: 0")
        print("scanned failed: 0")
        print("total findings: 0")
        return

    print(f"scanned ok: {outcome.scanned}")
    print(f"scanned failed: {result.scan_failure_count}")
    not_scanned = (entry for entry in outcome.target_results if entry.status != "scanned")
    for entry in not_scanned:
        print(f" scan failed: {entry.target_name} ({entry.error})")
    print(f"total findings: {outcome.total_findings}")
discover_parser.set_defaults(func=cmd_discover_updates) + + scan_worker_parser = subparsers.add_parser( + "scan-worker", + help="Lease and process incremental scan jobs.", + ) + scan_worker_parser.add_argument( + "--once", + action="store_true", + help="Process at most --max-jobs jobs and exit.", + ) + scan_worker_parser.add_argument( + "--max-jobs", + type=int, + default=1, + metavar="N", + help="Maximum jobs to process in this invocation (default: 1).", + ) + scan_worker_parser.add_argument( + "--lease-seconds", + type=int, + default=300, + metavar="N", + help="Job and repository lease duration in seconds (default: 300).", + ) + scan_worker_parser.add_argument( + "--worker-id", + default=None, + metavar="ID", + help="Worker identifier stored on leases.", + ) + _add_incremental_storage_args(scan_worker_parser) + scan_worker_parser.set_defaults(func=cmd_scan_worker) + + queue_status_parser = subparsers.add_parser( + "queue-status", + help="Show incremental queue job and lease counts.", + ) + _add_incremental_storage_args(queue_status_parser) + queue_status_parser.set_defaults(func=cmd_queue_status) + + doctor_parser = subparsers.add_parser( + "doctor", + help="Check local binaries and optional private SCM auth.", + ) + doctor_parser.add_argument( + "--target-url", + metavar="URL", + default=None, + help="Repository URL to check SCM host/auth assumptions for.", + ) + doctor_parser.add_argument( + "--private", + action="store_true", + help="Require private repository auth for --target-url.", + ) + doctor_parser.add_argument( + "--scm-provider", + choices=["auto", "github", "gitlab"], + default=os.environ.get("SECURITY_SCANNER_SCM_PROVIDER", "auto"), + help="SCM provider hint for custom domains (default: auto).", + ) + doctor_parser.set_defaults(func=cmd_doctor) + + quickstart_parser = subparsers.add_parser( + "quickstart", + help="Bootstrap storage, register a target, enqueue current tip, and run worker.", + ) + quickstart_parser.add_argument("url", help="Repository 
URL to scan.") + quickstart_parser.add_argument( + "--name", + metavar="NAME", + default=None, + help="Display name for the target.", + ) + quickstart_parser.add_argument( + "--private", + action="store_true", + help="Run private SCM auth preflight before fetching.", + ) + quickstart_parser.add_argument( + "--scm-provider", + choices=["auto", "github", "gitlab"], + default=os.environ.get("SECURITY_SCANNER_SCM_PROVIDER", "auto"), + help="SCM provider hint for custom domains (default: auto).", + ) + quickstart_parser.add_argument( + "--max-jobs", + type=int, + default=10, + metavar="N", + help="Maximum jobs the quickstart worker processes (default: 10).", + ) + quickstart_parser.add_argument( + "--lease-seconds", + type=int, + default=300, + metavar="N", + help="Job and repository lease duration in seconds (default: 300).", + ) + quickstart_parser.add_argument( + "--worker-id", + default="quickstart-worker", + metavar="ID", + help="Worker identifier stored on leases.", + ) + quickstart_parser.add_argument( + "--storage-wait-seconds", + type=int, + default=30, + metavar="N", + help="Seconds to wait for DynamoDB Local readiness (default: 30).", + ) + quickstart_parser.add_argument( + "--no-worker", + action="store_true", + help="Only bootstrap/register/enqueue/status; do not run scan-worker.", + ) + _add_incremental_storage_args(quickstart_parser) + quickstart_parser.set_defaults(func=cmd_quickstart) + # add-target subcommand add_target_parser = subparsers.add_parser( "add-target", diff --git a/src/security_scanner/core/scan/options.py b/src/security_scanner/core/scan/options.py index 85e6ab0..896eafc 100644 --- a/src/security_scanner/core/scan/options.py +++ b/src/security_scanner/core/scan/options.py @@ -24,6 +24,9 @@ class ScanOptions: Glob patterns for paths to exclude from the scan. NOTE(phase2): These are accepted but not yet applied to gitleaks command args — future milestone will map them to gitleaks allowlist flags. 
@dataclass(frozen=True)
class DoctorResult:
    """Aggregate outcome of one preflight run."""

    # Individual check results collected for this run.
    checks: list[DoctorCheck]

    @property
    def ok(self) -> bool:
        """Return True unless some recorded check failed (True when empty)."""
        for entry in self.checks:
            if not entry.ok:
                return False
        return True
def _private_auth_check(
    host: str,
    owner: str,
    repo: str,
    target_url: str,
    scm_provider: ScmProvider,
) -> DoctorCheck:
    """Check that the matching SCM CLI can see a private repository.

    Args:
        host: Host component that ``parse_owner_repo`` extracted from the URL.
        owner: Repository owner or namespace.
        repo: Repository name.
        target_url: Repository URL as supplied by the operator.
        scm_provider: Provider hint; ``resolve_scm_provider`` turns it into a
            concrete provider for ``host``.

    Returns:
        A ``DoctorCheck`` named ``"github auth"``, ``"gitlab auth"`` or
        ``"scm auth"``. It fails when the provider cannot be resolved, when the
        provider CLI is not on PATH, or when the CLI's ``repo view`` fails.
    """
    try:
        provider = resolve_scm_provider(host, scm_provider)
    except UnsupportedHostError as exc:
        return DoctorCheck("scm auth", False, str(exc))

    if provider == "github":
        if shutil.which("gh") is None:
            return DoctorCheck(
                "github auth",
                False,
                "install gh, then set GH_TOKEN or run gh auth login",
            )
        # A bare OWNER/REPO is resolved against gh's default host, so a
        # repository on any other host has to be addressed as HOST/OWNER/REPO;
        # otherwise the check would probe the wrong server.
        if host == "github.com":
            repo_ref = f"{owner}/{repo}"
        else:
            repo_ref = f"{host}/{owner}/{repo}"
        return _auth_status_check(
            name="github auth",
            cmd=["gh", "repo", "view", repo_ref, "--json", "name"],
            failure="set GH_TOKEN or run gh auth login, then verify target access",
        )

    if provider == "gitlab":
        if shutil.which("glab") is None:
            return DoctorCheck(
                "gitlab auth",
                False,
                "install glab, then set GITLAB_TOKEN or run glab auth login",
            )
        env = os.environ.copy()
        if host != "gitlab.com":
            # netloc may carry "user:token@" userinfo, which must not end up in
            # GITLAB_HOST; keep only host[:port]. Fall back to the parsed host
            # when the URL form yields no netloc at all.
            netloc = urlsplit(target_url).netloc
            env["GITLAB_HOST"] = netloc.rpartition("@")[2] or host
        return _auth_status_check(
            name="gitlab auth",
            cmd=["glab", "repo", "view", target_url],
            failure="set GITLAB_TOKEN or run glab auth login, then verify target access",
            env=env,
        )

    return DoctorCheck("scm auth", False, f"unsupported SCM provider: {provider}")
class GitDiscovery(Protocol):
    """Structural interface for the git operations incremental discovery uses.

    ``SubprocessGitDiscovery`` is the implementation backed by the git binary;
    any object providing these four methods can be passed as the ``git`` field
    of a discovery request.
    """

    def fetch(self, repo_path: Path) -> None:
        """Fetch remote refs into the local cache at ``repo_path``."""

    def list_remote_refs(
        self,
        repo_path: Path,
        patterns: Sequence[str],
    ) -> list[GitRef]:
        """Return the remote refs in ``repo_path`` whose names match ``patterns``.

        Each ``GitRef`` pairs a ref name with the commit SHA it points to.
        """

    def is_ancestor(self, repo_path: Path, old_sha: str, new_sha: str) -> bool:
        """Return whether ``old_sha`` is an ancestor of ``new_sha``.

        Discovery counts a False result as a non-fast-forward move of the ref.
        """

    def list_new_commits(
        self,
        repo_path: Path,
        old_sha: str,
        new_sha: str,
    ) -> list[str]:
        """Return the commit SHAs in the ``old_sha..new_sha`` range."""
def run_incremental_discovery(
    request: IncrementalDiscoveryRequest,
) -> IncrementalDiscoverySummary:
    """Fetch enabled targets, observe refs, and optionally enqueue commit jobs.

    Every enabled catalog target (capped by ``request.max_targets``) is fetched
    and its remote refs are listed. Per ref:

    * In ``initialize`` mode, or when the ref has no stored ``RefState``, only
      the observation is stored.
    * In ``enqueue`` mode, a ref whose tip moved fast-forward gets one job per
      commit in ``old..new`` that is not already in the scan ledger, and its
      ``RefState`` is then advanced.
    * A ref that moved non-fast-forward is counted and left untouched.

    Failures are isolated per target. An error while fetching, listing refs, or
    comparing commits is recorded as a ``FetchFailure`` for that target, and
    discovery continues with the next target. A target is counted in
    ``fetch_ok`` only when all of its refs were processed.

    Args:
        request: Discovery inputs (mode, store, fetch callable, git adapter,
            scanner tuple, ref patterns and clock).

    Returns:
        Counters describing what was observed, enqueued and skipped.

    Raises:
        ValueError: If ``request.mode`` is not a supported discovery mode.
    """
    if request.mode not in {DISCOVERY_MODE_INITIALIZE, DISCOVERY_MODE_ENQUEUE}:
        raise ValueError(f"unsupported discovery mode: {request.mode}")

    targets = [target for target in request.store.list_scan_targets() if target.enabled]
    if request.max_targets is not None:
        targets = targets[: request.max_targets]

    fetch_ok = 0
    fetch_failed: list[FetchFailure] = []
    refs_observed = 0
    jobs_enqueued = 0
    ledger_skipped = 0
    skipped_non_fast_forward = 0

    for target in targets:
        try:
            repo_path = request.fetch_repo(target.url)
        except Exception as exc:  # noqa: BLE001 - per-target failures are isolated.
            fetch_failed.append(FetchFailure(target=target, error=str(exc)))
            continue

        repo_id = repo_id_for_scan_target_url(target.url)
        repo_url = normalize_scan_target_url(target.url)
        try:
            refs = request.git.list_remote_refs(repo_path, request.ref_patterns)
        except Exception as exc:  # noqa: BLE001 - per-target discovery failure.
            fetch_failed.append(FetchFailure(target=target, error=str(exc)))
            continue

        refs_observed += len(refs)

        for git_ref in refs:
            current_state = request.store.get_ref_state(repo_id, git_ref.ref_name)
            observed_state = RefState(
                repo_id=repo_id,
                repo_url=repo_url,
                ref_name=git_ref.ref_name,
                last_seen_sha=git_ref.commit_sha,
                updated_at=_now(request),
            )
            if request.mode == DISCOVERY_MODE_INITIALIZE or current_state is None:
                request.store.put_ref_state(observed_state)
                continue

            old_sha = current_state.last_seen_sha
            new_sha = git_ref.commit_sha
            if old_sha == new_sha:
                continue

            # Comparing commits can fail on its own (the git adapter raises when
            # the underlying command errors). Treat that like any other
            # per-target discovery failure instead of aborting every remaining
            # target.
            try:
                fast_forward = request.git.is_ancestor(repo_path, old_sha, new_sha)
                commits = (
                    request.git.list_new_commits(repo_path, old_sha, new_sha)
                    if fast_forward
                    else []
                )
            except Exception as exc:  # noqa: BLE001 - per-target discovery failure.
                fetch_failed.append(
                    FetchFailure(target=target, error=f"{git_ref.ref_name}: {exc}")
                )
                break  # one failure entry per target; skip its remaining refs

            if not fast_forward:
                # The stored state is intentionally not advanced here.
                skipped_non_fast_forward += 1
                continue

            commit_range = f"{old_sha}..{new_sha}"
            for commit_sha in commits:
                key = _scan_ledger_key(
                    repo_id=repo_id,
                    commit_sha=commit_sha,
                    scanner=request.scanner,
                )
                if request.store.has_scan_ledger(key):
                    ledger_skipped += 1
                    continue
                job = _scan_job_for_commit(
                    repo_id=repo_id,
                    repo_url=repo_url,
                    ref_name=git_ref.ref_name,
                    old_sha=old_sha,
                    new_sha=new_sha,
                    commit_sha=commit_sha,
                    commit_range=commit_range,
                    scanner=request.scanner,
                    now=_now(request),
                )
                if request.store.enqueue_commit_scan_job(job):
                    jobs_enqueued += 1

            # Advance the ref only after all of its commits were offered to the queue.
            request.store.put_ref_state(observed_state)
        else:
            # Runs only when the ref loop was not aborted by ``break``.
            fetch_ok += 1

    return IncrementalDiscoverySummary(
        targets=len(targets),
        fetch_ok=fetch_ok,
        fetch_failed=fetch_failed,
        refs_observed=refs_observed,
        jobs_enqueued=jobs_enqueued,
        ledger_skipped=ledger_skipped,
        skipped_non_fast_forward=skipped_non_fast_forward,
    )
scanner_config_hash=scanner.scanner_config_hash, + priority=DEFAULT_JOB_PRIORITY, + status=SCAN_JOB_STATUS_PENDING, + attempts=0, + max_attempts=DEFAULT_MAX_ATTEMPTS, + worker_id=None, + lease_until=None, + next_attempt_at=now, + created_at=now, + updated_at=now, + ) + + +def _ref_matches_patterns(ref_name: str, patterns: Sequence[str]) -> bool: + if not patterns: + return True + return any(fnmatch.fnmatch(ref_name, pattern) for pattern in patterns) + + +def _now(request: IncrementalDiscoveryRequest) -> dt.datetime: + value = request.now_factory() + if value.tzinfo is None: + return value.replace(tzinfo=dt.UTC) + return value.astimezone(dt.UTC).replace(microsecond=0) diff --git a/src/security_scanner/runtime/queue_status.py b/src/security_scanner/runtime/queue_status.py new file mode 100644 index 0000000..c55b342 --- /dev/null +++ b/src/security_scanner/runtime/queue_status.py @@ -0,0 +1,61 @@ +"""Incremental queue status runtime.""" + +from __future__ import annotations + +import datetime as dt +from dataclasses import dataclass +from typing import Callable + +from security_scanner.storage.adapters.nosql_db.items import ( + SCAN_JOB_STATUS_COMPLETED, + SCAN_JOB_STATUS_DEAD_LETTER, + SCAN_JOB_STATUS_LEASED, + SCAN_JOB_STATUS_PENDING, +) +from security_scanner.storage.base import IncrementalScanStore, QueueStatus + + +QUEUE_STATUS_ORDER = ( + SCAN_JOB_STATUS_PENDING, + SCAN_JOB_STATUS_LEASED, + SCAN_JOB_STATUS_COMPLETED, + SCAN_JOB_STATUS_DEAD_LETTER, +) + + +@dataclass(frozen=True) +class QueueStatusRequest: + """Inputs for one queue-status read.""" + + store: IncrementalScanStore + now_factory: Callable[[], dt.datetime] = lambda: dt.datetime.now(dt.UTC).replace( + microsecond=0 + ) + + +def read_queue_status(request: QueueStatusRequest) -> QueueStatus: + """Return queue status counts using the storage model's clock-sensitive view.""" + return request.store.get_queue_status(_now(request)) + + +def render_queue_status(status: QueueStatus) -> str: + """Render a 
stable human-readable queue status report.""" + lines = [ + f"{job_status}: {status.job_counts_by_status.get(job_status, 0)}" + for job_status in QUEUE_STATUS_ORDER + ] + extra_statuses = sorted(set(status.job_counts_by_status) - set(QUEUE_STATUS_ORDER)) + lines.extend( + f"{job_status}: {status.job_counts_by_status[job_status]}" + for job_status in extra_statuses + ) + lines.append(f"expired job leases: {status.expired_job_leases}") + lines.append(f"expired repo leases: {status.expired_repo_leases}") + return "\n".join(lines) + "\n" + + +def _now(request: QueueStatusRequest) -> dt.datetime: + value = request.now_factory() + if value.tzinfo is None: + return value.replace(tzinfo=dt.UTC) + return value.astimezone(dt.UTC).replace(microsecond=0) diff --git a/src/security_scanner/runtime/quickstart.py b/src/security_scanner/runtime/quickstart.py new file mode 100644 index 0000000..994bcab --- /dev/null +++ b/src/security_scanner/runtime/quickstart.py @@ -0,0 +1,269 @@ +"""Turnkey local quickstart runtime.""" + +from __future__ import annotations + +import datetime as dt +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Callable, Protocol + +from security_scanner.catalog.scan_target import ScanTarget +from security_scanner.runtime.incremental_discovery import ( + DEFAULT_JOB_PRIORITY, + DEFAULT_MAX_ATTEMPTS, + DEFAULT_REF_PATTERNS, + DISCOVERY_MODE_ENQUEUE, + DISCOVERY_MODE_INITIALIZE, + DiscoveryScannerConfig, + GitDiscovery, + IncrementalDiscoveryRequest, + IncrementalDiscoverySummary, + run_incremental_discovery, +) +from security_scanner.runtime.queue_status import QueueStatusRequest, read_queue_status +from security_scanner.runtime.scan_worker import ( + ScanWorkerRequest, + ScanWorkerSummary, +) +from security_scanner.storage.adapters.nosql_db.items import ( + SCAN_JOB_STATUS_PENDING, + normalize_scan_target_url, + repo_id_for_scan_target_url, + scan_job_id_for, +) +from security_scanner.storage.base import QueueStatus, 
def run_quickstart(request: QuickstartRequest) -> QuickstartSummary:
    """Bootstrap storage, register a target, enqueue current tip, and process jobs.

    Steps, in order:

    1. Bootstrap the store, retrying for up to ``storage_wait_seconds``.
    2. Register the normalized target URL as an enabled ``ScanTarget``.
    3. Run discovery in ``initialize`` mode, scoped to that single target.
    4. Enqueue one job per current ref tip.
    5. Run discovery in ``enqueue`` mode, scoped to that single target.
    6. Optionally run the worker once for up to ``max_jobs`` jobs.
    7. Read the final queue status from the unscoped store.

    Args:
        request: Quickstart inputs.

    Returns:
        Summary combining both discovery passes, the worker result (``None``
        when the worker was not run) and the final queue status.

    Raises:
        ValueError: If ``run_worker`` is true but no ``worker_request_factory``
            was supplied. This is checked before any storage is touched.
    """
    # Fail fast: a misconfigured request must not bootstrap storage, register a
    # target and enqueue jobs before the problem is reported.
    if request.run_worker and request.worker_request_factory is None:
        raise ValueError("worker_request_factory is required when run_worker is true")

    _bootstrap_with_wait(request.store, request.storage_wait_seconds)

    target = ScanTarget(
        url=normalize_scan_target_url(request.target_url),
        name=request.target_name or _default_target_name(request.target_url),
        enabled=True,
    )
    request.store.put_scan_target(target)
    scoped_store = _SingleTargetStore(request.store, target)

    def discover(mode: str) -> IncrementalDiscoverySummary:
        # Both discovery passes differ only in their mode.
        return run_incremental_discovery(
            IncrementalDiscoveryRequest(
                mode=mode,
                store=scoped_store,
                fetch_repo=request.fetch_repo,
                git=request.git,
                scanner=request.scanner_config,
                ref_patterns=DEFAULT_REF_PATTERNS,
                now_factory=request.now_factory,
            )
        )

    initialized = discover(DISCOVERY_MODE_INITIALIZE)
    current_jobs_enqueued = _enqueue_current_tip_jobs(request, target)
    enqueued = discover(DISCOVERY_MODE_ENQUEUE)

    worker_summary = None
    if request.run_worker:
        worker_summary = run_worker_once(request.worker_request_factory(request.max_jobs))

    status = read_queue_status(
        QueueStatusRequest(store=request.store, now_factory=request.now_factory)
    )
    return QuickstartSummary(
        target_url=target.url,
        initialized=initialized,
        current_jobs_enqueued=current_jobs_enqueued,
        enqueued=enqueued,
        worker=worker_summary,
        status=status,
    )
def _enqueue_current_tip_jobs(
    request: QuickstartRequest,
    target: ScanTarget,
) -> int:
    """Enqueue one scan job per remote ref tip and return how many were accepted.

    The repository is fetched through ``request.fetch_repo``; refs matching
    ``DEFAULT_REF_PATTERNS`` are listed and each tip commit is turned into a
    job. Only jobs for which the store reports a truthy enqueue result are
    counted.
    """
    checkout = request.fetch_repo(target.url)
    remote_refs = request.git.list_remote_refs(checkout, DEFAULT_REF_PATTERNS)
    repo_id = repo_id_for_scan_target_url(target.url)
    repo_url = normalize_scan_target_url(target.url)

    # Jobs are built lazily so each one is timestamped right before its enqueue.
    tip_jobs = (
        _current_tip_job(
            repo_id=repo_id,
            repo_url=repo_url,
            ref_name=ref.ref_name,
            commit_sha=ref.commit_sha,
            scanner=request.scanner_config,
            now=_now(request),
        )
        for ref in remote_refs
    )
    return sum(1 for job in tip_jobs if request.store.enqueue_commit_scan_job(job))
store.bootstrap() + return + except Exception as exc: # noqa: BLE001 - backend readiness is time-bound. + last_error = exc + if time.monotonic() >= deadline: + break + time.sleep(1) + raise RuntimeError( + "storage bootstrap failed; start DynamoDB Local and retry" + ) from last_error + + +def _default_target_name(url: str) -> str: + path = normalize_scan_target_url(url).rsplit("://", 1)[-1] + segments = [s for s in path.split("/") if s] + if len(segments) >= 3: + return "/".join(segments[-2:]) + if len(segments) == 2: + return segments[-1] + return url + + +def _now(request: QuickstartRequest) -> dt.datetime: + value = request.now_factory() + if value.tzinfo is None: + return value.replace(tzinfo=dt.UTC) + return value.astimezone(dt.UTC).replace(microsecond=0) diff --git a/src/security_scanner/runtime/scan_all.py b/src/security_scanner/runtime/scan_all.py new file mode 100644 index 0000000..4da5e0d --- /dev/null +++ b/src/security_scanner/runtime/scan_all.py @@ -0,0 +1,374 @@ +"""Scan every enabled catalog target as one batch runtime use case.""" + +from __future__ import annotations + +import contextlib +import datetime as dt +import fcntl +from dataclasses import dataclass, field +from pathlib import Path +from typing import Callable, Protocol + +from security_scanner.catalog.scan_target import ScanTarget +from security_scanner.core.scan.options import ScanOptions +from security_scanner.runtime.local_scan import ( + LocalScanRequest, + LocalScanResult, + RULE_PACK_VERSION, + run_local_scan, +) +from security_scanner.runtime.notification_log import ( + fatal_error_record, + finding_record, + lock_contention_record, + summary_record, + write_record, +) +from security_scanner.storage.adapters.nosql_db.transport import ( + DynamoDbCompatibleConfig, +) +from security_scanner.targets.fetcher import FetchError, fetch_or_clone +from security_scanner.targets.manifest import Manifest, Target + + +DEFAULT_SCAN_ALL_LOCK_PATH: Path = ( + Path.home() / ".cache" / 
"security-scanner" / ".scan-all.lock" +) + +FetchRepo = Callable[[str], Path] +RunLocalScan = Callable[[LocalScanRequest], LocalScanResult] +NowFactory = Callable[[], str] +NotificationWriter = Callable[[Path, dict], None] +StoreFactory = Callable[[], "ScanTargetCatalog"] + + +class ScanTargetCatalog(Protocol): + def list_scan_targets(self) -> list[ScanTarget]: + """Return catalog targets.""" + + +def utc_now_iso() -> str: + """ISO-8601 UTC timestamp without microseconds.""" + return dt.datetime.now(dt.UTC).replace(microsecond=0).isoformat() + + +def default_notification_writer() -> NotificationWriter: + """Return the production notification writer.""" + return write_record + + +@dataclass(frozen=True) +class ScanAllFetchFailure: + """Per-target fetch failure captured without aborting the batch.""" + + target: ScanTarget + error: str + + +@dataclass(frozen=True) +class ScanAllRequest: + """Inputs needed to run scan-all without CLI presentation concerns.""" + + store_factory: StoreFactory + storage_backend: str + output_destination: str + notification_log_path: str | Path + dynamodb_config: DynamoDbCompatibleConfig | None = None + lock_path: str | Path = DEFAULT_SCAN_ALL_LOCK_PATH + fetch_repo: FetchRepo = fetch_or_clone + scan_runner: RunLocalScan = run_local_scan + now_factory: NowFactory = utc_now_iso + notification_writer: NotificationWriter = field( + default_factory=default_notification_writer + ) + rule_pack_version: str = RULE_PACK_VERSION + + +@dataclass(frozen=True) +class ScanAllResult: + """Renderable scan-all outcome.""" + + exit_code: int + status: str + registered_count: int = 0 + skipped_disabled: list[str] = field(default_factory=list) + fetched: list[tuple[ScanTarget, Path]] = field(default_factory=list) + fetch_failed: list[ScanAllFetchFailure] = field(default_factory=list) + scan_result: LocalScanResult | None = None + scan_failure_count: int = 0 + error: str | None = None + + +class _LockContention(RuntimeError): + """Raised when the scan-all 
lock is held by another process.""" + + +def run_scan_all(request: ScanAllRequest) -> ScanAllResult: + """Run scan-all batch orchestration and write notification log records.""" + log_path = Path(request.notification_log_path) + lock_path = Path(request.lock_path) + started_at = request.now_factory() + + try: + lock_cm = _scan_all_lock(lock_path) + lock_cm.__enter__() + except _LockContention: + event_at = request.now_factory() + request.notification_writer( + log_path, + lock_contention_record( + event_at=event_at, + lock_path=str(lock_path), + ), + ) + return ScanAllResult( + exit_code=3, + status="lock_contention", + error="another scan-all is running", + ) + except OSError as exc: + request.notification_writer( + log_path, + fatal_error_record( + event_at=request.now_factory(), + scan_run_id=None, + error=f"Lock acquisition failed: {exc}", + stage="lock_acquisition", + ), + ) + return ScanAllResult( + exit_code=1, + status="lock_failure", + error=f"failed to acquire lock: {exc}", + ) + + try: + return _run_scan_all_locked(request, started_at, log_path) + finally: + lock_cm.__exit__(None, None, None) + + +@contextlib.contextmanager +def _scan_all_lock(path: Path): + """Acquire a non-blocking flock on ``path``. Releases on exit.""" + path.parent.mkdir(parents=True, exist_ok=True) + fd = open(path, "w") + try: + try: + fcntl.flock(fd.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB) + except BlockingIOError as exc: + fd.close() + raise _LockContention() from exc + try: + yield + finally: + try: + fcntl.flock(fd.fileno(), fcntl.LOCK_UN) + except OSError: + # The lock file descriptor is closing anyway; scan work already ended. + pass + finally: + if not fd.closed: + fd.close() + + +def _run_scan_all_locked( + request: ScanAllRequest, + started_at: str, + log_path: Path, +) -> ScanAllResult: + try: + all_targets = request.store_factory().list_scan_targets() + except Exception as exc: # noqa: BLE001 - storage fatal per spec. 
+ request.notification_writer( + log_path, + fatal_error_record( + event_at=request.now_factory(), + scan_run_id=None, + error=str(exc), + stage="catalog_lookup", + ), + ) + return ScanAllResult( + exit_code=1, + status="catalog_failure", + error=f"catalog lookup failed: {exc}", + ) + + targets = [target for target in all_targets if target.enabled] + skipped_disabled = [target.url for target in all_targets if not target.enabled] + + if not targets: + completed_at = request.now_factory() + request.notification_writer( + log_path, + summary_record( + event_at=completed_at, + scan_run_id=None, + started_at=started_at, + completed_at=completed_at, + exit_code=0, + registered_count=len(all_targets), + fetched_ok=[], + fetch_failed=[], + scanned_ok=[], + scanned_failed=[], + findings_total=0, + skipped_disabled=skipped_disabled, + ), + ) + return ScanAllResult( + exit_code=0, + status="no_targets", + registered_count=len(all_targets), + skipped_disabled=skipped_disabled, + ) + + fetched, fetch_failed = _fetch_targets(targets, request.fetch_repo) + scan_result = _run_local_scan_for_fetched(request, fetched) + scan_failure_count = _scan_failure_count(scan_result) + exit_code = 2 if (fetch_failed or scan_failure_count) else 0 + + _write_finding_and_summary_records( + request=request, + log_path=log_path, + started_at=started_at, + all_targets=all_targets, + skipped_disabled=skipped_disabled, + fetched=fetched, + fetch_failed=fetch_failed, + scan_result=scan_result, + exit_code=exit_code, + ) + + return ScanAllResult( + exit_code=exit_code, + status="completed", + registered_count=len(all_targets), + skipped_disabled=skipped_disabled, + fetched=fetched, + fetch_failed=fetch_failed, + scan_result=scan_result, + scan_failure_count=scan_failure_count, + ) + + +def _fetch_targets( + targets: list[ScanTarget], + fetch_repo: FetchRepo, +) -> tuple[list[tuple[ScanTarget, Path]], list[ScanAllFetchFailure]]: + fetched: list[tuple[ScanTarget, Path]] = [] + fetch_failed: 
list[ScanAllFetchFailure] = [] + for target in targets: + try: + cache_path = fetch_repo(target.url) + except FetchError as exc: + fetch_failed.append(ScanAllFetchFailure(target=target, error=str(exc))) + continue + except Exception as exc: # noqa: BLE001 - isolate per-target fetch errors. + fetch_failed.append( + ScanAllFetchFailure( + target=target, + error=f"unexpected error: {exc}", + ) + ) + continue + fetched.append((target, cache_path)) + return fetched, fetch_failed + + +def _run_local_scan_for_fetched( + request: ScanAllRequest, + fetched: list[tuple[ScanTarget, Path]], +) -> LocalScanResult | None: + if not fetched: + return None + manifest = Manifest( + version=1, + targets=[ + Target(name=target.name, path=str(cache), enabled=True) + for target, cache in fetched + ], + scan=ScanOptions(), + gitleaks_config=None, + ) + return request.scan_runner( + LocalScanRequest( + manifest_path="", + output_destination=request.output_destination, + storage_backend=request.storage_backend, + dynamodb_config=( + None if request.storage_backend == "jsonl" else request.dynamodb_config + ), + rule_pack_version=request.rule_pack_version, + manifest=manifest, + ) + ) + + +def _scan_failure_count(result: LocalScanResult | None) -> int: + if result is None: + return 0 + return sum(1 for target in result.target_results if target.status != "scanned") + + +def _write_finding_and_summary_records( + *, + request: ScanAllRequest, + log_path: Path, + started_at: str, + all_targets: list[ScanTarget], + skipped_disabled: list[str], + fetched: list[tuple[ScanTarget, Path]], + fetch_failed: list[ScanAllFetchFailure], + scan_result: LocalScanResult | None, + exit_code: int, +) -> None: + completed_at = request.now_factory() + scan_run_id = scan_result.scan_run_id if scan_result is not None else None + scan_at_iso = scan_result.scan_at_iso if scan_result is not None else None + scanned_ok: list[str] = [] + scanned_failed: list[str] = [] + + if scan_result is not None: + for target_result 
in scan_result.target_results: + if target_result.status == "scanned": + scanned_ok.append(target_result.target_name) + for finding in target_result.findings: + request.notification_writer( + log_path, + finding_record( + event_at=request.now_factory(), + scan_run_id=scan_run_id, + finding_id=finding.finding_id, + repo=finding.repo.full_name, + rule_id=finding.rule_id, + file_path=finding.location.file_path, + line_start=finding.location.line_start, + fingerprint=finding.fingerprint, + scan_at=scan_at_iso or completed_at, + ), + ) + else: + scanned_failed.append(target_result.target_name) + + request.notification_writer( + log_path, + summary_record( + event_at=completed_at, + scan_run_id=scan_run_id, + started_at=started_at, + completed_at=completed_at, + exit_code=exit_code, + registered_count=len(all_targets), + fetched_ok=[target.url for target, _ in fetched], + fetch_failed=[ + {"url": failure.target.url, "error": failure.error} + for failure in fetch_failed + ], + scanned_ok=scanned_ok, + scanned_failed=scanned_failed, + findings_total=(scan_result.total_findings if scan_result else 0), + skipped_disabled=skipped_disabled, + ), + ) diff --git a/src/security_scanner/runtime/scan_worker.py b/src/security_scanner/runtime/scan_worker.py new file mode 100644 index 0000000..c8fb1a7 --- /dev/null +++ b/src/security_scanner/runtime/scan_worker.py @@ -0,0 +1,196 @@ +"""Incremental scan worker runtime.""" + +from __future__ import annotations + +import datetime as dt +import uuid +from dataclasses import dataclass +from pathlib import Path +from typing import Callable, Protocol + +from security_scanner.core.finding.model import Finding +from security_scanner.core.scan.options import ScanOptions +from security_scanner.scanners.gitleaks.scanner import GitleaksScanner +from security_scanner.storage.base import ( + IncrementalScanStore, + ScanJob, + ScanLedgerEntry, +) + + +DEFAULT_LEASE_SECONDS = 300 +DEFAULT_RETRY_DELAY_SECONDS = 60 + + +class CommitScanner(Protocol): + 
def run_scan_worker_once(request: ScanWorkerRequest) -> ScanWorkerSummary:
    """Lease and process at most ``request.max_jobs`` queued scan jobs.

    For each leased job: if its ledger entry already exists the job is
    completed with no findings; otherwise a repo lease is acquired, the
    repository is fetched, the single commit is scanned, and the result is
    recorded. Any exception during fetch/scan/record is reported to the store
    as a retryable failure. The loop stops early when no job can be leased or
    when the repo lease is unavailable.
    """
    store = request.store
    worker_id = request.worker_id or f"worker_{uuid.uuid4().hex[:12]}"
    tally = {"leased": 0, "completed": 0, "retryable": 0, "dead_lettered": 0}
    remaining = max(request.max_jobs, 0)

    while remaining > 0:
        remaining -= 1
        leased_at = _now(request)
        job = store.lease_next_scan_job(
            worker_id=worker_id,
            lease_seconds=request.lease_seconds,
            now=leased_at,
        )
        if job is None:
            break
        tally["leased"] += 1

        if store.has_scan_ledger(job.ledger_key):
            # Already scanned under this ledger key: just close the job out.
            store.complete_processed_job(
                job,
                findings=[],
                ledger=_ledger_for_job(job, scanned_at=leased_at, finding_count=0),
            )
            tally["completed"] += 1
            continue

        lease_granted = store.acquire_repo_lease(
            job.repo_id,
            worker_id,
            request.lease_seconds,
        )
        if not lease_granted:
            store.return_job_to_pending(job.job_id, "repo lease unavailable")
            break

        try:
            checkout = request.fetch_repo(job.repo_url)
            raw_findings = request.scanner.scan(
                repo_full_name=job.repo_id,
                root=checkout,
                scan_options=ScanOptions(
                    include_history=True,
                    git_log_opts=f"{job.commit_sha}^!",
                ),
                scan_run_id=_scan_run_id_for_job(job),
                rule_pack_version=job.rule_pack_version,
            )
            stamped = [
                _finding_with_commit(item, job.commit_sha) for item in raw_findings
            ]
            store.complete_processed_job(
                job,
                findings=stamped,
                ledger=_ledger_for_job(
                    job,
                    scanned_at=_now(request),
                    finding_count=len(stamped),
                ),
            )
            tally["completed"] += 1
        except Exception as exc:  # noqa: BLE001 - scanner/runtime failure is retryable until exhausted.
            exhausted = job.attempts + 1 >= job.max_attempts
            tally["dead_lettered" if exhausted else "retryable"] += 1
            store.record_retryable_failure(
                job.job_id,
                error=str(exc),
                next_attempt_at=_now(request)
                + dt.timedelta(seconds=request.retry_delay_seconds),
            )
        finally:
            store.release_repo_lease(job.repo_id, worker_id)

    return ScanWorkerSummary(**tally)
finding + data = finding.to_dict() + data["repo"] = {**data["repo"], "commit": commit_sha} + return Finding.from_dict(data) + + +def _now(request: ScanWorkerRequest) -> dt.datetime: + value = request.now_factory() + if value.tzinfo is None: + return value.replace(tzinfo=dt.UTC) + return value.astimezone(dt.UTC).replace(microsecond=0) diff --git a/src/security_scanner/scanners/gitleaks/runner.py b/src/security_scanner/scanners/gitleaks/runner.py index bab544b..8b0a2cf 100644 --- a/src/security_scanner/scanners/gitleaks/runner.py +++ b/src/security_scanner/scanners/gitleaks/runner.py @@ -32,6 +32,8 @@ def build_command( """Build the gitleaks argv list.""" opts = scan_options or ScanOptions() mode = "git" if opts.include_history else "dir" + if mode == "dir" and opts.git_log_opts: + raise ValueError("git_log_opts is only supported in git scan mode") cmd: list[str] = [ self.binary, @@ -43,6 +45,8 @@ def build_command( "--exit-code", "0", ] + if opts.git_log_opts: + cmd.extend(["--log-opts", opts.git_log_opts]) if self.config_path: cmd.extend(["--config", self.config_path]) cmd.append(str(root)) diff --git a/src/security_scanner/storage/adapters/nosql_db/access.py b/src/security_scanner/storage/adapters/nosql_db/access.py index 1fd628d..ecfe278 100644 --- a/src/security_scanner/storage/adapters/nosql_db/access.py +++ b/src/security_scanner/storage/adapters/nosql_db/access.py @@ -9,10 +9,14 @@ from security_scanner.storage.adapters.nosql_db.items import ( RepoMetadata, ScanRunSummary, + repo_lease_from_item, repo_metadata_from_item, + scan_job_from_item, + scan_ledger_entry_from_item, scan_run_summary_from_item, scan_target_from_item, ) +from security_scanner.storage.base import RepoLease, ScanJob, ScanLedgerEntry def items_to_findings(items: Iterable[dict[str, Any]]) -> list[Finding]: @@ -47,6 +51,35 @@ def items_to_scan_targets(items: Iterable[dict[str, Any]]) -> list[ScanTarget]: return targets +def items_to_scan_jobs(items: Iterable[dict[str, Any]]) -> list[ScanJob]: 
def items_to_scan_ledger_entries(
    items: Iterable[dict[str, Any]],
) -> list[ScanLedgerEntry]:
    """Convert the ``SCAN_LEDGER`` items among ``items`` into ledger entries.

    Items whose ``entityType`` is anything else (or missing) are skipped.
    """
    return [
        scan_ledger_entry_from_item(row)
        for row in items
        if row.get("entityType") == "SCAN_LEDGER"
    ]
def normalize_scan_target_url(url: str) -> str:
    """Normalize the scan target URL used for incremental repo identity.

    For URLs with both a scheme and a network location, the scheme and the
    host[:port] are lowercased, trailing slashes are removed from the path,
    and the query and fragment are dropped. Any ``user[:password]@`` prefix is
    kept byte-for-byte because userinfo is case-sensitive. Other values (for
    example scp-style remotes or local paths) are only stripped of
    surrounding whitespace and trailing slashes.
    """
    value = url.strip()
    parsed = urlsplit(value)
    if not (parsed.scheme and parsed.netloc):
        return value.rstrip("/")

    # Lowercase only the host[:port] part; rpartition leaves userinfo empty
    # (and the separator empty) when the URL carries no credentials.
    userinfo, separator, host_port = parsed.netloc.rpartition("@")
    netloc = f"{userinfo}{separator}{host_port.lower()}"
    return urlunsplit(
        (
            parsed.scheme.lower(),
            netloc,
            parsed.path.rstrip("/"),
            "",
            "",
        )
    )
def ref_state_to_item(state: RefState) -> dict[str, Any]:
    """Serialize a ref-state record into its table item.

    The item is keyed by ``REPO#<repo_id>`` / ``REF#<ref_name>`` and carries a
    ``gsi1`` key pair that groups all ref states under ``REF_STATE#ALL``,
    ordered by the ISO form of ``updated_at``.
    """
    stamp = datetime_to_iso(state.updated_at)
    repo_id = state.repo_id
    ref_name = state.ref_name

    item: dict[str, Any] = {}
    # Primary key.
    item["PK"] = f"REPO#{repo_id}"
    item["SK"] = f"REF#{ref_name}"
    item["entityType"] = "REF_STATE"
    # Index key pair.
    item["gsi1pk"] = "REF_STATE#ALL"
    item["gsi1sk"] = f"{stamp}#{repo_id}#{ref_name}"
    # Plain attributes.
    item["repoId"] = repo_id
    item["repoUrl"] = state.repo_url
    item["refName"] = ref_name
    item["lastSeenSha"] = state.last_seen_sha
    item["updatedAt"] = stamp
    return item
def scan_ledger_key_to_key(key: ScanLedgerKey) -> dict[str, str]:
    """Build the table key (``PK``/``SK``) addressing one SCAN_LEDGER row.

    ``PK`` combines the repo ID and commit SHA; ``SK`` joins the scanner
    name, scanner version, rule-pack version and config hash with ``#``.
    """
    sort_parts = (
        key.scanner_name,
        key.scanner_version,
        key.rule_pack_version,
        key.scanner_config_hash,
    )
    partition = f"SCAN_LEDGER#{key.repo_id}#{key.commit_sha}"
    return {"PK": partition, "SK": "#".join(f"{part}" for part in sort_parts)}
def repo_lease_to_item(lease: RepoLease) -> dict[str, Any]:
    """Serialize a repo lease into its table item.

    The item is keyed by ``REPO_LEASE#<repo_id>`` / ``META`` and carries a
    ``gsi1`` key pair that groups all leases under ``REPO_LEASE#ALL``,
    ordered by the ISO form of ``lease_until``.
    """
    expires = datetime_to_iso(lease.lease_until)
    touched = datetime_to_iso(lease.updated_at)
    repo_id = lease.repo_id

    table_key = {"PK": f"REPO_LEASE#{repo_id}", "SK": "META"}
    index_key = {"gsi1pk": "REPO_LEASE#ALL", "gsi1sk": f"{expires}#{repo_id}"}
    attributes = {
        "repoId": repo_id,
        "workerId": lease.worker_id,
        "leaseUntil": expires,
        "updatedAt": touched,
    }
    return {**table_key, "entityType": "REPO_LEASE", **index_key, **attributes}
b/src/security_scanner/storage/adapters/nosql_db/store.py index 0c4a68d..7faaa5d 100644 --- a/src/security_scanner/storage/adapters/nosql_db/store.py +++ b/src/security_scanner/storage/adapters/nosql_db/store.py @@ -2,14 +2,19 @@ from __future__ import annotations -from typing import Any, Iterable +import datetime as dt +from collections import Counter +from dataclasses import replace +from typing import Any, Iterable, Sequence from security_scanner.catalog.scan_target import ScanTarget from security_scanner.core.finding.model import Finding from security_scanner.storage.adapters.nosql_db.access import ( items_to_finding_state_by_id, items_to_findings, + items_to_repo_leases, items_to_repo_metadata, + items_to_scan_jobs, items_to_scan_run_summaries, items_to_scan_targets, merge_finding_states, @@ -18,11 +23,23 @@ ) from security_scanner.storage.adapters.nosql_db.items import ( RepoMetadata, + SCAN_JOB_STATUS_COMPLETED, + SCAN_JOB_STATUS_DEAD_LETTER, + SCAN_JOB_STATUS_LEASED, + SCAN_JOB_STATUS_PENDING, STATE_SCOPE_GLOBAL, ScanRunSummary, counts_by_category, + datetime_to_iso, finding_to_items, + ref_state_from_item, + ref_state_to_item, + repo_lease_to_item, repo_metadata_to_item, + scan_job_from_item, + scan_job_to_item, + scan_ledger_entry_to_item, + scan_ledger_key_to_key, scan_run_summary_to_item, scan_target_to_item, split_target_name, @@ -35,7 +52,15 @@ build_table_schema, make_boto3_resource_and_client, ) -from security_scanner.storage.base import TargetScanResult +from security_scanner.storage.base import ( + QueueStatus, + RefState, + RepoLease, + ScanJob, + ScanLedgerEntry, + ScanLedgerKey, + TargetScanResult, +) class DynamoDbCompatibleFindingStore: @@ -115,6 +140,231 @@ def list_scan_targets(self) -> list[ScanTarget]: def delete_scan_target(self, url: str) -> None: self._table.delete_item(Key={"PK": f"SCAN_TARGET#{url}", "SK": "META"}) + def get_ref_state(self, repo_id: str, ref_name: str) -> RefState | None: + response = self._table.get_item( + 
def lease_next_scan_job(
    self,
    worker_id: str,
    lease_seconds: int,
    now: dt.datetime,
) -> ScanJob | None:
    """Lease the first eligible job for ``worker_id``, or return ``None``.

    Pending and leased jobs are read, ordered with
    ``_scan_job_lease_sort_key``, and tried in that order; jobs that are not
    lease-eligible at ``now`` are skipped, and the first successful lease
    attempt wins.
    """
    current = _ensure_utc(now)
    backlog = sorted(
        [
            *self._read_scan_jobs_by_status(SCAN_JOB_STATUS_PENDING),
            *self._read_scan_jobs_by_status(SCAN_JOB_STATUS_LEASED),
        ],
        key=_scan_job_lease_sort_key,
    )
    for candidate in backlog:
        if _scan_job_is_lease_eligible(candidate, current):
            claimed = self._try_lease_scan_job(
                job=candidate,
                worker_id=worker_id,
                lease_seconds=lease_seconds,
                now=current,
            )
            if claimed is not None:
                return claimed
    return None
Item=scan_job_to_item(completed_job), + ConditionExpression=( + "attribute_exists(PK) AND attribute_exists(SK) AND " + "(#status = :leased OR #status = :completed)" + ), + ExpressionAttributeNames={"#status": "status"}, + ExpressionAttributeValues={ + ":leased": SCAN_JOB_STATUS_LEASED, + ":completed": SCAN_JOB_STATUS_COMPLETED, + }, + ) + except Exception as exc: + if _is_conditional_check_failure(exc): + return + raise + + def record_retryable_failure( + self, + job_id: str, + error: str, + next_attempt_at: dt.datetime, + ) -> None: + job = self._get_scan_job(job_id) + if job is None: + return + attempts = job.attempts + 1 + now = _now() + if attempts >= job.max_attempts: + updated = replace( + job, + status=SCAN_JOB_STATUS_DEAD_LETTER, + attempts=attempts, + worker_id=None, + lease_until=None, + updated_at=now, + last_error=error, + ) + else: + updated = replace( + job, + status=SCAN_JOB_STATUS_PENDING, + attempts=attempts, + worker_id=None, + lease_until=None, + next_attempt_at=_ensure_utc(next_attempt_at), + updated_at=now, + last_error=error, + ) + self._put_existing_scan_job(updated) + + def move_job_to_dead_letter(self, job_id: str, error: str) -> None: + job = self._get_scan_job(job_id) + if job is None: + return + updated = replace( + job, + status=SCAN_JOB_STATUS_DEAD_LETTER, + worker_id=None, + lease_until=None, + updated_at=_now(), + last_error=error, + ) + self._put_existing_scan_job(updated) + + def return_job_to_pending(self, job_id: str, reason: str) -> None: + job = self._get_scan_job(job_id) + if job is None: + return + now = _now() + updated = replace( + job, + status=SCAN_JOB_STATUS_PENDING, + worker_id=None, + lease_until=None, + next_attempt_at=now, + updated_at=now, + last_error=reason, + ) + self._put_existing_scan_job(updated) + + def acquire_repo_lease( + self, + repo_id: str, + worker_id: str, + lease_seconds: int, + ) -> bool: + now = _now() + lease = RepoLease( + repo_id=repo_id, + worker_id=worker_id, + lease_until=now + 
dt.timedelta(seconds=lease_seconds), + updated_at=now, + ) + try: + self._table.put_item( + Item=repo_lease_to_item(lease), + ConditionExpression="attribute_not_exists(PK) OR leaseUntil <= :now", + ExpressionAttributeValues={":now": datetime_to_iso(now)}, + ) + return True + except Exception as exc: + if _is_conditional_check_failure(exc): + return False + raise + + def release_repo_lease(self, repo_id: str, worker_id: str) -> None: + try: + self._table.delete_item( + Key={"PK": f"REPO_LEASE#{repo_id}", "SK": "META"}, + ConditionExpression="workerId = :worker_id", + ExpressionAttributeValues={":worker_id": worker_id}, + ) + except Exception as exc: + if _is_conditional_check_failure(exc): + return + raise + + def get_queue_status(self, now: dt.datetime) -> QueueStatus: + now = _ensure_utc(now) + job_items = scan_all_pages( + self._table, + FilterExpression="entityType = :entity_type", + ExpressionAttributeValues={":entity_type": "SCAN_JOB"}, + ) + jobs = items_to_scan_jobs(job_items) + status_counts = Counter(job.status for job in jobs) + expired_job_leases = sum( + 1 + for job in jobs + if job.status == SCAN_JOB_STATUS_LEASED + and job.lease_until is not None + and job.lease_until <= now + ) + lease_items = scan_all_pages( + self._table, + FilterExpression="entityType = :entity_type", + ExpressionAttributeValues={":entity_type": "REPO_LEASE"}, + ) + repo_leases = items_to_repo_leases(lease_items) + expired_repo_leases = sum(1 for lease in repo_leases if lease.lease_until <= now) + return QueueStatus( + job_counts_by_status=dict(status_counts), + expired_job_leases=expired_job_leases, + expired_repo_leases=expired_repo_leases, + ) + def write_scan_result(self, result: TargetScanResult) -> None: findings = list(result.findings) self.extend(findings) @@ -242,6 +492,83 @@ def _put_state_item_if_absent(self, item: dict[str, Any]) -> None: return raise + def _get_scan_job(self, job_id: str) -> ScanJob | None: + response = self._table.get_item(Key={"PK": 
f"SCAN_JOB#{job_id}", "SK": "META"}) + item = response.get("Item") + if not item or item.get("entityType") != "SCAN_JOB": + return None + return scan_job_from_item(item) + + def _put_existing_scan_job(self, job: ScanJob) -> None: + try: + self._table.put_item( + Item=scan_job_to_item(job), + ConditionExpression="attribute_exists(PK) AND attribute_exists(SK)", + ) + except Exception as exc: + if _is_conditional_check_failure(exc): + return + raise + + def _put_scan_ledger_if_absent(self, ledger: ScanLedgerEntry) -> None: + try: + self._table.put_item( + Item=scan_ledger_entry_to_item(ledger), + ConditionExpression=( + "attribute_not_exists(PK) AND attribute_not_exists(SK)" + ), + ) + except Exception as exc: + if _is_conditional_check_failure(exc): + return + raise + + def _read_scan_jobs_by_status(self, status: str) -> list[ScanJob]: + items = query_all_pages( + self._table, + IndexName=GSI1_NAME, + KeyConditionExpression="gsi1pk = :pk", + ExpressionAttributeValues={ + ":pk": f"SCAN_JOB_STATUS#{status}", + }, + ) + return items_to_scan_jobs(items) + + def _try_lease_scan_job( + self, + *, + job: ScanJob, + worker_id: str, + lease_seconds: int, + now: dt.datetime, + ) -> ScanJob | None: + leased = replace( + job, + status=SCAN_JOB_STATUS_LEASED, + worker_id=worker_id, + lease_until=now + dt.timedelta(seconds=lease_seconds), + updated_at=now, + ) + try: + self._table.put_item( + Item=scan_job_to_item(leased), + ConditionExpression=( + "(#status = :pending AND nextAttemptAt <= :now) OR " + "(#status = :leased AND leaseUntil <= :now)" + ), + ExpressionAttributeNames={"#status": "status"}, + ExpressionAttributeValues={ + ":pending": SCAN_JOB_STATUS_PENDING, + ":leased": SCAN_JOB_STATUS_LEASED, + ":now": datetime_to_iso(now), + }, + ) + return leased + except Exception as exc: + if _is_conditional_check_failure(exc): + return None + raise + def _batch_read_finding_states( self, finding_ids: Iterable[str], @@ -278,3 +605,35 @@ def _is_conditional_check_failure(exc: 
Exception) -> bool: if error.get("Code") == "ConditionalCheckFailedException": return True return exc.__class__.__name__ == "ConditionalCheckFailedException" + + +def _ensure_utc(value: dt.datetime) -> dt.datetime: + """Return a timezone-aware UTC datetime.""" + if value.tzinfo is None: + return value.replace(tzinfo=dt.UTC) + return value.astimezone(dt.UTC) + + +def _now() -> dt.datetime: + """Return current UTC time without microseconds.""" + return dt.datetime.now(dt.UTC).replace(microsecond=0) + + +def _scan_job_is_lease_eligible(job: ScanJob, now: dt.datetime) -> bool: + """Return whether a job can be leased at this instant.""" + if job.status == SCAN_JOB_STATUS_PENDING: + return job.next_attempt_at <= now + if job.status == SCAN_JOB_STATUS_LEASED and job.lease_until is not None: + return job.lease_until <= now + return False + + +def _scan_job_lease_sort_key(job: ScanJob) -> tuple[dt.datetime, int, dt.datetime, str]: + """Sort eligible jobs by the timestamp that made them available.""" + available_at = job.lease_until if job.status == SCAN_JOB_STATUS_LEASED else None + return ( + available_at or job.next_attempt_at, + job.priority, + job.created_at, + job.job_id, + ) diff --git a/src/security_scanner/storage/base.py b/src/security_scanner/storage/base.py index d595c62..edb05b8 100644 --- a/src/security_scanner/storage/base.py +++ b/src/security_scanner/storage/base.py @@ -2,10 +2,12 @@ from __future__ import annotations +import datetime as dt from dataclasses import dataclass from pathlib import Path from typing import Iterable, Protocol, Sequence, runtime_checkable +from security_scanner.catalog.scan_target import ScanTarget from security_scanner.core.finding.model import Finding @@ -21,6 +23,116 @@ class TargetScanResult: scan_at_iso: str +@dataclass(frozen=True) +class RefState: + """Last observed commit for one repository ref.""" + + repo_id: str + repo_url: str + ref_name: str + last_seen_sha: str + updated_at: dt.datetime + + +@dataclass(frozen=True) 
+class ScanLedgerKey: + """Commit scanner tuple that defines incremental scan completion.""" + + repo_id: str + commit_sha: str + scanner_name: str + scanner_version: str + rule_pack_version: str + scanner_config_hash: str + + +@dataclass(frozen=True) +class ScanLedgerEntry: + """Durable proof that one commit was scanned by one scanner tuple.""" + + repo_id: str + commit_sha: str + scanner_name: str + scanner_version: str + rule_pack_version: str + scanner_config_hash: str + scan_run_id: str + job_id: str + scanned_at: dt.datetime + finding_count: int + + @property + def key(self) -> ScanLedgerKey: + """Return the ledger identity fields.""" + return ScanLedgerKey( + repo_id=self.repo_id, + commit_sha=self.commit_sha, + scanner_name=self.scanner_name, + scanner_version=self.scanner_version, + rule_pack_version=self.rule_pack_version, + scanner_config_hash=self.scanner_config_hash, + ) + + +@dataclass(frozen=True) +class ScanJob: + """Durable queue item for one commit scan.""" + + job_id: str + repo_id: str + repo_url: str + ref_name: str + old_sha: str | None + new_sha: str + commit_sha: str + commit_range: str | None + scanner_name: str + scanner_version: str + rule_pack_version: str + scanner_config_hash: str + priority: int + status: str + attempts: int + max_attempts: int + worker_id: str | None + lease_until: dt.datetime | None + next_attempt_at: dt.datetime + created_at: dt.datetime + updated_at: dt.datetime + last_error: str | None = None + + @property + def ledger_key(self) -> ScanLedgerKey: + """Return the commit scanner tuple this job should complete.""" + return ScanLedgerKey( + repo_id=self.repo_id, + commit_sha=self.commit_sha, + scanner_name=self.scanner_name, + scanner_version=self.scanner_version, + rule_pack_version=self.rule_pack_version, + scanner_config_hash=self.scanner_config_hash, + ) + + +@dataclass(frozen=True) +class RepoLease: + """Bounded lease that protects one repository workspace.""" + + repo_id: str + worker_id: str + lease_until: 
dt.datetime + updated_at: dt.datetime + + +@dataclass(frozen=True) +class QueueStatus: + """Status counts for incremental queue visibility.""" + + job_counts_by_status: dict[str, int] + expired_job_leases: int + expired_repo_leases: int + + @runtime_checkable class FindingReader(Protocol): """Read-only finding access expected by report/gate/evaluation callers.""" @@ -67,3 +179,67 @@ def extend(self, findings: Iterable[Finding]) -> None: def clear(self) -> None: """Clear the store when the backend supports destructive reset.""" + + +@runtime_checkable +class IncrementalScanStore(Protocol): + """Durable queue/ledger storage capability for incremental scanning.""" + + def list_scan_targets(self) -> list[ScanTarget]: + """Return configured scan targets.""" + + def get_ref_state(self, repo_id: str, ref_name: str) -> RefState | None: + """Return the last observed state for a repository ref.""" + + def put_ref_state(self, state: RefState) -> None: + """Persist the last observed state for a repository ref.""" + + def has_scan_ledger(self, key: ScanLedgerKey) -> bool: + """Return whether a commit scanner tuple is already completed.""" + + def enqueue_commit_scan_job(self, job: ScanJob) -> bool: + """Create a commit scan job, returning False for clean idempotent skips.""" + + def lease_next_scan_job( + self, + worker_id: str, + lease_seconds: int, + now: dt.datetime, + ) -> ScanJob | None: + """Lease the next eligible pending or expired job.""" + + def complete_processed_job( + self, + job: ScanJob, + findings: Sequence[Finding], + ledger: ScanLedgerEntry, + ) -> None: + """Persist findings, ledger, then mark the job completed.""" + + def record_retryable_failure( + self, + job_id: str, + error: str, + next_attempt_at: dt.datetime, + ) -> None: + """Return a failed job to pending or dead-letter it when attempts exhaust.""" + + def move_job_to_dead_letter(self, job_id: str, error: str) -> None: + """Move a job to the terminal failure state.""" + + def 
return_job_to_pending(self, job_id: str, reason: str) -> None: + """Return a leased job to pending without incrementing attempts.""" + + def acquire_repo_lease( + self, + repo_id: str, + worker_id: str, + lease_seconds: int, + ) -> bool: + """Acquire a bounded repository lease when absent or expired.""" + + def release_repo_lease(self, repo_id: str, worker_id: str) -> None: + """Release a repository lease only when owned by the worker.""" + + def get_queue_status(self, now: dt.datetime) -> QueueStatus: + """Return queue status counts and expired lease counts.""" diff --git a/src/security_scanner/targets/fetcher.py b/src/security_scanner/targets/fetcher.py index b2d10cc..565f47d 100644 --- a/src/security_scanner/targets/fetcher.py +++ b/src/security_scanner/targets/fetcher.py @@ -1,18 +1,24 @@ """SCM fetcher dispatch for the SCAN_TARGET catalog (spec §6). -Dispatches a normalized repository URL to ``gh`` or ``glab`` to clone, or to -``git fetch`` when the cache path already exists. Authentication is inherited -from the parent process environment; this module never reads or forwards -tokens explicitly. +Dispatches a normalized repository URL to ``gh`` or ``glab`` to clone, then +falls back to unauthenticated HTTPS ``git clone`` for public repositories. When +the cache path already exists, it uses ``git fetch``. Authentication is inherited +from the parent process environment; this module never reads or forwards tokens +explicitly. 
""" from __future__ import annotations +import os import subprocess from pathlib import Path +from typing import Literal from urllib.parse import urlsplit +ScmProvider = Literal["auto", "github", "gitlab"] + + # --------------------------------------------------------------------------- # Exceptions # --------------------------------------------------------------------------- @@ -40,45 +46,98 @@ def _default_cache_root() -> Path: return Path.home() / ".cache" / "security-scanner" / "repos" -def _dispatch_tool(host: str) -> str: +def resolve_scm_provider(host: str, scm_provider: ScmProvider = "auto") -> str: + if scm_provider == "github": + return "github" + if scm_provider == "gitlab": + return "gitlab" + if scm_provider != "auto": + raise UnsupportedHostError(f"unsupported SCM provider: {scm_provider}") + if host == "github.com": + return "github" + if host == "gitlab.com" or "gitlab" in host.split("."): + return "gitlab" + raise UnsupportedHostError( + f"unsupported SCM host: {host}; pass --scm-provider for custom domains" + ) + + +def _dispatch_tool(provider: str) -> str: + if provider == "github": return "gh" - if host == "gitlab.com" or host.startswith("gitlab."): + if provider == "gitlab": return "glab" - raise UnsupportedHostError(f"unsupported SCM host: {host}") + raise UnsupportedHostError(f"unsupported SCM provider: {provider}") -def _parse_owner_repo(url: str) -> tuple[str, str, str]: +def parse_owner_repo(url: str) -> tuple[str, str, str]: + """Parse a normalized SCM URL into host, owner path, and repository name.""" parts = urlsplit(url) host = parts.hostname or "" - + path = parts.path if path.endswith(".git"): path = path[:-4] - + path_segments = [seg for seg in path.split("/") if seg] - + if len(path_segments) < 2: raise UnsupportedHostError( f"URL path does not match /owner/[subgroups]/repo shape: {url}" ) - + if host == "github.com" and len(path_segments) > 2: raise UnsupportedHostError( f"GitHub URLs must have exactly 2 path segments 
(owner/repo): {url}" ) - + owner = "/".join(path_segments[:-1]) repo = path_segments[-1] return host, owner, repo +def _clone_commands( + provider: str, + host: str, + owner: str, + repo: str, + url: str, + cache_path: Path, +) -> list[tuple[list[str], dict[str, str] | None]]: + repo_ref = _repo_ref(provider, host, owner, repo, url) + primary = [_dispatch_tool(provider), "repo", "clone", repo_ref, str(cache_path)] + fallback = ["git", "clone", url, str(cache_path)] + return [(primary, _provider_env(provider, host, url)), (fallback, None)] + + +def _repo_ref(provider: str, host: str, owner: str, repo: str, url: str) -> str: + if provider == "gitlab" and host != "gitlab.com": + return url + return f"{owner}/{repo}" + + +def _provider_env(provider: str, host: str, url: str) -> dict[str, str] | None: + if provider != "gitlab" or host == "gitlab.com": + return None + parts = urlsplit(url) + env = os.environ.copy() + env["GITLAB_HOST"] = parts.netloc + return env + + # --------------------------------------------------------------------------- # Public API # --------------------------------------------------------------------------- -def fetch_or_clone(url: str, cache_root: Path | None = None) -> Path: +def fetch_or_clone( + url: str, + cache_root: Path | None = None, + *, + allow_git_fallback: bool = True, + scm_provider: ScmProvider = "auto", +) -> Path: """Clone the repo to the cache or fetch updates if already cloned. Parameters @@ -88,6 +147,13 @@ def fetch_or_clone(url: str, cache_root: Path | None = None) -> Path: cache_root: Override the cache root directory. Defaults to ``~/.cache/security-scanner/repos``. + allow_git_fallback: + When True, a failed ``gh``/``glab`` clone attempt falls back to + unauthenticated HTTPS ``git clone``. Set False after a private-auth + preflight when falling back would only hide the intended auth failure. + scm_provider: + Provider hint for custom domains. 
``auto`` infers GitHub.com and common + GitLab host names; pass ``gitlab`` for arbitrary GitLab domains. Returns ------- @@ -97,31 +163,48 @@ def fetch_or_clone(url: str, cache_root: Path | None = None) -> Path: Raises ------ UnsupportedHostError - If the host is not ``github.com`` / ``gitlab.*`` or the URL path - does not look like ``/owner/repo``. + If the host/provider combination is unsupported or the URL path does + not look like ``/owner/repo``. FetchError If the dispatched subprocess fails or its binary is missing. """ - host, owner, repo = _parse_owner_repo(url) - tool = _dispatch_tool(host) + host, owner, repo = parse_owner_repo(url) + provider = resolve_scm_provider(host, scm_provider) root = cache_root if cache_root is not None else _default_cache_root() - cache_path = root / owner / repo + cache_path = root / host / owner / repo if (cache_path / ".git").exists(): - cmd = ["git", "-C", str(cache_path), "fetch", "--all", "--prune"] + commands = [(["git", "-C", str(cache_path), "fetch", "--all", "--prune"], None)] else: cache_path.parent.mkdir(parents=True, exist_ok=True) - cmd = [tool, "repo", "clone", f"{owner}/{repo}", str(cache_path)] - - try: - subprocess.run(cmd, check=True, capture_output=True, text=True) - except FileNotFoundError as exc: - raise FetchError(f"{cmd[0]} binary not found on PATH") from exc - except subprocess.CalledProcessError as exc: - detail = (exc.stderr or exc.stdout or "no process output").strip() - raise FetchError( - f"{cmd[0]} failed with exit code {exc.returncode}: {detail}" - ) from exc - - return cache_path + commands = _clone_commands(provider, host, owner, repo, url, cache_path) + if not allow_git_fallback: + commands = commands[:1] + + errors: list[str] = [] + last_exc: Exception | None = None + for cmd, env in commands: + try: + kwargs = {"check": True, "capture_output": True, "text": True} + if env is not None: + kwargs["env"] = env + subprocess.run(cmd, **kwargs) + return cache_path + except FileNotFoundError as exc: 
+ errors.append(f"{cmd[0]} binary not found on PATH") + last_exc = exc + except subprocess.CalledProcessError as exc: + detail = (exc.stderr or exc.stdout or "no process output").strip() + errors.append(f"{cmd[0]} failed with exit code {exc.returncode}: {detail}") + last_exc = exc + + message = "; fallback failed: ".join(errors) + if last_exc is not None: + raise FetchError(message) from last_exc + raise FetchError("no clone command attempted") + + +def _parse_owner_repo(url: str) -> tuple[str, str, str]: + """Backward-compatible private alias for tests or older imports.""" + return parse_owner_repo(url) diff --git a/src/security_scanner/targets/manifest.py b/src/security_scanner/targets/manifest.py index 0090068..43baa7c 100644 --- a/src/security_scanner/targets/manifest.py +++ b/src/security_scanner/targets/manifest.py @@ -111,12 +111,17 @@ def _parse_scan(raw: Any) -> ScanOptions: include_history = raw.get("include_history", True) exclude = raw.get("exclude", []) + git_log_opts = raw.get("git_log_opts") enable_noise_filter = raw.get("enable_noise_filter", True) if not isinstance(include_history, bool): raise ManifestError( f"scan.include_history must be a boolean, got {include_history!r}" ) + if git_log_opts is not None and not isinstance(git_log_opts, str): + raise ManifestError( + f"scan.git_log_opts must be a string, got {git_log_opts!r}" + ) if not isinstance(enable_noise_filter, bool): raise ManifestError( "scan.enable_noise_filter must be a boolean, " @@ -135,6 +140,7 @@ def _parse_scan(raw: Any) -> ScanOptions: return ScanOptions( include_history=include_history, exclude=list(exclude), + git_log_opts=git_log_opts, enable_noise_filter=enable_noise_filter, ) diff --git a/tests/test_cli_discover_updates.py b/tests/test_cli_discover_updates.py new file mode 100644 index 0000000..adf8974 --- /dev/null +++ b/tests/test_cli_discover_updates.py @@ -0,0 +1,232 @@ +"""CLI tests for discover-updates.""" + +from __future__ import annotations + +import datetime as dt 
+from pathlib import Path + +import pytest + +from security_scanner.catalog.scan_target import ScanTarget +from security_scanner.cli import main +from security_scanner.runtime.incremental_discovery import GitRef +from security_scanner.storage.adapters.nosql_db.items import repo_id_for_scan_target_url +from security_scanner.storage.base import RefState, ScanJob, ScanLedgerKey + + +NOW = dt.datetime(2026, 6, 12, 11, 0, tzinfo=dt.UTC) +TARGET = ScanTarget( + url="https://github.com/example-org/example-repo", + name="example-org/example-repo", + enabled=True, +) +REF_MAIN = "refs/remotes/origin/main" +OLD_SHA = "1" * 40 +NEW_SHA = "2" * 40 + + +class FakeStore: + bootstrap_required = True + + def __init__(self, targets: list[ScanTarget]) -> None: + self.targets = targets + self.ref_states: dict[tuple[str, str], RefState] = {} + self.jobs: dict[str, ScanJob] = {} + self.ledger: set[ScanLedgerKey] = set() + + def list_scan_targets(self) -> list[ScanTarget]: + return list(self.targets) + + def get_ref_state(self, repo_id: str, ref_name: str) -> RefState | None: + return self.ref_states.get((repo_id, ref_name)) + + def put_ref_state(self, state: RefState) -> None: + self.ref_states[(state.repo_id, state.ref_name)] = state + + def has_scan_ledger(self, key: ScanLedgerKey) -> bool: + return key in self.ledger + + def enqueue_commit_scan_job(self, job: ScanJob) -> bool: + if job.job_id in self.jobs: + return False + self.jobs[job.job_id] = job + return True + + +class FakeGit: + def __init__(self, repo_path: Path, refs: list[GitRef]) -> None: + self.repo_path = repo_path + self.refs = refs + self.ancestor = True + self.commits = [NEW_SHA] + + def fetch(self, repo_path: Path) -> None: + return None + + def list_remote_refs(self, repo_path: Path, patterns) -> list[GitRef]: + assert repo_path == self.repo_path + return self.refs + + def is_ancestor(self, repo_path: Path, old_sha: str, new_sha: str) -> bool: + assert (repo_path, old_sha, new_sha) == (self.repo_path, OLD_SHA, 
NEW_SHA) + return self.ancestor + + def list_new_commits(self, repo_path: Path, old_sha: str, new_sha: str) -> list[str]: + assert (repo_path, old_sha, new_sha) == (self.repo_path, OLD_SHA, NEW_SHA) + return list(self.commits) + + +@pytest.fixture +def cli_discovery(monkeypatch, tmp_path): + store = FakeStore([TARGET]) + repo_path = tmp_path / "example-repo" + git = FakeGit(repo_path, [GitRef(ref_name=REF_MAIN, commit_sha=NEW_SHA)]) + + monkeypatch.setattr( + "security_scanner.cli.app.create_finding_store", + lambda backend, **kwargs: store, + ) + monkeypatch.setattr( + "security_scanner.cli.app.fetch_or_clone", + lambda url: repo_path, + ) + monkeypatch.setattr( + "security_scanner.cli.app.SubprocessGitDiscovery", + lambda: git, + ) + return store, repo_path, git + + +def test_discover_updates_initialize_writes_ref_state_only(cli_discovery, capsys): + store, _, _ = cli_discovery + + exit_code = main( + ["discover-updates", "--initialize", "--storage-backend", "dynamodb"] + ) + + captured = capsys.readouterr() + repo_id = repo_id_for_scan_target_url(TARGET.url) + assert exit_code == 0 + assert "targets: 1" in captured.out + assert "jobs enqueued: 0" in captured.out + assert store.jobs == {} + assert store.ref_states[(repo_id, REF_MAIN)].last_seen_sha == NEW_SHA + + +def test_discover_updates_defaults_to_dynamodb_storage(cli_discovery, capsys): + store, _, _ = cli_discovery + + exit_code = main(["discover-updates", "--initialize"]) + + captured = capsys.readouterr() + repo_id = repo_id_for_scan_target_url(TARGET.url) + assert exit_code == 0 + assert "targets: 1" in captured.out + assert store.ref_states[(repo_id, REF_MAIN)].last_seen_sha == NEW_SHA + + +def test_discover_updates_enqueue_creates_job_and_advances_ref(cli_discovery, capsys): + store, _, _ = cli_discovery + repo_id = repo_id_for_scan_target_url(TARGET.url) + store.put_ref_state( + RefState( + repo_id=repo_id, + repo_url=TARGET.url, + ref_name=REF_MAIN, + last_seen_sha=OLD_SHA, + updated_at=NOW, + ) + ) 
+ + exit_code = main(["discover-updates", "--enqueue", "--storage-backend", "dynamodb"]) + + captured = capsys.readouterr() + assert exit_code == 0 + assert "jobs enqueued: 1" in captured.out + assert [job.commit_sha for job in store.jobs.values()] == [NEW_SHA] + assert store.ref_states[(repo_id, REF_MAIN)].last_seen_sha == NEW_SHA + + +def test_discover_updates_fetch_failure_exits_two(monkeypatch, tmp_path, capsys): + store = FakeStore( + [ + TARGET, + ScanTarget( + url="https://github.com/example-org/failing-repo", + name="example-org/failing-repo", + enabled=True, + ), + ] + ) + repo_path = tmp_path / "example-repo" + git = FakeGit(repo_path, [GitRef(ref_name=REF_MAIN, commit_sha=NEW_SHA)]) + monkeypatch.setattr( + "security_scanner.cli.app.create_finding_store", + lambda backend, **kwargs: store, + ) + + def fetch_or_fail(url: str) -> Path: + if url.endswith("/failing-repo"): + raise RuntimeError("synthetic fetch failure") + return repo_path + + monkeypatch.setattr("security_scanner.cli.app.fetch_or_clone", fetch_or_fail) + monkeypatch.setattr("security_scanner.cli.app.SubprocessGitDiscovery", lambda: git) + + exit_code = main( + ["discover-updates", "--initialize", "--storage-backend", "dynamodb"] + ) + + captured = capsys.readouterr() + assert exit_code == 2 + assert "fetch failed: 1" in captured.out + assert "synthetic fetch failure" in captured.out + + +def test_discover_updates_rejects_jsonl_backend(capsys): + exit_code = main(["discover-updates", "--initialize", "--storage-backend", "jsonl"]) + + captured = capsys.readouterr() + assert exit_code == 2 + assert "dynamodb only" in captured.err + + +def test_discover_updates_requires_exactly_one_mode(): + with pytest.raises(SystemExit) as exc_info: + main(["discover-updates", "--storage-backend", "dynamodb"]) + + assert exc_info.value.code == 2 + + +def test_discover_updates_rejects_both_modes(): + with pytest.raises(SystemExit) as exc_info: + main( + [ + "discover-updates", + "--initialize", + "--enqueue", + 
"--storage-backend", + "dynamodb", + ] + ) + + assert exc_info.value.code == 2 + + +def test_discover_updates_storage_failure_exits_one(monkeypatch, capsys): + class BrokenStore: + def list_scan_targets(self): + raise RuntimeError("synthetic storage failure") + + monkeypatch.setattr( + "security_scanner.cli.app.create_finding_store", + lambda backend, **kwargs: BrokenStore(), + ) + + exit_code = main( + ["discover-updates", "--initialize", "--storage-backend", "dynamodb"] + ) + + captured = capsys.readouterr() + assert exit_code == 1 + assert "synthetic storage failure" in captured.err diff --git a/tests/test_cli_quickstart.py b/tests/test_cli_quickstart.py new file mode 100644 index 0000000..1142728 --- /dev/null +++ b/tests/test_cli_quickstart.py @@ -0,0 +1,127 @@ +"""CLI tests for quickstart.""" + +from __future__ import annotations + +from pathlib import Path + +from security_scanner.cli import main +from security_scanner.runtime.quickstart import QuickstartSummary +from security_scanner.runtime.incremental_discovery import IncrementalDiscoverySummary +from security_scanner.runtime.scan_worker import ScanWorkerSummary +from security_scanner.storage.base import QueueStatus + + +TARGET_URL = "https://github.com/example-org/example-repo" +CUSTOM_GITLAB_URL = "https://source.example.test/example-group/example-repo" + + +def test_quickstart_cli_runs_runtime_with_dynamodb_defaults(monkeypatch, capsys): + calls = {} + + monkeypatch.setattr( + "security_scanner.cli.app.run_doctor", + lambda target_url, private, scm_provider: type( + "Doctor", + (), + {"ok": True, "checks": []}, + )(), + ) + monkeypatch.setattr("security_scanner.cli.app._store_from_args", lambda args: object()) + + def fake_run_quickstart(request): + calls["request"] = request + return QuickstartSummary( + target_url=request.target_url, + initialized=IncrementalDiscoverySummary(targets=1, refs_observed=1), + current_jobs_enqueued=1, + enqueued=IncrementalDiscoverySummary(targets=1), + 
worker=ScanWorkerSummary(leased=1, completed=1), + status=QueueStatus( + job_counts_by_status={"completed": 1}, + expired_job_leases=0, + expired_repo_leases=0, + ), + ) + + monkeypatch.setattr("security_scanner.cli.app.run_quickstart", fake_run_quickstart) + + exit_code = main(["quickstart", TARGET_URL]) + + captured = capsys.readouterr() + assert exit_code == 0 + assert calls["request"].target_url == TARGET_URL + assert calls["request"].max_jobs == 10 + assert "current jobs enqueued: 1" in captured.out + assert "worker completed: 1" in captured.out + + +def test_quickstart_cli_fails_on_private_auth_preflight(monkeypatch, capsys): + doctor = type("Doctor", (), {"ok": False, "checks": []})() + monkeypatch.setattr( + "security_scanner.cli.app.run_doctor", + lambda target_url, private, scm_provider: doctor, + ) + monkeypatch.setattr( + "security_scanner.cli.app.render_doctor_result", + lambda result: "fail: github auth - set GH_TOKEN or run gh auth login\n", + ) + + exit_code = main(["quickstart", TARGET_URL, "--private"]) + + captured = capsys.readouterr() + assert exit_code == 1 + assert "set GH_TOKEN" in captured.err + assert "ghp_" not in captured.err + + +def test_quickstart_cli_passes_scm_provider_to_fetcher(monkeypatch, tmp_path): + calls = {} + + monkeypatch.setattr( + "security_scanner.cli.app.run_doctor", + lambda target_url, private, scm_provider: type( + "Doctor", + (), + {"ok": True, "checks": []}, + )(), + ) + monkeypatch.setattr("security_scanner.cli.app._store_from_args", lambda args: object()) + + def fake_fetch_or_clone(url, **kwargs): + calls["fetch"] = (url, kwargs) + return tmp_path / "repo" + + def fake_run_quickstart(request): + fetched = request.fetch_repo(request.target_url) + assert isinstance(fetched, Path) + return QuickstartSummary( + target_url=request.target_url, + initialized=IncrementalDiscoverySummary(targets=1), + current_jobs_enqueued=0, + enqueued=IncrementalDiscoverySummary(targets=1), + worker=None, + status=QueueStatus( + 
job_counts_by_status={}, + expired_job_leases=0, + expired_repo_leases=0, + ), + ) + + monkeypatch.setattr("security_scanner.cli.app.fetch_or_clone", fake_fetch_or_clone) + monkeypatch.setattr("security_scanner.cli.app.run_quickstart", fake_run_quickstart) + + exit_code = main( + [ + "quickstart", + CUSTOM_GITLAB_URL, + "--scm-provider", + "gitlab", + "--private", + ] + ) + + assert exit_code == 0 + assert calls["fetch"] == ( + CUSTOM_GITLAB_URL, + {"allow_git_fallback": False, "scm_provider": "gitlab"}, + ) diff --git a/tests/test_cli_scan_all.py b/tests/test_cli_scan_all.py index 7a00cc1..4e2bbfb 100644 --- a/tests/test_cli_scan_all.py +++ b/tests/test_cli_scan_all.py @@ -509,6 +509,7 @@ def test_scan_all_lock_contention_exits_three( assert "another scan-all is running" in captured.err assert run_calls == [] assert fetch_calls == [] + assert fake_store.list_calls == 0 records = _read_log(log_path) assert len(records) == 1 diff --git a/tests/test_cli_scan_worker.py b/tests/test_cli_scan_worker.py new file mode 100644 index 0000000..9b5b676 --- /dev/null +++ b/tests/test_cli_scan_worker.py @@ -0,0 +1,192 @@ +"""CLI tests for scan-worker.""" + +from __future__ import annotations + +import datetime as dt + +from security_scanner.cli import main +from security_scanner.core.finding.model import Finding +from security_scanner.storage.base import ScanJob, ScanLedgerEntry, ScanLedgerKey + + +NOW = dt.datetime(2026, 6, 12, 13, 0, tzinfo=dt.UTC) +REPO_ID = "repo_synthetic000000000001" +REPO_URL = "https://github.com/example-org/example-repo" +COMMIT_SHA = "b" * 40 + + +class FakeWorkerStore: + def __init__(self, jobs: list[ScanJob] | None = None) -> None: + self.jobs = list(jobs or []) + self.completed: list[tuple[ScanJob, list[Finding], ScanLedgerEntry]] = [] + self.retry_failures: list[tuple[str, str, dt.datetime]] = [] + self.pending_returns: list[tuple[str, str]] = [] + self.ledger_keys: set[ScanLedgerKey] = set() + self.repo_lease_available = True + + def 
lease_next_scan_job(self, worker_id, lease_seconds, now): + if not self.jobs: + return None + job = self.jobs.pop(0) + return ScanJob( + **{ + **job.__dict__, + "status": "leased", + "worker_id": worker_id, + "lease_until": now + dt.timedelta(seconds=lease_seconds), + } + ) + + def has_scan_ledger(self, key): + return key in self.ledger_keys + + def acquire_repo_lease(self, repo_id, worker_id, lease_seconds): + return self.repo_lease_available + + def release_repo_lease(self, repo_id, worker_id): + return None + + def complete_processed_job(self, job, findings, ledger): + self.completed.append((job, list(findings), ledger)) + self.ledger_keys.add(ledger.key) + + def record_retryable_failure(self, job_id, error, next_attempt_at): + self.retry_failures.append((job_id, error, next_attempt_at)) + + def return_job_to_pending(self, job_id, reason): + self.pending_returns.append((job_id, reason)) + + +class FakeScanner: + def __init__(self, error: Exception | None = None) -> None: + self.error = error + self.calls: list[dict] = [] + + def scan(self, **kwargs): + self.calls.append(kwargs) + if self.error is not None: + raise self.error + return [ + Finding.create( + repo_full_name=REPO_ID, + rule_id="generic-api-key", + file_path="src/config.py", + line_start=10, + raw_secret="synthetic-value-for-hash", + source_tool="gitleaks", + scan_run_id=kwargs["scan_run_id"], + rule_pack_version=kwargs["rule_pack_version"], + ) + ] + + +def _job(*, attempts: int = 0, max_attempts: int = 3) -> ScanJob: + return ScanJob( + job_id="scan_job_cli_synthetic", + repo_id=REPO_ID, + repo_url=REPO_URL, + ref_name="refs/remotes/origin/main", + old_sha="0" * 40, + new_sha=COMMIT_SHA, + commit_sha=COMMIT_SHA, + commit_range=f"{'0' * 40}..{COMMIT_SHA}", + scanner_name="gitleaks", + scanner_version="unknown", + rule_pack_version="secret-rules-0.1.0", + scanner_config_hash="default", + priority=100, + status="pending", + attempts=attempts, + max_attempts=max_attempts, + worker_id=None, + 
lease_until=None, + next_attempt_at=NOW, + created_at=NOW, + updated_at=NOW, + ) + + +def _fixed_uuid(): + return type("UUID", (), {"hex": "fixedworkerid000000000000"})() + + +def _patch_worker(monkeypatch, store: FakeWorkerStore, scanner: FakeScanner, tmp_path): + monkeypatch.setattr( + "security_scanner.cli.app.create_finding_store", + lambda backend, **kwargs: store, + ) + monkeypatch.setattr("security_scanner.cli.app.make_default_scanner", lambda: scanner) + monkeypatch.setattr( + "security_scanner.cli.app.fetch_or_clone", + lambda url: tmp_path / "example-repo", + ) + monkeypatch.setattr( + "security_scanner.runtime.scan_worker.uuid.uuid4", + _fixed_uuid, + ) + + +def test_scan_worker_once_empty_queue_exits_zero(monkeypatch, tmp_path, capsys): + store = FakeWorkerStore() + scanner = FakeScanner() + _patch_worker(monkeypatch, store, scanner, tmp_path) + + exit_code = main(["scan-worker", "--once", "--storage-backend", "dynamodb"]) + + captured = capsys.readouterr() + assert exit_code == 0 + assert "leased: 0" in captured.out + assert "completed: 0" in captured.out + + +def test_scan_worker_once_processes_one_job(monkeypatch, tmp_path, capsys): + store = FakeWorkerStore([_job()]) + scanner = FakeScanner() + _patch_worker(monkeypatch, store, scanner, tmp_path) + + exit_code = main( + [ + "scan-worker", + "--once", + "--max-jobs", + "1", + "--worker-id", + "worker-cli", + "--storage-backend", + "dynamodb", + ] + ) + + captured = capsys.readouterr() + assert exit_code == 0 + assert "leased: 1" in captured.out + assert "completed: 1" in captured.out + assert store.completed[0][2].commit_sha == COMMIT_SHA + + +def test_scan_worker_dead_lettered_job_exits_two(monkeypatch, tmp_path, capsys): + store = FakeWorkerStore([_job(attempts=2, max_attempts=3)]) + scanner = FakeScanner(error=RuntimeError("synthetic scanner failure")) + _patch_worker(monkeypatch, store, scanner, tmp_path) + + exit_code = main(["scan-worker", "--once", "--storage-backend", "dynamodb"]) + + 
captured = capsys.readouterr() + assert exit_code == 2 + assert "dead-lettered: 1" in captured.out + + +def test_scan_worker_requires_once(capsys): + exit_code = main(["scan-worker", "--storage-backend", "dynamodb"]) + + captured = capsys.readouterr() + assert exit_code == 2 + assert "requires --once" in captured.err + + +def test_scan_worker_rejects_jsonl_backend(capsys): + exit_code = main(["scan-worker", "--once", "--storage-backend", "jsonl"]) + + captured = capsys.readouterr() + assert exit_code == 2 + assert "dynamodb only" in captured.err diff --git a/tests/test_container_runtime_config.py b/tests/test_container_runtime_config.py new file mode 100644 index 0000000..a41b5fd --- /dev/null +++ b/tests/test_container_runtime_config.py @@ -0,0 +1,80 @@ +"""Static tests for local container runtime wiring.""" + +from __future__ import annotations + +from pathlib import Path + +import yaml + + +REPO_ROOT = Path(__file__).resolve().parents[1] + + +def test_dockerfile_installs_cli_entrypoint_without_credentials(): + dockerfile = (REPO_ROOT / "Dockerfile").read_text(encoding="utf-8") + + assert "ARG GH_VERSION=2." in dockerfile + assert "ARG GLAB_VERSION=1." in dockerfile + assert "ARG GITLEAKS_VERSION=8." in dockerfile + assert "gh_${GH_VERSION}_linux_${gh_arch}.tar.gz" in dockerfile + assert "glab_${GLAB_VERSION}_linux_${glab_arch}.tar.gz" in dockerfile + assert "gitleaks_${GITLEAKS_VERSION}_linux_${gitleaks_arch}.tar.gz" in dockerfile + assert "gh --version" in dockerfile + assert "glab --version" in dockerfile + assert "gitleaks version" in dockerfile + assert 'ENTRYPOINT ["security-scanner"]' in dockerfile + assert "uv pip install --system ." 
in dockerfile + + +def test_compose_uses_persistent_dynamodb_local_and_worker_service(): + compose = yaml.safe_load( + (REPO_ROOT / "docker-compose.yml").read_text(encoding="utf-8") + ) + services = compose["services"] + + assert "dynalite" not in services + dynamodb = services["dynamodb-local"] + assert dynamodb["image"].startswith("amazon/dynamodb-local:") + assert "container_name" not in dynamodb + assert dynamodb["user"] == "root" + assert dynamodb["working_dir"] == "/home/dynamodblocal" + assert dynamodb["ports"] == ["${SECURITY_SCANNER_DYNAMO_HOST_PORT:-4567}:8000"] + assert "dynamodb-local-data:/home/dynamodblocal/data" in dynamodb["volumes"] + assert dynamodb["command"][-2:] == ["-dbPath", "./data"] + + worker = services["worker"] + assert worker["depends_on"] == ["dynamodb-local"] + assert worker["environment"] == { + "SECURITY_SCANNER_STORAGE_BACKEND": "dynamodb", + "SECURITY_SCANNER_DYNAMO_ENDPOINT": "http://dynamodb-local:8000", + "SECURITY_SCANNER_DYNAMO_TABLE": "SecurityScannerLocal", + "SECURITY_SCANNER_QUICKSTART_TARGET": "${SECURITY_SCANNER_QUICKSTART_TARGET:-}", + "SECURITY_SCANNER_QUICKSTART_NAME": "${SECURITY_SCANNER_QUICKSTART_NAME:-quickstart-target}", + "SECURITY_SCANNER_SCM_PROVIDER": "${SECURITY_SCANNER_SCM_PROVIDER:-auto}", + } + command = worker["command"][0] + assert "security-scanner quickstart" in command + assert "$$SECURITY_SCANNER_QUICKSTART_TARGET" in command + assert "--scm-provider \"$$SECURITY_SCANNER_SCM_PROVIDER\"" in command + assert "--storage-wait-seconds 60" in command + assert '"$SECURITY_SCANNER_QUICKSTART_TARGET"' not in command + assert '"$SECURITY_SCANNER_SCM_PROVIDER"' not in command + assert "dynamodb-local-data" in compose["volumes"] + assert "repo-cache" in compose["volumes"] + assert "repo-cache:/root/.cache/security-scanner/repos" in worker["volumes"] + + +def test_docs_mark_compose_worker_as_local_verification_only(): + compose = (REPO_ROOT / "docker-compose.yml").read_text(encoding="utf-8") + getting_started = 
( + REPO_ROOT / "docs/views/getting-started.md" + ).read_text(encoding="utf-8") + systemd_readme = ( + REPO_ROOT / "deploy/systemd/README.md" + ).read_text(encoding="utf-8") + + assert "local verification" in compose + assert "DynamoDB Local" in compose + assert "로컬 검증 전용" in getting_started + assert "local verification only" in systemd_readme + assert "not a production deployment target" in systemd_readme diff --git a/tests/test_doctor.py b/tests/test_doctor.py new file mode 100644 index 0000000..d1c4a18 --- /dev/null +++ b/tests/test_doctor.py @@ -0,0 +1,151 @@ +"""Tests for local runtime doctor checks.""" + +from __future__ import annotations + +from types import SimpleNamespace + +from security_scanner.cli import main +from security_scanner.runtime.doctor import render_doctor_result, run_doctor + + +def test_doctor_public_target_accepts_git_fallback(monkeypatch): + monkeypatch.setattr( + "security_scanner.runtime.doctor.shutil.which", + lambda binary: f"/usr/bin/{binary}" if binary in {"git", "gitleaks"} else None, + ) + + result = run_doctor( + target_url="https://github.com/example-org/example-repo", + private=False, + ) + + assert result.ok is True + assert "public HTTPS clone" in render_doctor_result(result) + + +def test_doctor_private_github_requires_token_or_auth(monkeypatch, capsys): + monkeypatch.delenv("GH_TOKEN", raising=False) + monkeypatch.setattr( + "security_scanner.runtime.doctor.shutil.which", + lambda binary: f"/usr/bin/{binary}" if binary in {"git", "gitleaks"} else None, + ) + + exit_code = main( + [ + "doctor", + "--target-url", + "https://github.com/example-org/private-repo", + "--private", + ] + ) + + captured = capsys.readouterr() + assert exit_code == 1 + assert "install gh, then set GH_TOKEN" in captured.out + assert "ghp_" not in captured.out + + +def test_doctor_private_github_requires_cli_even_when_token_exists(monkeypatch): + monkeypatch.setenv("GH_TOKEN", "synthetic-token-placeholder") + monkeypatch.setattr( + 
"security_scanner.runtime.doctor.shutil.which", + lambda binary: f"/usr/bin/{binary}" if binary in {"git", "gitleaks"} else None, + ) + + result = run_doctor( + target_url="https://github.com/example-org/private-repo", + private=True, + ) + + rendered = render_doctor_result(result) + assert result.ok is False + assert "install gh" in rendered + assert "synthetic-token-placeholder" not in rendered + + +def test_doctor_private_github_checks_target_access(monkeypatch): + monkeypatch.delenv("GH_TOKEN", raising=False) + calls = [] + monkeypatch.setattr( + "security_scanner.runtime.doctor.shutil.which", + lambda binary: f"/usr/bin/{binary}", + ) + + def fake_run(cmd, **kwargs): + calls.append((cmd, kwargs)) + return SimpleNamespace(returncode=0, stdout="", stderr="") + + monkeypatch.setattr("security_scanner.runtime.doctor.subprocess.run", fake_run) + + result = run_doctor( + target_url="https://github.com/example-org/private-repo", + private=True, + ) + + assert result.ok is True + assert calls[-1][0] == [ + "gh", + "repo", + "view", + "example-org/private-repo", + "--json", + "name", + ] + assert "gh target access passed" in render_doctor_result(result) + + +def test_doctor_private_gitlab_named_custom_domain_sets_host(monkeypatch): + calls = [] + monkeypatch.setattr( + "security_scanner.runtime.doctor.shutil.which", + lambda binary: f"/usr/bin/{binary}", + ) + + def fake_run(cmd, **kwargs): + calls.append((cmd, kwargs)) + return SimpleNamespace(returncode=0, stdout="", stderr="") + + monkeypatch.setattr("security_scanner.runtime.doctor.subprocess.run", fake_run) + + result = run_doctor( + target_url="https://gitlab.example.com/team/private-repo", + private=True, + ) + + assert result.ok is True + assert calls[-1][0] == [ + "glab", + "repo", + "view", + "https://gitlab.example.com/team/private-repo", + ] + assert calls[-1][1]["env"]["GITLAB_HOST"] == "gitlab.example.com" + + +def test_doctor_private_custom_gitlab_domain_uses_provider_hint(monkeypatch): + calls = [] + 
monkeypatch.setattr( + "security_scanner.runtime.doctor.shutil.which", + lambda binary: f"/usr/bin/{binary}", + ) + + def fake_run(cmd, **kwargs): + calls.append((cmd, kwargs)) + return SimpleNamespace(returncode=0, stdout="", stderr="") + + monkeypatch.setattr("security_scanner.runtime.doctor.subprocess.run", fake_run) + + result = run_doctor( + target_url="https://source.example.test/team/private-repo", + private=True, + scm_provider="gitlab", + ) + + assert result.ok is True + assert calls[-1][0] == [ + "glab", + "repo", + "view", + "https://source.example.test/team/private-repo", + ] + assert calls[-1][1]["env"]["GITLAB_HOST"] == "source.example.test" diff --git a/tests/test_fetcher.py b/tests/test_fetcher.py index 61e843b..d4bbb38 100644 --- a/tests/test_fetcher.py +++ b/tests/test_fetcher.py @@ -34,7 +34,7 @@ def test_github_url_dispatches_gh_repo_clone(monkeypatch, tmp_path): cache_root=tmp_path, ) - expected_path = tmp_path / "octocat" / "hello-world" + expected_path = tmp_path / "github.com" / "octocat" / "hello-world" assert result == expected_path assert len(calls) == 1 cmd, kwargs = calls[0] @@ -57,7 +57,7 @@ def test_gitlab_com_url_dispatches_glab(monkeypatch, tmp_path): cache_root=tmp_path, ) - expected_path = tmp_path / "group" / "project" + expected_path = tmp_path / "gitlab.com" / "group" / "project" assert result == expected_path cmd, _ = calls[0] assert cmd[:2] == ["glab", "repo"] @@ -66,21 +66,42 @@ def test_gitlab_com_url_dispatches_glab(monkeypatch, tmp_path): assert cmd[4] == str(expected_path) -def test_self_hosted_gitlab_dispatches_glab(monkeypatch, tmp_path): +def test_gitlab_named_custom_domain_dispatches_glab(monkeypatch, tmp_path): calls = [] monkeypatch.setattr( "security_scanner.targets.fetcher.subprocess.run", _record_run(calls), ) - fetch_or_clone( + result = fetch_or_clone( "https://gitlab.example.com/team/svc", cache_root=tmp_path, ) cmd, _ = calls[0] + assert result == tmp_path / "gitlab.example.com" / "team" / "svc" assert 
cmd[0] == "glab" - assert cmd[3] == "team/svc" + assert cmd[3] == "https://gitlab.example.com/team/svc" + + +def test_custom_gitlab_domain_uses_provider_hint(monkeypatch, tmp_path): + calls = [] + monkeypatch.setattr( + "security_scanner.targets.fetcher.subprocess.run", + _record_run(calls), + ) + + result = fetch_or_clone( + "https://source.example.test/team/svc", + cache_root=tmp_path, + scm_provider="gitlab", + ) + + cmd, kwargs = calls[0] + assert result == tmp_path / "source.example.test" / "team" / "svc" + assert cmd[0] == "glab" + assert cmd[3] == "https://source.example.test/team/svc" + assert kwargs["env"]["GITLAB_HOST"] == "source.example.test" def test_unsupported_host_raises(monkeypatch, tmp_path): @@ -97,7 +118,7 @@ def test_unsupported_host_raises(monkeypatch, tmp_path): def test_existing_cache_path_triggers_git_fetch(monkeypatch, tmp_path): - cache_path = tmp_path / "octocat" / "hello-world" + cache_path = tmp_path / "github.com" / "octocat" / "hello-world" (cache_path / ".git").mkdir(parents=True) calls = [] @@ -130,15 +151,53 @@ def test_default_cache_root_uses_home(monkeypatch, tmp_path): result = fetch_or_clone("https://github.com/octocat/hello-world") expected_path = ( - tmp_path / ".cache" / "security-scanner" / "repos" / "octocat" / "hello-world" + tmp_path + / ".cache" + / "security-scanner" + / "repos" + / "github.com" + / "octocat" + / "hello-world" ) assert result == expected_path cmd, _ = calls[0] assert cmd[4] == str(expected_path) -def test_missing_binary_raises_fetch_error(monkeypatch, tmp_path): +def test_missing_gh_falls_back_to_public_git_clone(monkeypatch, tmp_path): + calls = [] + def fake_run(cmd, **kwargs): + calls.append(cmd) + if cmd[0] == "gh": + raise FileNotFoundError("gh not on PATH") + return SimpleNamespace(returncode=0, stdout="", stderr="") + + monkeypatch.setattr( + "security_scanner.targets.fetcher.subprocess.run", + fake_run, + ) + + result = fetch_or_clone( + "https://github.com/octocat/hello-world", + 
cache_root=tmp_path, + ) + + assert result == tmp_path / "github.com" / "octocat" / "hello-world" + assert calls[0][:3] == ["gh", "repo", "clone"] + assert calls[1] == [ + "git", + "clone", + "https://github.com/octocat/hello-world", + str(result), + ] + + +def test_private_fetch_can_disable_public_git_fallback(monkeypatch, tmp_path): + calls = [] + + def fake_run(cmd, **kwargs): + calls.append(cmd) raise FileNotFoundError("gh not on PATH") monkeypatch.setattr( @@ -146,12 +205,16 @@ def fake_run(cmd, **kwargs): fake_run, ) - with pytest.raises(FetchError, match="gh"): + with pytest.raises(FetchError, match="gh binary not found"): fetch_or_clone( "https://github.com/octocat/hello-world", cache_root=tmp_path, + allow_git_fallback=False, ) + assert len(calls) == 1 + assert calls[0][:3] == ["gh", "repo", "clone"] + def test_subprocess_failure_raises_fetch_error(monkeypatch, tmp_path): import subprocess @@ -164,7 +227,7 @@ def fake_run(cmd, **kwargs): fake_run, ) - with pytest.raises(FetchError): + with pytest.raises(FetchError, match="fallback failed"): fetch_or_clone( "https://github.com/octocat/hello-world", cache_root=tmp_path, diff --git a/tests/test_gitleaks_runner.py b/tests/test_gitleaks_runner.py index e679519..279de2d 100644 --- a/tests/test_gitleaks_runner.py +++ b/tests/test_gitleaks_runner.py @@ -48,6 +48,28 @@ def test_runner_builds_dir_mode_without_history(): assert cmd[cmd.index("--config") + 1] == "rules.toml" +def test_runner_adds_log_opts_in_git_mode_only(): + runner = GitleaksRunner() + + cmd = runner.build_command( + FAKE_ROOT, + scan_options=ScanOptions(include_history=True, git_log_opts="abc123^!"), + ) + + assert cmd[1] == "git" + assert cmd[cmd.index("--log-opts") + 1] == "abc123^!" 
+ + +def test_runner_rejects_log_opts_in_dir_mode(): + runner = GitleaksRunner() + + with pytest.raises(ValueError, match="git_log_opts"): + runner.build_command( + FAKE_ROOT, + scan_options=ScanOptions(include_history=False, git_log_opts="abc123^!"), + ) + + def test_runner_executes_process_and_reads_report(monkeypatch, tmp_path): root = tmp_path / "repo" root.mkdir() diff --git a/tests/test_incremental_discovery.py b/tests/test_incremental_discovery.py new file mode 100644 index 0000000..1c4dc13 --- /dev/null +++ b/tests/test_incremental_discovery.py @@ -0,0 +1,369 @@ +"""Tests for incremental ref discovery orchestration.""" + +from __future__ import annotations + +import datetime as dt +from pathlib import Path + +from security_scanner.catalog.scan_target import ScanTarget +from security_scanner.runtime.incremental_discovery import ( + DISCOVERY_MODE_ENQUEUE, + DISCOVERY_MODE_INITIALIZE, + DiscoveryScannerConfig, + GitRef, + IncrementalDiscoveryRequest, + run_incremental_discovery, +) +from security_scanner.storage.adapters.nosql_db.items import repo_id_for_scan_target_url +from security_scanner.storage.base import RefState, ScanJob, ScanLedgerKey + + +NOW = dt.datetime(2026, 6, 12, 10, 0, tzinfo=dt.UTC) +TARGET = ScanTarget( + url="https://github.com/example-org/example-repo", + name="example-org/example-repo", + enabled=True, +) +REF_MAIN = "refs/remotes/origin/main" +OLD_SHA = "a" * 40 +MID_SHA = "b" * 40 +NEW_SHA = "c" * 40 + + +class FakeIncrementalStore: + def __init__(self, targets: list[ScanTarget]) -> None: + self.targets = targets + self.ref_states: dict[tuple[str, str], RefState] = {} + self.jobs: dict[str, ScanJob] = {} + self.ledger: set[ScanLedgerKey] = set() + + def list_scan_targets(self) -> list[ScanTarget]: + return list(self.targets) + + def get_ref_state(self, repo_id: str, ref_name: str) -> RefState | None: + return self.ref_states.get((repo_id, ref_name)) + + def put_ref_state(self, state: RefState) -> None: + 
self.ref_states[(state.repo_id, state.ref_name)] = state + + def has_scan_ledger(self, key: ScanLedgerKey) -> bool: + return key in self.ledger + + def enqueue_commit_scan_job(self, job: ScanJob) -> bool: + if job.job_id in self.jobs: + return False + self.jobs[job.job_id] = job + return True + + +class FakeGitDiscovery: + def __init__(self) -> None: + self.refs_by_path: dict[Path, list[GitRef]] = {} + self.ancestor_results: dict[tuple[Path, str, str], bool] = {} + self.commits_by_range: dict[tuple[Path, str, str], list[str]] = {} + + def fetch(self, repo_path: Path) -> None: + return None + + def list_remote_refs( + self, + repo_path: Path, + patterns, + ) -> list[GitRef]: + refs = self.refs_by_path.get(repo_path, []) + if not patterns: + return refs + import fnmatch + + return [ + ref + for ref in refs + if any(fnmatch.fnmatch(ref.ref_name, pattern) for pattern in patterns) + ] + + def is_ancestor(self, repo_path: Path, old_sha: str, new_sha: str) -> bool: + return self.ancestor_results[(repo_path, old_sha, new_sha)] + + def list_new_commits( + self, + repo_path: Path, + old_sha: str, + new_sha: str, + ) -> list[str]: + return self.commits_by_range[(repo_path, old_sha, new_sha)] + + +def _scanner() -> DiscoveryScannerConfig: + return DiscoveryScannerConfig( + scanner_name="gitleaks", + scanner_version="unknown", + rule_pack_version="secret-rules-0.1.0", + scanner_config_hash="default", + ) + + +def _request( + *, + mode: str, + store: FakeIncrementalStore, + git: FakeGitDiscovery, + fetch_repo=None, + ref_patterns=None, +) -> IncrementalDiscoveryRequest: + repo_path = Path("/synthetic-cache/example-repo") + return IncrementalDiscoveryRequest( + mode=mode, + store=store, + fetch_repo=fetch_repo or (lambda url: repo_path), + git=git, + scanner=_scanner(), + ref_patterns=ref_patterns or ("refs/remotes/origin/*",), + now_factory=lambda: NOW, + ) + + +def test_initialize_writes_ref_state_and_enqueues_zero_jobs(): + repo_path = Path("/synthetic-cache/example-repo") + 
store = FakeIncrementalStore([TARGET]) + git = FakeGitDiscovery() + git.refs_by_path[repo_path] = [GitRef(ref_name=REF_MAIN, commit_sha=NEW_SHA)] + + summary = run_incremental_discovery( + _request( + mode=DISCOVERY_MODE_INITIALIZE, + store=store, + git=git, + fetch_repo=lambda url: repo_path, + ) + ) + + repo_id = repo_id_for_scan_target_url(TARGET.url) + assert summary.targets == 1 + assert summary.fetch_ok == 1 + assert summary.refs_observed == 1 + assert summary.jobs_enqueued == 0 + assert store.jobs == {} + assert store.ref_states[(repo_id, REF_MAIN)].last_seen_sha == NEW_SHA + + +def test_enqueue_creates_one_job_per_new_unscanned_commit_and_advances_ref_state(): + repo_path = Path("/synthetic-cache/example-repo") + repo_id = repo_id_for_scan_target_url(TARGET.url) + store = FakeIncrementalStore([TARGET]) + store.put_ref_state( + RefState( + repo_id=repo_id, + repo_url=TARGET.url, + ref_name=REF_MAIN, + last_seen_sha=OLD_SHA, + updated_at=NOW, + ) + ) + git = FakeGitDiscovery() + git.refs_by_path[repo_path] = [GitRef(ref_name=REF_MAIN, commit_sha=NEW_SHA)] + git.ancestor_results[(repo_path, OLD_SHA, NEW_SHA)] = True + git.commits_by_range[(repo_path, OLD_SHA, NEW_SHA)] = [MID_SHA, NEW_SHA] + + summary = run_incremental_discovery( + _request( + mode=DISCOVERY_MODE_ENQUEUE, + store=store, + git=git, + fetch_repo=lambda url: repo_path, + ) + ) + + assert summary.jobs_enqueued == 2 + assert {job.commit_sha for job in store.jobs.values()} == {MID_SHA, NEW_SHA} + assert {job.commit_range for job in store.jobs.values()} == {f"{OLD_SHA}..{NEW_SHA}"} + assert store.ref_states[(repo_id, REF_MAIN)].last_seen_sha == NEW_SHA + + +def test_enqueue_missing_ref_state_observes_without_backfill(): + repo_path = Path("/synthetic-cache/example-repo") + repo_id = repo_id_for_scan_target_url(TARGET.url) + store = FakeIncrementalStore([TARGET]) + git = FakeGitDiscovery() + git.refs_by_path[repo_path] = [GitRef(ref_name=REF_MAIN, commit_sha=NEW_SHA)] + + summary = 
run_incremental_discovery( + _request( + mode=DISCOVERY_MODE_ENQUEUE, + store=store, + git=git, + fetch_repo=lambda url: repo_path, + ) + ) + + assert summary.jobs_enqueued == 0 + assert store.jobs == {} + assert store.ref_states[(repo_id, REF_MAIN)].last_seen_sha == NEW_SHA + + +def test_enqueue_skips_commits_present_in_ledger(): + repo_path = Path("/synthetic-cache/example-repo") + repo_id = repo_id_for_scan_target_url(TARGET.url) + store = FakeIncrementalStore([TARGET]) + store.put_ref_state( + RefState( + repo_id=repo_id, + repo_url=TARGET.url, + ref_name=REF_MAIN, + last_seen_sha=OLD_SHA, + updated_at=NOW, + ) + ) + scanner = _scanner() + store.ledger.add( + ScanLedgerKey( + repo_id=repo_id, + commit_sha=MID_SHA, + scanner_name=scanner.scanner_name, + scanner_version=scanner.scanner_version, + rule_pack_version=scanner.rule_pack_version, + scanner_config_hash=scanner.scanner_config_hash, + ) + ) + git = FakeGitDiscovery() + git.refs_by_path[repo_path] = [GitRef(ref_name=REF_MAIN, commit_sha=NEW_SHA)] + git.ancestor_results[(repo_path, OLD_SHA, NEW_SHA)] = True + git.commits_by_range[(repo_path, OLD_SHA, NEW_SHA)] = [MID_SHA, NEW_SHA] + + summary = run_incremental_discovery( + _request( + mode=DISCOVERY_MODE_ENQUEUE, + store=store, + git=git, + fetch_repo=lambda url: repo_path, + ) + ) + + assert summary.ledger_skipped == 1 + assert summary.jobs_enqueued == 1 + assert [job.commit_sha for job in store.jobs.values()] == [NEW_SHA] + assert store.ref_states[(repo_id, REF_MAIN)].last_seen_sha == NEW_SHA + + +def test_non_fast_forward_is_reported_without_advancing_ref_state(): + repo_path = Path("/synthetic-cache/example-repo") + repo_id = repo_id_for_scan_target_url(TARGET.url) + store = FakeIncrementalStore([TARGET]) + store.put_ref_state( + RefState( + repo_id=repo_id, + repo_url=TARGET.url, + ref_name=REF_MAIN, + last_seen_sha=OLD_SHA, + updated_at=NOW, + ) + ) + git = FakeGitDiscovery() + git.refs_by_path[repo_path] = [GitRef(ref_name=REF_MAIN, 
commit_sha=NEW_SHA)] + git.ancestor_results[(repo_path, OLD_SHA, NEW_SHA)] = False + + summary = run_incremental_discovery( + _request( + mode=DISCOVERY_MODE_ENQUEUE, + store=store, + git=git, + fetch_repo=lambda url: repo_path, + ) + ) + + assert summary.skipped_non_fast_forward == 1 + assert summary.has_partial_failure is True + assert store.jobs == {} + assert store.ref_states[(repo_id, REF_MAIN)].last_seen_sha == OLD_SHA + + +def test_fetch_failure_is_isolated_per_target(): + store = FakeIncrementalStore( + [ + TARGET, + ScanTarget( + url="https://github.com/example-org/failing-repo", + name="example-org/failing-repo", + enabled=True, + ), + ] + ) + git = FakeGitDiscovery() + repo_path = Path("/synthetic-cache/example-repo") + git.refs_by_path[repo_path] = [GitRef(ref_name=REF_MAIN, commit_sha=NEW_SHA)] + + def fetch_repo(url: str) -> Path: + if url.endswith("/failing-repo"): + raise RuntimeError("synthetic fetch failure") + return repo_path + + summary = run_incremental_discovery( + _request( + mode=DISCOVERY_MODE_INITIALIZE, + store=store, + git=git, + fetch_repo=fetch_repo, + ) + ) + + assert summary.targets == 2 + assert summary.fetch_ok == 1 + assert summary.fetch_failed_count == 1 + assert summary.has_partial_failure is True + + +def test_disabled_targets_are_ignored_and_max_targets_limits_enabled_targets(): + first = TARGET + second = ScanTarget( + url="https://github.com/example-org/second-repo", + name="example-org/second-repo", + enabled=True, + ) + disabled = ScanTarget( + url="https://github.com/example-org/disabled-repo", + name="example-org/disabled-repo", + enabled=False, + ) + store = FakeIncrementalStore([first, disabled, second]) + git = FakeGitDiscovery() + repo_path = Path("/synthetic-cache/example-repo") + git.refs_by_path[repo_path] = [GitRef(ref_name=REF_MAIN, commit_sha=NEW_SHA)] + + summary = run_incremental_discovery( + IncrementalDiscoveryRequest( + mode=DISCOVERY_MODE_INITIALIZE, + store=store, + fetch_repo=lambda url: repo_path, + 
git=git, + scanner=_scanner(), + max_targets=1, + now_factory=lambda: NOW, + ) + ) + + assert summary.targets == 1 + assert len(store.ref_states) == 1 + assert next(iter(store.ref_states.values())).repo_id == repo_id_for_scan_target_url( + first.url + ) + + +def test_ref_patterns_limit_observed_refs(): + repo_path = Path("/synthetic-cache/example-repo") + store = FakeIncrementalStore([TARGET]) + git = FakeGitDiscovery() + git.refs_by_path[repo_path] = [ + GitRef(ref_name=REF_MAIN, commit_sha=NEW_SHA), + GitRef(ref_name="refs/remotes/origin/feature", commit_sha=MID_SHA), + ] + + summary = run_incremental_discovery( + _request( + mode=DISCOVERY_MODE_INITIALIZE, + store=store, + git=git, + fetch_repo=lambda url: repo_path, + ref_patterns=("refs/remotes/origin/main",), + ) + ) + + assert summary.refs_observed == 1 diff --git a/tests/test_incremental_scan_smoke.py b/tests/test_incremental_scan_smoke.py new file mode 100644 index 0000000..f6f74c6 --- /dev/null +++ b/tests/test_incremental_scan_smoke.py @@ -0,0 +1,294 @@ +"""Synthetic smoke tests for the incremental scan MVP flow.""" + +from __future__ import annotations + +import datetime as dt +from collections import Counter +from pathlib import Path +from typing import Sequence + +from security_scanner.catalog.scan_target import ScanTarget +from security_scanner.core.finding.model import Finding +from security_scanner.runtime.incremental_discovery import ( + DISCOVERY_MODE_ENQUEUE, + DISCOVERY_MODE_INITIALIZE, + DiscoveryScannerConfig, + GitRef, + IncrementalDiscoveryRequest, + run_incremental_discovery, +) +from security_scanner.runtime.queue_status import QueueStatusRequest, read_queue_status +from security_scanner.runtime.scan_worker import ScanWorkerRequest, run_scan_worker_once +from security_scanner.storage.adapters.nosql_db.items import repo_id_for_scan_target_url +from security_scanner.storage.base import ( + QueueStatus, + RefState, + RepoLease, + ScanJob, + ScanLedgerEntry, + ScanLedgerKey, +) + + +NOW = 
dt.datetime(2026, 6, 12, 15, 0, tzinfo=dt.UTC) +TARGET = ScanTarget( + url="https://github.com/example-org/example-repo", + name="example-org/example-repo", + enabled=True, +) +REF_MAIN = "refs/remotes/origin/main" +OLD_SHA = "1" * 40 +NEW_SHA = "2" * 40 +SCANNER = DiscoveryScannerConfig( + scanner_name="gitleaks", + scanner_version="unknown", + rule_pack_version="secret-rules-0.1.0", + scanner_config_hash="default", +) + + +class InMemoryIncrementalStore: + def __init__(self, targets: Sequence[ScanTarget]) -> None: + self.targets = list(targets) + self.ref_states: dict[tuple[str, str], RefState] = {} + self.jobs: dict[str, ScanJob] = {} + self.ledger: dict[ScanLedgerKey, ScanLedgerEntry] = {} + self.repo_leases: dict[str, RepoLease] = {} + self.findings: list[Finding] = [] + + def list_scan_targets(self) -> list[ScanTarget]: + return list(self.targets) + + def get_ref_state(self, repo_id: str, ref_name: str) -> RefState | None: + return self.ref_states.get((repo_id, ref_name)) + + def put_ref_state(self, state: RefState) -> None: + self.ref_states[(state.repo_id, state.ref_name)] = state + + def has_scan_ledger(self, key: ScanLedgerKey) -> bool: + return key in self.ledger + + def enqueue_commit_scan_job(self, job: ScanJob) -> bool: + if job.job_id in self.jobs: + return False + self.jobs[job.job_id] = job + return True + + def lease_next_scan_job( + self, + worker_id: str, + lease_seconds: int, + now: dt.datetime, + ) -> ScanJob | None: + for job in sorted(self.jobs.values(), key=lambda item: item.created_at): + if job.status != "pending" or job.next_attempt_at > now: + continue + leased = ScanJob( + **{ + **job.__dict__, + "status": "leased", + "worker_id": worker_id, + "lease_until": now + dt.timedelta(seconds=lease_seconds), + "updated_at": now, + } + ) + self.jobs[job.job_id] = leased + return leased + return None + + def complete_processed_job( + self, + job: ScanJob, + findings: Sequence[Finding], + ledger: ScanLedgerEntry, + ) -> None: + if ledger.key not 
in self.ledger: + self.findings.extend(findings) + self.ledger[ledger.key] = ledger + self.jobs[job.job_id] = ScanJob( + **{ + **job.__dict__, + "status": "completed", + "worker_id": None, + "lease_until": None, + "updated_at": ledger.scanned_at, + } + ) + + def record_retryable_failure( + self, + job_id: str, + error: str, + next_attempt_at: dt.datetime, + ) -> None: + raise AssertionError(f"unexpected retryable failure: {job_id} {error}") + + def move_job_to_dead_letter(self, job_id: str, error: str) -> None: + raise AssertionError(f"unexpected dead letter: {job_id} {error}") + + def return_job_to_pending(self, job_id: str, reason: str) -> None: + raise AssertionError(f"unexpected pending return: {job_id} {reason}") + + def acquire_repo_lease( + self, + repo_id: str, + worker_id: str, + lease_seconds: int, + ) -> bool: + self.repo_leases[repo_id] = RepoLease( + repo_id=repo_id, + worker_id=worker_id, + lease_until=NOW + dt.timedelta(seconds=lease_seconds), + updated_at=NOW, + ) + return True + + def release_repo_lease(self, repo_id: str, worker_id: str) -> None: + lease = self.repo_leases.get(repo_id) + if lease and lease.worker_id == worker_id: + del self.repo_leases[repo_id] + + def get_queue_status(self, now: dt.datetime) -> QueueStatus: + counts = Counter(job.status for job in self.jobs.values()) + expired_job_leases = sum( + 1 + for job in self.jobs.values() + if job.status == "leased" + and job.lease_until is not None + and job.lease_until <= now + ) + expired_repo_leases = sum( + 1 for lease in self.repo_leases.values() if lease.lease_until <= now + ) + return QueueStatus( + job_counts_by_status=dict(counts), + expired_job_leases=expired_job_leases, + expired_repo_leases=expired_repo_leases, + ) + + +class FakeGit: + def __init__(self, repo_path: Path, commit_sha: str) -> None: + self.repo_path = repo_path + self.commit_sha = commit_sha + + def fetch(self, repo_path: Path) -> None: + return None + + def list_remote_refs(self, repo_path: Path, patterns) -> 
list[GitRef]: + assert repo_path == self.repo_path + return [GitRef(ref_name=REF_MAIN, commit_sha=self.commit_sha)] + + def is_ancestor(self, repo_path: Path, old_sha: str, new_sha: str) -> bool: + assert (repo_path, old_sha, new_sha) == (self.repo_path, OLD_SHA, NEW_SHA) + return True + + def list_new_commits(self, repo_path: Path, old_sha: str, new_sha: str) -> list[str]: + assert (repo_path, old_sha, new_sha) == (self.repo_path, OLD_SHA, NEW_SHA) + return [NEW_SHA] + + +class FakeScanner: + def __init__(self) -> None: + self.calls: list[dict] = [] + + def scan(self, **kwargs) -> list[Finding]: + self.calls.append(kwargs) + return [ + Finding.create( + repo_full_name=kwargs["repo_full_name"], + rule_id="generic-api-key", + file_path="src/config.py", + line_start=10, + raw_secret="synthetic-value-for-hash", + source_tool="gitleaks", + scan_run_id=kwargs["scan_run_id"], + rule_pack_version=kwargs["rule_pack_version"], + ) + ] + + +def test_incremental_scan_flow_initializes_enqueues_scans_and_skips_ledger(tmp_path): + repo_path = tmp_path / "example-repo" + repo_path.mkdir() + store = InMemoryIncrementalStore([TARGET]) + git = FakeGit(repo_path=repo_path, commit_sha=OLD_SHA) + fetch_repo = lambda url: repo_path + + initialize_summary = run_incremental_discovery( + IncrementalDiscoveryRequest( + mode=DISCOVERY_MODE_INITIALIZE, + store=store, + fetch_repo=fetch_repo, + git=git, + scanner=SCANNER, + now_factory=lambda: NOW, + ) + ) + + repo_id = repo_id_for_scan_target_url(TARGET.url) + assert initialize_summary.refs_observed == 1 + assert initialize_summary.jobs_enqueued == 0 + assert store.ref_states[(repo_id, REF_MAIN)].last_seen_sha == OLD_SHA + + git.commit_sha = NEW_SHA + enqueue_summary = run_incremental_discovery( + IncrementalDiscoveryRequest( + mode=DISCOVERY_MODE_ENQUEUE, + store=store, + fetch_repo=fetch_repo, + git=git, + scanner=SCANNER, + now_factory=lambda: NOW + dt.timedelta(minutes=1), + ) + ) + + assert enqueue_summary.jobs_enqueued == 1 + assert 
[job.commit_sha for job in store.jobs.values()] == [NEW_SHA] + assert store.ref_states[(repo_id, REF_MAIN)].last_seen_sha == NEW_SHA + + scanner = FakeScanner() + worker_summary = run_scan_worker_once( + ScanWorkerRequest( + store=store, + fetch_repo=fetch_repo, + scanner=scanner, + max_jobs=1, + worker_id="worker-smoke", + now_factory=lambda: NOW + dt.timedelta(minutes=2), + ) + ) + + assert worker_summary.completed == 1 + assert len(store.ledger) == 1 + assert store.findings[0].repo.commit == NEW_SHA + assert scanner.calls[0]["scan_options"].git_log_opts == f"{NEW_SHA}^!" + + store.put_ref_state( + RefState( + repo_id=repo_id, + repo_url=TARGET.url, + ref_name=REF_MAIN, + last_seen_sha=OLD_SHA, + updated_at=NOW + dt.timedelta(minutes=3), + ) + ) + replay_summary = run_incremental_discovery( + IncrementalDiscoveryRequest( + mode=DISCOVERY_MODE_ENQUEUE, + store=store, + fetch_repo=fetch_repo, + git=git, + scanner=SCANNER, + now_factory=lambda: NOW + dt.timedelta(minutes=4), + ) + ) + + assert replay_summary.jobs_enqueued == 0 + assert replay_summary.ledger_skipped == 1 + assert read_queue_status( + QueueStatusRequest( + store=store, + now_factory=lambda: NOW + dt.timedelta(minutes=5), + ) + ).job_counts_by_status == {"completed": 1} diff --git a/tests/test_incremental_scan_storage.py b/tests/test_incremental_scan_storage.py new file mode 100644 index 0000000..6cc8dc4 --- /dev/null +++ b/tests/test_incremental_scan_storage.py @@ -0,0 +1,564 @@ +"""Tests for incremental scan queue/ledger storage.""" + +from __future__ import annotations + +import datetime as dt +import hashlib + +from security_scanner.core.finding.model import Finding +from security_scanner.storage.adapters.nosql_db.items import ( + datetime_to_iso, + ref_state_from_item, + ref_state_to_item, + repo_id_for_scan_target_url, + repo_lease_from_item, + repo_lease_to_item, + scan_job_from_item, + scan_job_id_for, + scan_job_to_item, + scan_ledger_entry_from_item, + scan_ledger_entry_to_item, +) +from 
security_scanner.storage.adapters.nosql_db.store import ( + DynamoDbCompatibleFindingStore, +) +from security_scanner.storage.adapters.nosql_db.transport import ( + DynamoDbCompatibleConfig, +) +from security_scanner.storage.base import ( + RefState, + RepoLease, + ScanJob, + ScanLedgerEntry, +) + + +NOW = dt.datetime(2026, 6, 12, 9, 0, tzinfo=dt.UTC) +REPO_URL = "https://github.com/example-org/example-repo" +REPO_ID = repo_id_for_scan_target_url(REPO_URL) +COMMIT_SHA = "a" * 40 +NEW_SHA = "b" * 40 +OLD_SHA = "c" * 40 +SCANNER_NAME = "gitleaks" +SCANNER_VERSION = "8.24.0" +RULE_PACK_VERSION = "secret-rules-0.1.0" +SCANNER_CONFIG_HASH = "config_hash_synthetic" + + +class FakeIncrementalTable: + class ConditionalCheckFailedException(Exception): + pass + + def __init__(self) -> None: + self.items: list[dict] = [] + self.put_calls: list[dict] = [] + self.delete_calls: list[dict] = [] + self.query_calls: list[dict] = [] + self.scan_calls: list[dict] = [] + + def put_item(self, *, Item: dict, **kwargs) -> dict: # noqa: N803 + existing = self._get_existing(Item["PK"], Item["SK"]) + if not self._condition_allows(kwargs, existing): + raise self.ConditionalCheckFailedException("conditional check failed") + + self.items = [ + item + for item in self.items + if not (item.get("PK") == Item["PK"] and item.get("SK") == Item["SK"]) + ] + self.items.append(dict(Item)) + self.put_calls.append(dict(Item)) + return {"ResponseMetadata": {"HTTPStatusCode": 200}} + + def get_item(self, *, Key: dict) -> dict: # noqa: N803 + item = self._get_existing(Key["PK"], Key["SK"]) + return {"Item": dict(item)} if item else {} + + def delete_item(self, *, Key: dict, **kwargs) -> dict: # noqa: N803 + existing = self._get_existing(Key["PK"], Key["SK"]) + if not self._condition_allows(kwargs, existing): + raise self.ConditionalCheckFailedException("conditional check failed") + + self.items = [ + item + for item in self.items + if not (item.get("PK") == Key["PK"] and item.get("SK") == Key["SK"]) + ] + 
self.delete_calls.append(dict(Key)) + return {"ResponseMetadata": {"HTTPStatusCode": 200}} + + def query(self, **kwargs) -> dict: + self.query_calls.append(kwargs) + index_name = kwargs.get("IndexName") + pk_attr, sk_attr = ( + ("gsi1pk", "gsi1sk") + if index_name == "GSI1" + else ("gsi2pk", "gsi2sk") + if index_name == "GSI2" + else ("PK", "SK") + ) + values = kwargs["ExpressionAttributeValues"] + pk = values[":pk"] + sk_prefix = values.get(":sk_prefix") + items = [ + item + for item in self.items + if item.get(pk_attr) == pk + and (sk_prefix is None or str(item.get(sk_attr, "")).startswith(sk_prefix)) + ] + items.sort( + key=lambda item: item.get(sk_attr, ""), + reverse=not kwargs.get("ScanIndexForward", True), + ) + return {"Items": [dict(item) for item in items]} + + def scan(self, **kwargs) -> dict: + self.scan_calls.append(kwargs) + entity_type = kwargs["ExpressionAttributeValues"][":entity_type"] + return { + "Items": [ + dict(item) + for item in self.items + if item.get("entityType") == entity_type + ] + } + + def _get_existing(self, pk: str, sk: str) -> dict | None: + for item in self.items: + if item.get("PK") == pk and item.get("SK") == sk: + return item + return None + + def _condition_allows(self, kwargs: dict, existing: dict | None) -> bool: + expression = kwargs.get("ConditionExpression") + if expression is None: + return True + values = kwargs.get("ExpressionAttributeValues", {}) + + if expression == "attribute_not_exists(PK) AND attribute_not_exists(SK)": + return existing is None + if expression == "attribute_exists(PK) AND attribute_exists(SK)": + return existing is not None + if expression == "attribute_not_exists(PK) OR leaseUntil <= :now": + return existing is None or existing["leaseUntil"] <= values[":now"] + if expression == "workerId = :worker_id": + return existing is not None and existing.get("workerId") == values[":worker_id"] + if "nextAttemptAt <= :now" in expression: + if existing is None: + return False + return ( + 
existing.get("status") == values[":pending"] + and existing.get("nextAttemptAt") <= values[":now"] + ) or ( + existing.get("status") == values[":leased"] + and existing.get("leaseUntil", "") <= values[":now"] + ) + if "#status = :leased OR #status = :completed" in expression: + return existing is not None and existing.get("status") in { + values[":leased"], + values[":completed"], + } + raise AssertionError(f"unsupported condition expression: {expression}") + + +class FakeIncrementalResource: + def __init__(self, table: FakeIncrementalTable) -> None: + self.table = table + + def Table(self, table_name: str) -> FakeIncrementalTable: # noqa: N802 + return self.table + + +class FakeIncrementalClient: + class exceptions: + class ResourceInUseException(Exception): + pass + + +def _make_store() -> tuple[DynamoDbCompatibleFindingStore, FakeIncrementalTable]: + table = FakeIncrementalTable() + store = DynamoDbCompatibleFindingStore( + DynamoDbCompatibleConfig(table_name="SecurityScannerLocal"), + resource=FakeIncrementalResource(table), + client=FakeIncrementalClient(), + ) + return store, table + + +def _make_job( + *, + commit_sha: str = COMMIT_SHA, + status: str = "pending", + attempts: int = 0, + max_attempts: int = 3, + worker_id: str | None = None, + lease_until: dt.datetime | None = None, + next_attempt_at: dt.datetime = NOW, +) -> ScanJob: + job_id = scan_job_id_for( + repo_id=REPO_ID, + commit_sha=commit_sha, + scanner_name=SCANNER_NAME, + scanner_version=SCANNER_VERSION, + rule_pack_version=RULE_PACK_VERSION, + scanner_config_hash=SCANNER_CONFIG_HASH, + ) + return ScanJob( + job_id=job_id, + repo_id=REPO_ID, + repo_url=REPO_URL, + ref_name="refs/remotes/origin/main", + old_sha=OLD_SHA, + new_sha=NEW_SHA, + commit_sha=commit_sha, + commit_range=f"{OLD_SHA}..{NEW_SHA}", + scanner_name=SCANNER_NAME, + scanner_version=SCANNER_VERSION, + rule_pack_version=RULE_PACK_VERSION, + scanner_config_hash=SCANNER_CONFIG_HASH, + priority=100, + status=status, + 
attempts=attempts, + max_attempts=max_attempts, + worker_id=worker_id, + lease_until=lease_until, + next_attempt_at=next_attempt_at, + created_at=NOW, + updated_at=NOW, + ) + + +def _make_ledger(job: ScanJob) -> ScanLedgerEntry: + return ScanLedgerEntry( + repo_id=job.repo_id, + commit_sha=job.commit_sha, + scanner_name=job.scanner_name, + scanner_version=job.scanner_version, + rule_pack_version=job.rule_pack_version, + scanner_config_hash=job.scanner_config_hash, + scan_run_id=f"scan_run_{job.job_id}", + job_id=job.job_id, + scanned_at=NOW + dt.timedelta(minutes=5), + finding_count=1, + ) + + +def _make_finding(job: ScanJob) -> Finding: + return Finding.create( + repo_full_name="example-org/example-repo", + repo_commit=job.commit_sha, + rule_id="generic-api-key", + file_path="src/config.py", + line_start=12, + raw_secret="synthetic-value-for-hash", + source_tool=job.scanner_name, + source_tool_version=job.scanner_version, + scan_run_id=f"scan_run_{job.job_id}", + rule_pack_version=job.rule_pack_version, + ) + + +def test_incremental_item_mappers_round_trip_all_entity_types(): + ref_state = RefState( + repo_id=REPO_ID, + repo_url=REPO_URL, + ref_name="refs/remotes/origin/main", + last_seen_sha=NEW_SHA, + updated_at=NOW, + ) + job = _make_job() + ledger = _make_ledger(job) + repo_lease = RepoLease( + repo_id=REPO_ID, + worker_id="worker-a", + lease_until=NOW + dt.timedelta(seconds=60), + updated_at=NOW, + ) + + assert ref_state_from_item(ref_state_to_item(ref_state)) == ref_state + assert scan_job_from_item(scan_job_to_item(job)) == job + assert scan_ledger_entry_from_item(scan_ledger_entry_to_item(ledger)) == ledger + assert repo_lease_from_item(repo_lease_to_item(repo_lease)) == repo_lease + + lease_item = repo_lease_to_item(repo_lease) + assert lease_item["gsi1pk"] == "REPO_LEASE#ALL" + assert lease_item["gsi1sk"] == ( + f"{datetime_to_iso(repo_lease.lease_until)}#{REPO_ID}" + ) + + +def test_repo_id_and_job_id_are_deterministic_from_contract_fields(): + 
normalized = "https://github.com/example-org/example-repo" + expected_repo_id = "repo_" + hashlib.sha256( + normalized.encode("utf-8") + ).hexdigest()[:24] + + assert repo_id_for_scan_target_url(f"{normalized}/") == expected_repo_id + assert repo_id_for_scan_target_url(f"{normalized}?tab=code") == expected_repo_id + + first_job_id = scan_job_id_for( + repo_id=REPO_ID, + commit_sha=COMMIT_SHA, + scanner_name=SCANNER_NAME, + scanner_version=SCANNER_VERSION, + rule_pack_version=RULE_PACK_VERSION, + scanner_config_hash=SCANNER_CONFIG_HASH, + ) + second_job_id = scan_job_id_for( + repo_id=REPO_ID, + commit_sha=COMMIT_SHA, + scanner_name=SCANNER_NAME, + scanner_version=SCANNER_VERSION, + rule_pack_version=RULE_PACK_VERSION, + scanner_config_hash=SCANNER_CONFIG_HASH, + ) + assert first_job_id == second_job_id + + different_config_job_id = scan_job_id_for( + repo_id=REPO_ID, + commit_sha=COMMIT_SHA, + scanner_name=SCANNER_NAME, + scanner_version=SCANNER_VERSION, + rule_pack_version=RULE_PACK_VERSION, + scanner_config_hash="different_config_hash", + ) + assert different_config_job_id != first_job_id + + +def test_enqueue_commit_scan_job_is_idempotent_and_skips_existing_ledger(): + store, table = _make_store() + job = _make_job() + + assert store.enqueue_commit_scan_job(job) is True + assert store.enqueue_commit_scan_job(job) is False + assert len([item for item in table.items if item["entityType"] == "SCAN_JOB"]) == 1 + + completed_job = _make_job(commit_sha="d" * 40) + table.put_item(Item=scan_ledger_entry_to_item(_make_ledger(completed_job))) + + assert store.enqueue_commit_scan_job(completed_job) is False + assert len([item for item in table.items if item["entityType"] == "SCAN_JOB"]) == 1 + + +def test_lease_next_scan_job_is_conditional_for_pending_jobs(): + store, _ = _make_store() + job = _make_job() + store.enqueue_commit_scan_job(job) + + leased = store.lease_next_scan_job( + worker_id="worker-a", + lease_seconds=60, + now=NOW, + ) + assert leased is not None + 
assert leased.status == "leased" + assert leased.worker_id == "worker-a" + + assert ( + store.lease_next_scan_job(worker_id="worker-b", lease_seconds=60, now=NOW) + is None + ) + + +def test_expired_job_lease_can_be_reclaimed(): + store, table = _make_store() + expired = _make_job( + status="leased", + worker_id="worker-a", + lease_until=NOW - dt.timedelta(seconds=1), + ) + table.put_item(Item=scan_job_to_item(expired)) + + reclaimed = store.lease_next_scan_job( + worker_id="worker-b", + lease_seconds=120, + now=NOW, + ) + + assert reclaimed is not None + assert reclaimed.worker_id == "worker-b" + assert reclaimed.lease_until == NOW + dt.timedelta(seconds=120) + + +def test_repo_lease_acquire_release_and_expired_count_are_conditional(): + store, table = _make_store() + + assert store.acquire_repo_lease(REPO_ID, "worker-a", lease_seconds=60) is True + assert store.acquire_repo_lease(REPO_ID, "worker-b", lease_seconds=60) is False + + store.release_repo_lease(REPO_ID, "worker-b") + assert table.get_item(Key={"PK": f"REPO_LEASE#{REPO_ID}", "SK": "META"}).get("Item") + + store.release_repo_lease(REPO_ID, "worker-a") + assert not table.get_item(Key={"PK": f"REPO_LEASE#{REPO_ID}", "SK": "META"}) + + expired = RepoLease( + repo_id=REPO_ID, + worker_id="worker-old", + lease_until=dt.datetime(2000, 1, 1, tzinfo=dt.UTC), + updated_at=dt.datetime(2000, 1, 1, tzinfo=dt.UTC), + ) + table.put_item(Item=repo_lease_to_item(expired)) + assert store.acquire_repo_lease(REPO_ID, "worker-c", lease_seconds=60) is True + + status = store.get_queue_status(now=dt.datetime(2100, 1, 1, tzinfo=dt.UTC)) + assert status.expired_repo_leases == 1 + + +def test_retryable_failure_returns_pending_then_attempts_exhaust_to_dead_letter(): + store, _ = _make_store() + job = _make_job(max_attempts=2) + store.enqueue_commit_scan_job(job) + leased = store.lease_next_scan_job("worker-a", lease_seconds=60, now=NOW) + assert leased is not None + + store.record_retryable_failure( + leased.job_id, + 
error="synthetic retryable failure", + next_attempt_at=NOW + dt.timedelta(minutes=5), + ) + retry_job = store.lease_next_scan_job( + "worker-b", + lease_seconds=60, + now=NOW + dt.timedelta(minutes=5), + ) + + assert retry_job is not None + assert retry_job.attempts == 1 + assert retry_job.last_error == "synthetic retryable failure" + + store.record_retryable_failure( + retry_job.job_id, + error="synthetic exhausted failure", + next_attempt_at=NOW + dt.timedelta(minutes=10), + ) + + status = store.get_queue_status(now=NOW + dt.timedelta(minutes=10)) + assert status.job_counts_by_status == {"dead_letter": 1} + + +def test_return_job_to_pending_does_not_increment_attempts(): + store, table = _make_store() + job = _make_job() + store.enqueue_commit_scan_job(job) + leased = store.lease_next_scan_job("worker-a", lease_seconds=60, now=NOW) + assert leased is not None + + store.return_job_to_pending(leased.job_id, reason="repo lease unavailable") + + returned_item = table.get_item( + Key={"PK": f"SCAN_JOB#{leased.job_id}", "SK": "META"} + )["Item"] + returned = scan_job_from_item(returned_item) + assert returned.status == "pending" + assert returned.attempts == 0 + assert returned.worker_id is None + assert returned.lease_until is None + assert returned.last_error == "repo lease unavailable" + + +def test_queue_status_counts_mixed_jobs_and_expired_leases(): + store, table = _make_store() + pending = _make_job(commit_sha="1" * 40) + leased_active = _make_job(commit_sha="2" * 40) + leased_expired = _make_job(commit_sha="3" * 40) + completed = _make_job(commit_sha="4" * 40) + dead_letter = _make_job(commit_sha="5" * 40) + + for job in ( + pending, + ScanJob( + **{ + **leased_active.__dict__, + "status": "leased", + "worker_id": "worker-active", + "lease_until": NOW + dt.timedelta(minutes=5), + } + ), + ScanJob( + **{ + **leased_expired.__dict__, + "status": "leased", + "worker_id": "worker-expired", + "lease_until": NOW - dt.timedelta(minutes=1), + } + ), + 
ScanJob(**{**completed.__dict__, "status": "completed"}), + ScanJob(**{**dead_letter.__dict__, "status": "dead_letter"}), + ): + table.put_item(Item=scan_job_to_item(job)) + + table.put_item( + Item=repo_lease_to_item( + RepoLease( + repo_id=REPO_ID, + worker_id="worker-active", + lease_until=NOW + dt.timedelta(minutes=5), + updated_at=NOW, + ) + ) + ) + table.put_item( + Item=repo_lease_to_item( + RepoLease( + repo_id="repo_synthetic000000000002", + worker_id="worker-expired", + lease_until=NOW - dt.timedelta(minutes=1), + updated_at=NOW, + ) + ) + ) + + status = store.get_queue_status(now=NOW) + + assert status.job_counts_by_status == { + "pending": 1, + "leased": 2, + "completed": 1, + "dead_letter": 1, + } + assert status.expired_job_leases == 1 + assert status.expired_repo_leases == 1 + + +def test_complete_processed_job_writes_findings_ledger_then_completed_job(): + store, table = _make_store() + job = _make_job() + store.enqueue_commit_scan_job(job) + leased = store.lease_next_scan_job("worker-a", lease_seconds=60, now=NOW) + assert leased is not None + ledger = _make_ledger(leased) + finding = _make_finding(leased) + + before = len(table.put_calls) + store.complete_processed_job(leased, findings=[finding], ledger=ledger) + written_types = [item["entityType"] for item in table.put_calls[before:]] + + assert written_types == [ + "FINDING", + "FINDING_OBSERVATION", + "FINDING_STATE", + "SCAN_LEDGER", + "SCAN_JOB", + ] + assert store.has_scan_ledger(ledger.key) is True + status = store.get_queue_status(now=NOW + dt.timedelta(minutes=10)) + assert status.job_counts_by_status == {"completed": 1} + + +def test_ledger_present_leased_job_completes_without_rewriting_findings(): + store, table = _make_store() + job = _make_job() + store.enqueue_commit_scan_job(job) + leased = store.lease_next_scan_job("worker-a", lease_seconds=60, now=NOW) + assert leased is not None + ledger = _make_ledger(leased) + table.put_item(Item=scan_ledger_entry_to_item(ledger)) + + before = 
len(table.put_calls) + store.complete_processed_job(leased, findings=[_make_finding(leased)], ledger=ledger) + written_types = [item["entityType"] for item in table.put_calls[before:]] + + assert written_types == ["SCAN_JOB"] + status = store.get_queue_status(now=NOW + dt.timedelta(minutes=10)) + assert status.job_counts_by_status == {"completed": 1} diff --git a/tests/test_manifest.py b/tests/test_manifest.py index 6249359..e429a99 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -144,6 +144,22 @@ def test_scan_defaults_when_section_absent(tmp_path: Path) -> None: assert m.scan.exclude == [] +def test_scan_git_log_opts_is_optional_string(tmp_path: Path) -> None: + yaml_content = """\ + version: 1 + targets: + - name: demo-commit + path: /tmp/demo-commit + scan: + include_history: true + git_log_opts: abc123^! + """ + p = write_yaml(tmp_path, yaml_content) + m = load_manifest(p) + + assert m.scan.git_log_opts == "abc123^!" + + def test_scan_enable_noise_filter_can_be_disabled(tmp_path: Path) -> None: yaml_content = """\ version: 1 @@ -161,6 +177,21 @@ def test_scan_enable_noise_filter_can_be_disabled(tmp_path: Path) -> None: assert m.scan.enable_noise_filter is False +def test_scan_git_log_opts_non_string_raises(tmp_path: Path) -> None: + yaml_content = """\ + version: 1 + targets: + - name: demo-bad-log-opts + path: /tmp/demo-bad-log-opts + scan: + git_log_opts: 123 + """ + p = write_yaml(tmp_path, yaml_content) + + with pytest.raises(ManifestError, match="scan.git_log_opts"): + load_manifest(p) + + def test_gitleaks_config_non_empty(tmp_path: Path) -> None: """A non-empty gitleaks_config string is preserved.""" yaml_content = """\ diff --git a/tests/test_queue_status.py b/tests/test_queue_status.py new file mode 100644 index 0000000..09dcaf0 --- /dev/null +++ b/tests/test_queue_status.py @@ -0,0 +1,127 @@ +"""Tests for queue-status runtime and CLI.""" + +from __future__ import annotations + +import datetime as dt + +from security_scanner.cli 
import main +from security_scanner.runtime.queue_status import ( + QueueStatusRequest, + read_queue_status, + render_queue_status, +) +from security_scanner.storage.base import QueueStatus + + +NOW = dt.datetime(2026, 6, 12, 14, 0, tzinfo=dt.UTC) + + +class FakeQueueStatusStore: + def __init__(self, status: QueueStatus) -> None: + self.status = status + self.now_values: list[dt.datetime] = [] + + def get_queue_status(self, now: dt.datetime) -> QueueStatus: + self.now_values.append(now) + return self.status + + +def test_read_queue_status_passes_utc_now_to_store(): + status = QueueStatus( + job_counts_by_status={"pending": 2}, + expired_job_leases=1, + expired_repo_leases=0, + ) + store = FakeQueueStatusStore(status) + + observed = read_queue_status( + QueueStatusRequest(store=store, now_factory=lambda: NOW.replace(tzinfo=None)) + ) + + assert observed == status + assert store.now_values == [NOW] + + +def test_render_queue_status_groups_jobs_and_expired_leases(): + rendered = render_queue_status( + QueueStatus( + job_counts_by_status={ + "pending": 2, + "leased": 1, + "completed": 3, + "dead_letter": 4, + }, + expired_job_leases=5, + expired_repo_leases=6, + ) + ) + + assert rendered == ( + "pending: 2\n" + "leased: 1\n" + "completed: 3\n" + "dead_letter: 4\n" + "expired job leases: 5\n" + "expired repo leases: 6\n" + ) + + +def test_render_queue_status_includes_zeroes_for_missing_known_statuses(): + rendered = render_queue_status( + QueueStatus( + job_counts_by_status={"pending": 1}, + expired_job_leases=0, + expired_repo_leases=0, + ) + ) + + assert "leased: 0" in rendered + assert "completed: 0" in rendered + assert "dead_letter: 0" in rendered + + +def test_queue_status_cli_defaults_to_dynamodb(monkeypatch, capsys): + store = FakeQueueStatusStore( + QueueStatus( + job_counts_by_status={"pending": 2, "completed": 1}, + expired_job_leases=1, + expired_repo_leases=0, + ) + ) + monkeypatch.setattr( + "security_scanner.cli.app.create_finding_store", + lambda backend, 
**kwargs: store, + ) + + exit_code = main(["queue-status"]) + + captured = capsys.readouterr() + assert exit_code == 0 + assert "pending: 2" in captured.out + assert "completed: 1" in captured.out + assert "expired job leases: 1" in captured.out + + +def test_queue_status_rejects_jsonl_backend(capsys): + exit_code = main(["queue-status", "--storage-backend", "jsonl"]) + + captured = capsys.readouterr() + assert exit_code == 2 + assert "dynamodb only" in captured.err + + +def test_queue_status_storage_failure_exits_one(monkeypatch, capsys): + class BrokenStore: + def get_queue_status(self, now: dt.datetime) -> QueueStatus: + raise RuntimeError("synthetic queue status failure") + + monkeypatch.setattr( + "security_scanner.cli.app.create_finding_store", + lambda backend, **kwargs: BrokenStore(), + ) + + exit_code = main(["queue-status"]) + + captured = capsys.readouterr() + assert exit_code == 1 + assert "synthetic queue status failure" in captured.err diff --git a/tests/test_quickstart.py b/tests/test_quickstart.py new file mode 100644 index 0000000..f64ef26 --- /dev/null +++ b/tests/test_quickstart.py @@ -0,0 +1,214 @@ +"""Tests for turnkey quickstart runtime.""" + +from __future__ import annotations + +import datetime as dt +from collections import Counter +from pathlib import Path +from typing import Sequence + +from security_scanner.catalog.scan_target import ScanTarget +from security_scanner.core.finding.model import Finding +from security_scanner.runtime.incremental_discovery import DiscoveryScannerConfig, GitRef +from security_scanner.runtime.quickstart import QuickstartRequest, run_quickstart +from security_scanner.runtime.scan_worker import ScanWorkerRequest +from security_scanner.storage.base import ( + QueueStatus, + RefState, + RepoLease, + ScanJob, + ScanLedgerEntry, + ScanLedgerKey, +) + + +NOW = dt.datetime(2026, 6, 12, 16, 0, tzinfo=dt.UTC) +TARGET_URL = "https://github.com/example-org/example-repo" +COMMIT_SHA = "9" * 40 + + +class QuickstartStore: + 
def __init__(self) -> None: + self.bootstrap_calls = 0 + self.targets: list[ScanTarget] = [] + self.ref_states: dict[tuple[str, str], RefState] = {} + self.jobs: dict[str, ScanJob] = {} + self.ledger: dict[ScanLedgerKey, ScanLedgerEntry] = {} + self.repo_leases: dict[str, RepoLease] = {} + + def bootstrap(self) -> None: + self.bootstrap_calls += 1 + + def put_scan_target(self, target: ScanTarget) -> None: + self.targets = [existing for existing in self.targets if existing.url != target.url] + self.targets.append(target) + + def list_scan_targets(self) -> list[ScanTarget]: + return list(self.targets) + + def get_ref_state(self, repo_id: str, ref_name: str) -> RefState | None: + return self.ref_states.get((repo_id, ref_name)) + + def put_ref_state(self, state: RefState) -> None: + self.ref_states[(state.repo_id, state.ref_name)] = state + + def has_scan_ledger(self, key: ScanLedgerKey) -> bool: + return key in self.ledger + + def enqueue_commit_scan_job(self, job: ScanJob) -> bool: + if job.job_id in self.jobs: + return False + self.jobs[job.job_id] = job + return True + + def lease_next_scan_job( + self, + worker_id: str, + lease_seconds: int, + now: dt.datetime, + ) -> ScanJob | None: + for job in self.jobs.values(): + if job.status != "pending": + continue + leased = ScanJob( + **{ + **job.__dict__, + "status": "leased", + "worker_id": worker_id, + "lease_until": now + dt.timedelta(seconds=lease_seconds), + } + ) + self.jobs[job.job_id] = leased + return leased + return None + + def complete_processed_job( + self, + job: ScanJob, + findings: Sequence[Finding], + ledger: ScanLedgerEntry, + ) -> None: + self.ledger[ledger.key] = ledger + self.jobs[job.job_id] = ScanJob( + **{ + **job.__dict__, + "status": "completed", + "worker_id": None, + "lease_until": None, + } + ) + + def record_retryable_failure( + self, + job_id: str, + error: str, + next_attempt_at: dt.datetime, + ) -> None: + raise AssertionError(error) + + def return_job_to_pending(self, job_id: str, 
reason: str) -> None: + raise AssertionError(reason) + + def acquire_repo_lease(self, repo_id: str, worker_id: str, lease_seconds: int) -> bool: + self.repo_leases[repo_id] = RepoLease( + repo_id=repo_id, + worker_id=worker_id, + lease_until=NOW + dt.timedelta(seconds=lease_seconds), + updated_at=NOW, + ) + return True + + def release_repo_lease(self, repo_id: str, worker_id: str) -> None: + self.repo_leases.pop(repo_id, None) + + def get_queue_status(self, now: dt.datetime) -> QueueStatus: + return QueueStatus( + job_counts_by_status=dict(Counter(job.status for job in self.jobs.values())), + expired_job_leases=0, + expired_repo_leases=0, + ) + + +class FakeGit: + def __init__(self, repo_path: Path) -> None: + self.repo_path = repo_path + + def fetch(self, repo_path: Path) -> None: + return None + + def list_remote_refs(self, repo_path: Path, patterns) -> list[GitRef]: + assert repo_path == self.repo_path + return [GitRef(ref_name="refs/remotes/origin/main", commit_sha=COMMIT_SHA)] + + def is_ancestor(self, repo_path: Path, old_sha: str, new_sha: str) -> bool: + return True + + def list_new_commits(self, repo_path: Path, old_sha: str, new_sha: str) -> list[str]: + return [] + + +class FakeScanner: + def scan(self, **kwargs) -> list[Finding]: + return [ + Finding.create( + repo_full_name=kwargs["repo_full_name"], + rule_id="generic-api-key", + file_path="src/config.py", + line_start=1, + raw_secret="synthetic-value-for-hash", + source_tool="gitleaks", + scan_run_id=kwargs["scan_run_id"], + rule_pack_version=kwargs["rule_pack_version"], + ) + ] + + +def test_quickstart_bootstraps_enqueues_current_tip_and_runs_worker(tmp_path): + repo_path = tmp_path / "repo" + repo_path.mkdir() + store = QuickstartStore() + store.put_scan_target( + ScanTarget( + url="https://github.com/example-org/other-repo", + name="example-org/other-repo", + enabled=True, + ) + ) + + def make_worker_request(max_jobs: int) -> ScanWorkerRequest: + return ScanWorkerRequest( + store=store, + 
fetch_repo=lambda url: repo_path, + scanner=FakeScanner(), + max_jobs=max_jobs, + worker_id="quickstart-test", + now_factory=lambda: NOW, + ) + + summary = run_quickstart( + QuickstartRequest( + target_url=TARGET_URL, + target_name=None, + store=store, + fetch_repo=lambda url: repo_path, + git=FakeGit(repo_path), + scanner_config=DiscoveryScannerConfig( + scanner_name="gitleaks", + scanner_version="unknown", + rule_pack_version="secret-rules-0.1.0", + scanner_config_hash="default", + ), + worker_request_factory=make_worker_request, + now_factory=lambda: NOW, + ) + ) + + assert store.bootstrap_calls == 1 + assert summary.initialized.refs_observed == 1 + assert summary.target_url == TARGET_URL + assert summary.current_jobs_enqueued == 1 + assert summary.enqueued.jobs_enqueued == 0 + assert summary.worker is not None + assert summary.worker.leased == 1 + assert summary.worker.completed == 1 + assert summary.status.job_counts_by_status == {"completed": 1} diff --git a/tests/test_scan_worker.py b/tests/test_scan_worker.py new file mode 100644 index 0000000..e0bf197 --- /dev/null +++ b/tests/test_scan_worker.py @@ -0,0 +1,296 @@ +"""Tests for incremental scan-worker runtime.""" + +from __future__ import annotations + +import datetime as dt +from pathlib import Path + +from security_scanner.core.finding.model import Finding +from security_scanner.core.scan.options import ScanOptions +from security_scanner.runtime.scan_worker import ( + ScanWorkerRequest, + run_scan_worker_once, +) +from security_scanner.storage.base import ScanJob, ScanLedgerEntry, ScanLedgerKey + + +NOW = dt.datetime(2026, 6, 12, 12, 0, tzinfo=dt.UTC) +REPO_ID = "repo_synthetic000000000001" +REPO_URL = "https://github.com/example-org/example-repo" +COMMIT_SHA = "a" * 40 + + +class FakeWorkerStore: + def __init__(self, jobs: list[ScanJob] | None = None) -> None: + self.jobs = list(jobs or []) + self.ledger_keys: set[ScanLedgerKey] = set() + self.completed: list[tuple[ScanJob, list[Finding], 
ScanLedgerEntry]] = [] + self.retry_failures: list[tuple[str, str, dt.datetime]] = [] + self.pending_returns: list[tuple[str, str]] = [] + self.repo_lease_available = True + self.repo_lease_calls: list[tuple[str, str, int]] = [] + self.repo_release_calls: list[tuple[str, str]] = [] + self.lease_calls = 0 + + def lease_next_scan_job( + self, + worker_id: str, + lease_seconds: int, + now: dt.datetime, + ) -> ScanJob | None: + self.lease_calls += 1 + if not self.jobs: + return None + job = self.jobs.pop(0) + return ScanJob( + **{ + **job.__dict__, + "status": "leased", + "worker_id": worker_id, + "lease_until": now + dt.timedelta(seconds=lease_seconds), + } + ) + + def has_scan_ledger(self, key: ScanLedgerKey) -> bool: + return key in self.ledger_keys + + def acquire_repo_lease( + self, + repo_id: str, + worker_id: str, + lease_seconds: int, + ) -> bool: + self.repo_lease_calls.append((repo_id, worker_id, lease_seconds)) + return self.repo_lease_available + + def release_repo_lease(self, repo_id: str, worker_id: str) -> None: + self.repo_release_calls.append((repo_id, worker_id)) + + def complete_processed_job( + self, + job: ScanJob, + findings, + ledger: ScanLedgerEntry, + ) -> None: + self.completed.append((job, list(findings), ledger)) + self.ledger_keys.add(ledger.key) + + def record_retryable_failure( + self, + job_id: str, + error: str, + next_attempt_at: dt.datetime, + ) -> None: + self.retry_failures.append((job_id, error, next_attempt_at)) + + def return_job_to_pending(self, job_id: str, reason: str) -> None: + self.pending_returns.append((job_id, reason)) + + +class FakeScanner: + def __init__( + self, + findings: list[Finding] | None = None, + error: Exception | None = None, + ) -> None: + self.findings = findings or [] + self.error = error + self.calls: list[dict] = [] + + def scan(self, **kwargs) -> list[Finding]: + self.calls.append(kwargs) + if self.error is not None: + raise self.error + return list(self.findings) + + +def _job(*, attempts: int = 0, 
max_attempts: int = 3) -> ScanJob: + return ScanJob( + job_id="scan_job_synthetic", + repo_id=REPO_ID, + repo_url=REPO_URL, + ref_name="refs/remotes/origin/main", + old_sha="0" * 40, + new_sha=COMMIT_SHA, + commit_sha=COMMIT_SHA, + commit_range=f"{'0' * 40}..{COMMIT_SHA}", + scanner_name="gitleaks", + scanner_version="unknown", + rule_pack_version="secret-rules-0.1.0", + scanner_config_hash="default", + priority=100, + status="pending", + attempts=attempts, + max_attempts=max_attempts, + worker_id=None, + lease_until=None, + next_attempt_at=NOW, + created_at=NOW, + updated_at=NOW, + ) + + +def _job_with_id(job_id: str, commit_sha: str) -> ScanJob: + job = _job() + return ScanJob( + **{ + **job.__dict__, + "job_id": job_id, + "commit_sha": commit_sha, + "new_sha": commit_sha, + "commit_range": f"{'0' * 40}..{commit_sha}", + } + ) + + +def _finding(commit: str | None = None) -> Finding: + return Finding.create( + repo_full_name=REPO_ID, + repo_commit=commit, + rule_id="generic-api-key", + file_path="src/config.py", + line_start=10, + raw_secret="synthetic-value-for-hash", + source_tool="gitleaks", + scan_run_id="scan_run_scan_job_synthetic", + rule_pack_version="secret-rules-0.1.0", + ) + + +def _request(store: FakeWorkerStore, scanner: FakeScanner, fetch_repo=None): + return ScanWorkerRequest( + store=store, + fetch_repo=fetch_repo or (lambda url: Path("/synthetic-cache/example-repo")), + scanner=scanner, + max_jobs=1, + lease_seconds=60, + worker_id="worker-a", + now_factory=lambda: NOW, + ) + + +def test_empty_queue_exits_without_work(): + store = FakeWorkerStore() + scanner = FakeScanner() + + summary = run_scan_worker_once(_request(store, scanner)) + + assert summary.leased == 0 + assert summary.completed == 0 + assert scanner.calls == [] + + +def test_one_pending_job_is_scanned_and_completed_with_commit_log_opts(): + finding = _finding(commit=None) + store = FakeWorkerStore([_job()]) + scanner = FakeScanner(findings=[finding]) + + summary = 
run_scan_worker_once(_request(store, scanner)) + + assert summary.leased == 1 + assert summary.completed == 1 + assert summary.retryable == 0 + assert summary.dead_lettered == 0 + call = scanner.calls[0] + assert call["root"] == Path("/synthetic-cache/example-repo") + assert call["repo_full_name"] == REPO_ID + assert call["scan_run_id"] == "scan_run_scan_job_synthetic" + assert call["rule_pack_version"] == "secret-rules-0.1.0" + assert call["scan_options"] == ScanOptions( + include_history=True, + git_log_opts=f"{COMMIT_SHA}^!", + ) + completed_job, findings, ledger = store.completed[0] + assert completed_job.commit_sha == COMMIT_SHA + assert findings[0].repo.commit == COMMIT_SHA + assert ledger.commit_sha == COMMIT_SHA + assert ledger.finding_count == 1 + assert store.repo_release_calls == [(REPO_ID, "worker-a")] + + +def test_repo_lease_failure_returns_job_to_pending_without_scanner_or_attempt(): + store = FakeWorkerStore([_job()]) + store.repo_lease_available = False + scanner = FakeScanner() + + summary = run_scan_worker_once(_request(store, scanner)) + + assert summary.leased == 1 + assert summary.completed == 0 + assert scanner.calls == [] + assert store.pending_returns == [("scan_job_synthetic", "repo lease unavailable")] + assert store.retry_failures == [] + assert store.repo_release_calls == [] + + +def test_repo_lease_failure_stops_current_once_loop(): + first = _job_with_id("scan_job_first", "1" * 40) + second = _job_with_id("scan_job_second", "2" * 40) + store = FakeWorkerStore([first, second]) + store.repo_lease_available = False + scanner = FakeScanner() + request = ScanWorkerRequest( + store=store, + fetch_repo=lambda url: Path("/synthetic-cache/example-repo"), + scanner=scanner, + max_jobs=2, + lease_seconds=60, + worker_id="worker-a", + now_factory=lambda: NOW, + ) + + summary = run_scan_worker_once(request) + + assert summary.leased == 1 + assert store.lease_calls == 1 + assert store.pending_returns == [("scan_job_first", "repo lease 
unavailable")] + assert scanner.calls == [] + + +def test_scanner_failure_records_retryable_failure_and_releases_repo_lease(): + store = FakeWorkerStore([_job(attempts=0, max_attempts=3)]) + scanner = FakeScanner(error=RuntimeError("synthetic scanner failure")) + + summary = run_scan_worker_once(_request(store, scanner)) + + assert summary.retryable == 1 + assert summary.dead_lettered == 0 + job_id, error, next_attempt_at = store.retry_failures[0] + assert job_id == "scan_job_synthetic" + assert "synthetic scanner failure" in error + assert next_attempt_at == NOW + dt.timedelta(seconds=60) + assert store.repo_release_calls == [(REPO_ID, "worker-a")] + + +def test_attempts_exhausted_is_reported_as_dead_letter(): + store = FakeWorkerStore([_job(attempts=2, max_attempts=3)]) + scanner = FakeScanner(error=RuntimeError("synthetic terminal failure")) + + summary = run_scan_worker_once(_request(store, scanner)) + + assert summary.retryable == 0 + assert summary.dead_lettered == 1 + assert summary.has_permanent_failure is True + assert store.retry_failures[0][0] == "scan_job_synthetic" + + +def test_ledger_present_job_completes_without_fetching_or_scanning(): + job = _job() + store = FakeWorkerStore([job]) + store.ledger_keys.add(job.ledger_key) + scanner = FakeScanner() + fetch_calls: list[str] = [] + + summary = run_scan_worker_once( + _request( + store, + scanner, + fetch_repo=lambda url: (fetch_calls.append(url), Path("/unused"))[1], + ) + ) + + assert summary.completed == 1 + assert scanner.calls == [] + assert fetch_calls == [] + assert store.repo_lease_calls == [] + assert store.completed[0][2].job_id == job.job_id