Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions docs/views/research-and-technical-decisions.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,14 @@
## 공개 문서에 남길 수 있는 범위

공개 문서는 tool role과 decision rationale만 설명합니다. 비공개 benchmark data, 민감한 alert data, internal repository context, private provider endpoint는 제외합니다.

## 노이즈 필터 위치 결정

| 필터 위치 후보 | 장점 | 단점 | 선택 여부 |
| --- | --- | --- | --- |
| **Gitleaks Parser 단** | - 불필요한 노이즈가 조기에 필터링되어 스토리지 저장 및 검증 비용 절감<br>- 파싱 단계에서 스키마 매핑 전에 간결하게 걸러낼 수 있음 | - 원본 Gitleaks 리포트에 어떤 노이즈가 포함되었는지 기록이 남지 않음 (로그로만 남음) | **선택** |
| **NoSQL Storage 저장 단** | - 원본 파싱 데이터는 유지할 수 있고, 저장 시점에 선택적 필터링 가능 | - 저장소 로직이 무거워지고 불필요한 파싱 데이터 객체 생성이 발생함 | 미선택 |
| **LLM Verifier 단** | - LLM의 컨텍스트를 활용한 고차원 필터링 가능 | - API 호출 비용 및 latency 증가, 비용 낭비 발생 | 미선택 |

### 선택 이유
Gitleaks 결과 파싱 단계에서 노이즈를 걸러내면 불필요한 NoSQL Storage DB 쓰기 및 LLM Verifier 검증 비용을 최소화하고 스캐닝 파이프라인의 효율성을 극대화할 수 있으므로, Gitleaks Parser 단을 필터 위치로 선택했습니다.
6 changes: 6 additions & 0 deletions src/security_scanner/core/scan/options.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,13 @@ class ScanOptions:
Glob patterns for paths to exclude from the scan.
NOTE(phase2): These are accepted but not yet applied to gitleaks
command args — future milestone will map them to gitleaks allowlist flags.
enable_noise_filter:
When True (default), parser-level Gitleaks noise filtering removes
low-signal candidates before storage and optional verifier steps.
When False, all Gitleaks report items that map successfully are passed
through, which may increase false positives and output volume.
"""

include_history: bool = True
exclude: list[str] = field(default_factory=list)
enable_noise_filter: bool = True
94 changes: 94 additions & 0 deletions src/security_scanner/scanners/gitleaks/filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
"""Noise filter implementation for Gitleaks findings."""

from __future__ import annotations

import math
import re
from collections import Counter

# Template placeholder patterns (e.g., ${VAR}, {{secret}}, <VAR>, [VAR], %VAR%, __VAR__).
# The pattern is anchored with ^ and $, so it only matches when the secret is a
# single placeholder and nothing else. Placeholder names may contain ASCII
# letters, digits, underscores and hyphens.
TEMPLATE_PATTERN = re.compile(
r"^("
r"\$\{[a-zA-Z0-9_-]+\}"
r"|\{\{[a-zA-Z0-9_-]+\}\}"
r"|<[a-zA-Z0-9_-]+>"
r"|\[[a-zA-Z0-9_-]+\]"
r"|%[a-zA-Z0-9_-]+%"
r"|__[a-zA-Z0-9_-]+__"
r")$"
)

# Known dummy values. Entries are stored in lowercase; noise_reason() lowercases
# the secret before the membership test, which makes the lookup case-insensitive.
KNOWN_DUMMY_VALUES = {
"your_api_key",
"changeme",
"insert-token-here",
}

# False-negative prevention pattern: "AKIA" followed by exactly 16 uppercase
# letters/digits, or "ghp_" followed by at least 36 ASCII alphanumerics.
# noise_reason() checks this before any noise rule and returns None on a match,
# so values with these shapes are never filtered.
FALSE_NEGATIVE_PATTERN = re.compile(r"^(AKIA[A-Z0-9]{16}|ghp_[a-zA-Z0-9]{36,})$")


def calculate_entropy(s: str) -> float:
    """Return the Shannon entropy of ``s`` in bits per character.

    The empty string has an entropy of 0.0. Otherwise the result is
    ``-sum(p * log2(p))`` over the relative frequency ``p`` of each distinct
    character in ``s``.
    """
    if not s:
        return 0.0

    length = len(s)
    bits = 0.0
    # Accumulate term by term so the floating-point result does not depend on
    # any summation helper's rounding strategy.
    for occurrences in Counter(s).values():
        probability = occurrences / length
        bits -= probability * math.log2(probability)
    return bits


def noise_reason(item: dict) -> str | None:
    """Classify a Gitleaks finding and name the noise rule it matched, if any.

    Parameters
    ----------
    item : dict
        A single Gitleaks JSON finding item. Only the 'Secret' key is read.

    Returns
    -------
    str | None
        A fixed label identifying the matched noise rule (the label never
        contains the secret itself), or None when the item should be kept.
    """
    candidate = item.get("Secret", "")
    # A missing, empty, or non-string secret cannot be evaluated further.
    if not (isinstance(candidate, str) and candidate):
        return "empty-secret"

    # Token-shaped values are protected first so that none of the noise rules
    # below can discard them.
    if FALSE_NEGATIVE_PATTERN.match(candidate) is not None:
        return None

    # The whole secret is a single template placeholder.
    if TEMPLATE_PATTERN.match(candidate) is not None:
        return "template-placeholder"

    # Known dummy values are stored in lowercase; compare case-insensitively.
    if candidate.lower() in KNOWN_DUMMY_VALUES:
        return "known-dummy-value"

    # The secret consists of one distinct character (it is non-empty here).
    if len(set(candidate)) == 1:
        return "repeated-character"

    size = len(candidate)
    if size <= 5:
        return "short-secret"

    # Secrets of 6-9 characters need an entropy of at least 1.8; secrets of
    # 10 or more characters need at least 2.5.
    if size < 10:
        threshold, label = 1.8, "low-entropy-short-secret"
    else:
        threshold, label = 2.5, "low-entropy-secret"
    if calculate_entropy(candidate) < threshold:
        return label

    return None


def should_filter_item(item: dict) -> bool:
    """Return True when ``noise_reason`` reports any noise rule for ``item``."""
    reason = noise_reason(item)
    return reason is not None
17 changes: 17 additions & 0 deletions src/security_scanner/scanners/gitleaks/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from pathlib import Path

from security_scanner.core.finding.model import Finding
from security_scanner.core.scan.options import ScanOptions
from security_scanner.scanners.gitleaks.filter import noise_reason
from security_scanner.scanners.gitleaks.mapper import map_gitleaks_item


Expand All @@ -25,6 +27,7 @@ def parse_gitleaks_report(
rule_pack_version: str,
source_root: Path | None = None,
source_tool: str = "gitleaks",
scan_options: ScanOptions | None = None,
) -> list[Finding]:
"""Parse a Gitleaks JSON report string into core Finding objects."""
if not raw_json or not raw_json.strip():
Expand All @@ -47,10 +50,24 @@ def parse_gitleaks_report(
return []

findings: list[Finding] = []
enable_noise_filter = scan_options.enable_noise_filter if scan_options is not None else True

for index, item in enumerate(data):
if not isinstance(item, dict):
logger.warning("GitleaksParser: skipping non-dict item at index %d", index)
continue

reason = noise_reason(item) if enable_noise_filter else None
if reason is not None:
logger.debug(
"GitleaksParser: filtering out noise item at index %d "
"for rule %s: %s",
index,
item.get("RuleID", "<unknown>"),
reason,
)
continue

finding = map_gitleaks_item(
item,
repo_full_name=repo_full_name,
Expand Down
1 change: 1 addition & 0 deletions src/security_scanner/scanners/gitleaks/scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,4 +114,5 @@ def scan(
rule_pack_version=rule_pack_version,
source_root=root,
source_tool=self.name,
scan_options=scan_options,
)
12 changes: 11 additions & 1 deletion src/security_scanner/targets/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,11 +111,17 @@ def _parse_scan(raw: Any) -> ScanOptions:

include_history = raw.get("include_history", True)
exclude = raw.get("exclude", [])
enable_noise_filter = raw.get("enable_noise_filter", True)

if not isinstance(include_history, bool):
raise ManifestError(
f"scan.include_history must be a boolean, got {include_history!r}"
)
if not isinstance(enable_noise_filter, bool):
raise ManifestError(
"scan.enable_noise_filter must be a boolean, "
f"got {enable_noise_filter!r}"
)
if not isinstance(exclude, list):
raise ManifestError(
f"scan.exclude must be a list, got {type(exclude).__name__}"
Expand All @@ -126,7 +132,11 @@ def _parse_scan(raw: Any) -> ScanOptions:
f"scan.exclude[{i}] must be a string, got {pattern!r}"
)

return ScanOptions(include_history=include_history, exclude=list(exclude))
return ScanOptions(
include_history=include_history,
exclude=list(exclude),
enable_noise_filter=enable_noise_filter,
)


def load_manifest(path: str | Path) -> Manifest:
Expand Down
100 changes: 100 additions & 0 deletions tests/test_gitleaks_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
"""Unit tests for Gitleaks noise filter."""

from __future__ import annotations

from security_scanner.scanners.gitleaks.filter import (
calculate_entropy,
noise_reason,
should_filter_item,
)


# Synthetic fixtures with token-like shapes, used by the false-negative test.
# "AKIA" followed by 16 uppercase letters/digits.
FAKE_AWS_ACCESS_KEY_ID = "AKIAFAKEEXAMPLE00000"
# "ghp_" followed by 36 ASCII alphanumerics.
FAKE_GITHUB_TOKEN = "ghp_FAKEtoken123456789012345678901234567"


def test_template_placeholders():
    """Each supported placeholder syntax is filtered as noise."""
    samples = (
        "${VAR}",
        "{{secret}}",
        "<VAR>",
        "[VAR]",
        "%VAR%",
        "__VAR__",
        "${SOME_ENV_VARIABLE}",
        "{{database_password}}",
    )
    for sample in samples:
        assert should_filter_item({"Secret": sample}) is True


def test_known_dummy_values():
    """Known dummy values are filtered regardless of letter case."""
    samples = (
        "your_api_key",
        "YOUR_API_KEY",
        "CHANGEME",
        "changeme",
        "insert-token-here",
        "Insert-Token-Here",
    )
    for sample in samples:
        assert should_filter_item({"Secret": sample}) is True


def test_repeated_characters():
    """Secrets made of a single repeated character are filtered."""
    samples = ("xxxxxx", "aaaaaa", "11111", "ZZZZZZZZ")
    for sample in samples:
        assert should_filter_item({"Secret": sample}) is True


def test_low_entropy_and_short_strings():
    """Length and entropy thresholds decide whether a secret is filtered."""
    expectations = [
        # Empty, or at most 5 characters: always filtered.
        ("", True),
        ("abcd", True),
        ("12345", True),
        # 7 characters, entropy about 1.56 (< 1.8): filtered.
        ("1231231", True),
        # 7 characters, entropy about 2.81 (>= 1.8): kept.
        ("abcdefg", False),
        # 10 characters, entropy 1.0 (< 2.5): filtered.
        ("aaaaabbbbb", True),
        # 10 distinct characters, entropy about 3.32 (>= 2.5): kept.
        ("abcdefghi0", False),
    ]
    for secret, expected in expectations:
        assert should_filter_item({"Secret": secret}) is expected


def test_false_negatives_prevention():
    """Token-shaped secrets are kept even when a noise rule would match them."""
    # AWS access key shape: "AKIA" followed by 16 uppercase letters/digits.
    assert should_filter_item({"Secret": FAKE_AWS_ACCESS_KEY_ID}) is False
    # GitHub token shape: "ghp_" followed by at least 36 alphanumerics.
    assert should_filter_item({"Secret": FAKE_GITHUB_TOKEN}) is False

    # The fixtures above have enough entropy to pass every noise rule on their
    # own, so they do not prove the protection rule runs. These token-shaped
    # values fall below the 2.5 entropy threshold and are kept only because
    # the protection pattern is checked first.
    assert should_filter_item({"Secret": "AKIA" + "A" * 16}) is False
    assert should_filter_item({"Secret": "ghp_" + "a" * 36}) is False

    # A near miss (one character short of the AWS shape) gets no protection
    # and is filtered by the low-entropy rule.
    assert should_filter_item({"Secret": "AKIA" + "A" * 15}) is True


def test_noise_reason_does_not_include_secret_value():
    """The reported reason is a fixed label that never echoes the secret."""
    placeholder = "${DATABASE_PASSWORD}"
    label = noise_reason({"Secret": placeholder})

    assert label == "template-placeholder"
    assert placeholder not in label


def test_noise_reason_handles_non_string_secret_values():
    """Non-string 'Secret' values are reported as empty and filtered."""
    non_strings = (None, 123, True, [], {})
    for value in non_strings:
        finding = {"Secret": value}
        assert noise_reason(finding) == "empty-secret"
        assert should_filter_item(finding) is True


def test_calculate_entropy():
    """Entropy is 0 for empty and single-character input, 1 bit for 'ab'."""
    for text in ("", "a"):
        assert calculate_entropy(text) == 0.0

    two_symbol_entropy = calculate_entropy("ab")
    assert abs(two_symbol_entropy - 1.0) < 1e-9
Loading
Loading