source-security-dev · pureliture · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026
diff --git a/docs/views/research-and-technical-decisions.md b/docs/views/research-and-technical-decisions.md
@@ -33,3 +33,14 @@
 ## 공개 문서에 남길 수 있는 범위
 
 공개 문서는 tool role과 decision rationale만 설명합니다. 비공개 benchmark data, 민감한 alert data, internal repository context, private provider endpoint는 제외합니다.
+
+## 노이즈 필터 위치 결정
+
+| 필터 위치 후보 | 장점 | 단점 | 선택 여부 |
+| --- | --- | --- | --- |
+| **Gitleaks Parser 단** | - 불필요한 노이즈가 조기에 필터링되어 스토리지 저장 및 검증 비용 절감<br>- 스키마 매핑 전 파싱 단계에서 간결하게 걸러낼 수 있음 | - 필터링된 노이즈 항목이 저장소에 남지 않아, 원본 Gitleaks 리포트에 어떤 노이즈가 있었는지는 debug 로그로만 확인 가능 | **선택** |
+| **NoSQL Storage 저장 단** | - 원본 파싱 데이터는 유지할 수 있고, 저장 시점에 선택적 필터링 가능 | - 저장소 로직이 무거워지고 불필요한 파싱 데이터 객체 생성이 발생함 | 미선택 |
+| **LLM Verifier 단** | - LLM의 컨텍스트를 활용한 고차원 필터링 가능 | - 노이즈 항목에도 API 호출이 발생하여 비용과 latency가 증가함 | 미선택 |
+
+### 선택 이유
+Gitleaks 결과를 파싱하는 단계에서 노이즈를 걸러내면 불필요한 NoSQL Storage 쓰기와 LLM Verifier 검증 비용을 최소화하고 스캐닝 파이프라인의 효율성을 높일 수 있으므로, Gitleaks Parser 단을 필터 위치로 선택했습니다.
diff --git a/src/security_scanner/core/scan/options.py b/src/security_scanner/core/scan/options.py
@@ -24,7 +24,13 @@ class ScanOptions:
         Glob patterns for paths to exclude from the scan.
         NOTE(phase2): These are accepted but not yet applied to gitleaks
         command args — future milestone will map them to gitleaks allowlist flags.
+    enable_noise_filter:
+        When True (default), parser-level Gitleaks noise filtering removes
+        low-signal candidates before storage and optional verifier steps.
+        When False, all Gitleaks report items that map successfully are passed
+        through, which may increase false positives and output volume.
     """
 
     include_history: bool = True
     exclude: list[str] = field(default_factory=list)
+    enable_noise_filter: bool = True
diff --git a/src/security_scanner/scanners/gitleaks/filter.py b/src/security_scanner/scanners/gitleaks/filter.py
@@ -0,0 +1,94 @@
+"""Noise filter implementation for Gitleaks findings."""
+
+from __future__ import annotations
+
+import math
+import re
+from collections import Counter
+
+# Anchored template placeholder patterns: the secret must consist of exactly one placeholder.
+TEMPLATE_PATTERN = re.compile(
+    r"^("
+    r"\$\{[a-zA-Z0-9_-]+\}"  # ${VAR}
+    r"|\{\{[a-zA-Z0-9_-]+\}\}"  # {{secret}}
+    r"|<[a-zA-Z0-9_-]+>"  # <VAR>
+    r"|\[[a-zA-Z0-9_-]+\]"  # [VAR]
+    r"|%[a-zA-Z0-9_-]+%"  # %VAR%
+    r"|__[a-zA-Z0-9_-]+__"  # __VAR__
+    r")$"
+)
+
+# Known dummy values, stored lowercase; noise_reason lowercases the secret before lookup.
+KNOWN_DUMMY_VALUES = {
+    "your_api_key",
+    "changeme",
+    "insert-token-here",
+}
+
+# Never-filter token shapes: "AKIA" + 16 chars of [A-Z0-9], or "ghp_" + 36 or more alphanumerics.
+FALSE_NEGATIVE_PATTERN = re.compile(r"^(AKIA[A-Z0-9]{16}|ghp_[a-zA-Z0-9]{36,})$")
+
+
+def calculate_entropy(s: str) -> float:
+    """Return the Shannon entropy of ``s`` in bits per character (0.0 when empty)."""
+    if not s:
+        return 0.0
+    total_len = len(s)
+    counts = Counter(s)  # occurrences of each distinct character
+    entropy = 0.0
+    for count in counts.values():
+        p = count / total_len  # relative frequency of this character
+        entropy -= p * math.log2(p)  # accumulates -sum(p * log2(p))
+    return entropy
+
+
+def noise_reason(item: dict) -> str | None:
+    """Return the first matching noise reason for a Gitleaks item, or None to keep it.
+
+    Parameters
+    ----------
+    item : dict
+        A single Gitleaks JSON finding item; only its 'Secret' value is inspected.
+
+    Returns
+    -------
+    str | None
+        A fixed reason label (never the secret itself) when the item is noise.
+        None when the item should not be filtered.
+    """
+    secret = item.get("Secret", "")
+    if not isinstance(secret, str) or not secret:  # missing, non-string, or empty
+        return "empty-secret"
+
+    # 1. False-negative prevention: known token shapes pass before any noise rule runs.
+    if FALSE_NEGATIVE_PATTERN.match(secret):
+        return None
+
+    # 2. Template placeholders: the secret is a single placeholder token.
+    if TEMPLATE_PATTERN.match(secret):
+        return "template-placeholder"
+
+    # 3. Known dummy values (case-insensitive)
+    if secret.lower() in KNOWN_DUMMY_VALUES:
+        return "known-dummy-value"
+
+    # 4. Repeated characters: the secret contains only one distinct character.
+    if len(secret) >= 1 and len(set(secret)) == 1:
+        return "repeated-character"
+
+    # 5. Short strings (5 chars or fewer), then length-dependent entropy floors.
+    if len(secret) <= 5:
+        return "short-secret"
+
+    entropy = calculate_entropy(secret)
+    if len(secret) < 10 and entropy < 1.8:  # 6-9 chars
+        return "low-entropy-short-secret"
+    if len(secret) >= 10 and entropy < 2.5:  # 10 or more chars
+        return "low-entropy-secret"
+
+    return None
+
+
+def should_filter_item(item: dict) -> bool:
+    """Return True when ``noise_reason`` classifies the Gitleaks item as noise."""
+    return noise_reason(item) is not None
diff --git a/src/security_scanner/scanners/gitleaks/parser.py b/src/security_scanner/scanners/gitleaks/parser.py
@@ -7,6 +7,8 @@
 from pathlib import Path
 
 from security_scanner.core.finding.model import Finding
+from security_scanner.core.scan.options import ScanOptions
+from security_scanner.scanners.gitleaks.filter import noise_reason
 from security_scanner.scanners.gitleaks.mapper import map_gitleaks_item
 
 
@@ -25,6 +27,7 @@ def parse_gitleaks_report(
     rule_pack_version: str,
     source_root: Path | None = None,
     source_tool: str = "gitleaks",
+    scan_options: ScanOptions | None = None,
 ) -> list[Finding]:
     """Parse a Gitleaks JSON report string into core Finding objects."""
     if not raw_json or not raw_json.strip():
@@ -47,10 +50,24 @@ def parse_gitleaks_report(
         return []
 
     findings: list[Finding] = []
+    enable_noise_filter = scan_options.enable_noise_filter if scan_options is not None else True
+
     for index, item in enumerate(data):
         if not isinstance(item, dict):
             logger.warning("GitleaksParser: skipping non-dict item at index %d", index)
             continue
+
+        reason = noise_reason(item) if enable_noise_filter else None
+        if reason is not None:
+            logger.debug(
+                "GitleaksParser: filtering out noise item at index %d "
+                "for rule %s: %s",
+                index,
+                item.get("RuleID", "<unknown>"),
+                reason,
+            )
+            continue
+
         finding = map_gitleaks_item(
             item,
             repo_full_name=repo_full_name,

diff --git a/src/security_scanner/scanners/gitleaks/scanner.py b/src/security_scanner/scanners/gitleaks/scanner.py
@@ -114,4 +114,5 @@ def scan(
             rule_pack_version=rule_pack_version,
             source_root=root,
             source_tool=self.name,
+            scan_options=scan_options,
         )
diff --git a/src/security_scanner/targets/manifest.py b/src/security_scanner/targets/manifest.py
@@ -111,11 +111,17 @@ def _parse_scan(raw: Any) -> ScanOptions:
 
     include_history = raw.get("include_history", True)
     exclude = raw.get("exclude", [])
+    enable_noise_filter = raw.get("enable_noise_filter", True)
 
     if not isinstance(include_history, bool):
         raise ManifestError(
             f"scan.include_history must be a boolean, got {include_history!r}"
         )
+    if not isinstance(enable_noise_filter, bool):
+        raise ManifestError(
+            "scan.enable_noise_filter must be a boolean, "
+            f"got {enable_noise_filter!r}"
+        )
     if not isinstance(exclude, list):
         raise ManifestError(
             f"scan.exclude must be a list, got {type(exclude).__name__}"
@@ -126,7 +132,11 @@ def _parse_scan(raw: Any) -> ScanOptions:
                 f"scan.exclude[{i}] must be a string, got {pattern!r}"
             )
 
-    return ScanOptions(include_history=include_history, exclude=list(exclude))
+    return ScanOptions(
+        include_history=include_history,
+        exclude=list(exclude),
+        enable_noise_filter=enable_noise_filter,
+    )
 
 
 def load_manifest(path: str | Path) -> Manifest:

diff --git a/tests/test_gitleaks_filter.py b/tests/test_gitleaks_filter.py
@@ -0,0 +1,100 @@
+"""Unit tests for Gitleaks noise filter."""
+
+from __future__ import annotations
+
+from security_scanner.scanners.gitleaks.filter import (
+    calculate_entropy,
+    noise_reason,
+    should_filter_item,
+)
+
+
+FAKE_AWS_ACCESS_KEY_ID = "AKIAFAKEEXAMPLE00000"
+FAKE_GITHUB_TOKEN = "ghp_FAKEtoken123456789012345678901234567"
+
+
+def test_template_placeholders():  # every supported placeholder syntax is filtered
+    placeholders = [
+        "${VAR}",
+        "{{secret}}",
+        "<VAR>",
+        "[VAR]",
+        "%VAR%",
+        "__VAR__",
+        "${SOME_ENV_VARIABLE}",
+        "{{database_password}}",
+    ]
+    for ph in placeholders:
+        assert should_filter_item({"Secret": ph}) is True
+
+
+def test_known_dummy_values():  # dummy values are filtered regardless of letter case
+    dummies = [
+        "your_api_key",
+        "YOUR_API_KEY",
+        "CHANGEME",
+        "changeme",
+        "insert-token-here",
+        "Insert-Token-Here",
+    ]
+    for dummy in dummies:
+        assert should_filter_item({"Secret": dummy}) is True
+
+
+def test_repeated_characters():  # secrets made of one distinct character are filtered
+    repeated = [
+        "xxxxxx",
+        "aaaaaa",
+        "11111",  # also <= 5 chars; the repeated-character rule runs first
+        "ZZZZZZZZ",
+    ]
+    for rep in repeated:
+        assert should_filter_item({"Secret": rep}) is True
+
+
+def test_low_entropy_and_short_strings():
+    # Empty or length <= 5: always filtered
+    assert should_filter_item({"Secret": ""}) is True  # reported as "empty-secret"
+    assert should_filter_item({"Secret": "abcd"}) is True
+    assert should_filter_item({"Secret": "12345"}) is True
+
+    # Length < 10 and entropy < 1.8: filtered
+    # "1231231" has length 7, entropy is about 1.557 < 1.8
+    assert should_filter_item({"Secret": "1231231"}) is True
+
+    # Length < 10 and entropy >= 1.8: NOT filtered
+    # "abcdefg" has length 7, entropy is log2(7), about 2.807 >= 1.8
+    assert should_filter_item({"Secret": "abcdefg"}) is False
+
+    # Length >= 10 and entropy < 2.5: filtered ("aaaaabbbbb" has entropy 1.0)
+    assert should_filter_item({"Secret": "aaaaabbbbb"}) is True
+
+    # Length >= 10 and entropy >= 2.5: NOT filtered (10 distinct chars, about 3.32)
+    assert should_filter_item({"Secret": "abcdefghi0"}) is False
+
+
+def test_false_negatives_prevention():
+    # AWS access key ID shape ("AKIA" + 16 chars, 20 total) must pass the filter
+    assert should_filter_item({"Secret": FAKE_AWS_ACCESS_KEY_ID}) is False
+    # GitHub token shape ("ghp_" + 36 chars, 40 total) must pass the filter
+    assert should_filter_item({"Secret": FAKE_GITHUB_TOKEN}) is False
+
+
+def test_noise_reason_does_not_include_secret_value():
+    secret = "${DATABASE_PASSWORD}"
+    reason = noise_reason({"Secret": secret})
+
+    assert reason == "template-placeholder"
+    assert secret not in reason  # the reason label must not echo the secret
+
+
+def test_noise_reason_handles_non_string_secret_values():
+    for secret in (None, 123, True, [], {}):  # none of these is a non-empty str
+        assert noise_reason({"Secret": secret}) == "empty-secret"
+        assert should_filter_item({"Secret": secret}) is True
+
+
+def test_calculate_entropy():
+    assert calculate_entropy("") == 0.0  # empty input short-circuits to 0.0
+    assert calculate_entropy("a") == 0.0  # one distinct character: p = 1, log2(1) = 0
+    assert abs(calculate_entropy("ab") - 1.0) < 1e-9  # two equally frequent characters: 1 bit