From b835632c485aee68717ef19447f9e468604feff1 Mon Sep 17 00:00:00 2001 From: pureliture Date: Fri, 12 Jun 2026 15:02:15 +0900 Subject: [PATCH 1/2] feat(scanner): implement false-positive noise filter Resolves #3. Co-Authored-By: Codex GPT-5 --- .../views/research-and-technical-decisions.md | 11 ++ src/security_scanner/core/scan/options.py | 6 + .../scanners/gitleaks/filter.py | 97 +++++++++++++++ .../scanners/gitleaks/parser.py | 17 +++ .../scanners/gitleaks/scanner.py | 1 + src/security_scanner/targets/manifest.py | 12 +- tests/test_gitleaks_filter.py | 94 +++++++++++++++ tests/test_gitleaks_filter_stress.py | 110 ++++++++++++++++++ tests/test_gitleaks_parser.py | 76 ++++++++++++ tests/test_gitleaks_scanner.py | 19 +++ tests/test_manifest.py | 35 ++++++ 11 files changed, 477 insertions(+), 1 deletion(-) create mode 100644 src/security_scanner/scanners/gitleaks/filter.py create mode 100644 tests/test_gitleaks_filter.py create mode 100644 tests/test_gitleaks_filter_stress.py diff --git a/docs/views/research-and-technical-decisions.md b/docs/views/research-and-technical-decisions.md index 37091ba..cbab32c 100644 --- a/docs/views/research-and-technical-decisions.md +++ b/docs/views/research-and-technical-decisions.md @@ -33,3 +33,14 @@ ## 공개 문서에 남길 수 있는 범위 공개 문서는 tool role과 decision rationale만 설명합니다. 비공개 benchmark data, 민감한 alert data, internal repository context, private provider endpoint는 제외합니다. + +## 노이즈 필터 위치 결정 + +| 필터 위치 후보 | 장점 | 단점 | 선택 여부 | +| --- | --- | --- | --- | +| **Gitleaks Parser 단** | - 불필요한 노이즈가 조기에 필터링되어 스토리지 저장 및 검증 비용 절감
- 파싱 단계에서 간결하게 스키마 맵핑 전 걸러낼 수 있음 | - 원본 Gitleaks 레포트에 어떤 노이즈가 포함되었는지 기록이 남지 않음 (로그로만 남음) | **선택** | +| **NoSQL Storage 저장 단** | - 원본 파싱 데이터는 유지할 수 있고, 저장 시점에 선택적 필터링 가능 | - 저장소 로직이 무거워지고 불필요한 파싱 데이터 객체 생성이 발생함 | 미선택 | +| **LLM Verifier 단** | - LLM의 컨텍스트를 활용한 고차원 필터링 가능 | - API 호출 비용 및 latency 증가, 비용 낭비 발생 | 미선택 | + +### 선택 이유 +Gitleaks 결과 파싱 단계에서 노이즈를 걸러냄으로써 불필요한 NoSQL Storage DB 쓰기 및 LLM Verifier 검증 비용을 최소화하고, 스캐닝 파이프라인의 효율성을 극대화하기 위해 선택했습니다. diff --git a/src/security_scanner/core/scan/options.py b/src/security_scanner/core/scan/options.py index 7e27478..85e6ab0 100644 --- a/src/security_scanner/core/scan/options.py +++ b/src/security_scanner/core/scan/options.py @@ -24,7 +24,13 @@ class ScanOptions: Glob patterns for paths to exclude from the scan. NOTE(phase2): These are accepted but not yet applied to gitleaks command args — future milestone will map them to gitleaks allowlist flags. + enable_noise_filter: + When True (default), parser-level Gitleaks noise filtering removes + low-signal candidates before storage and optional verifier steps. + When False, all Gitleaks report items that map successfully are passed + through, which may increase false positives and output volume. """ include_history: bool = True exclude: list[str] = field(default_factory=list) + enable_noise_filter: bool = True diff --git a/src/security_scanner/scanners/gitleaks/filter.py b/src/security_scanner/scanners/gitleaks/filter.py new file mode 100644 index 0000000..38489d2 --- /dev/null +++ b/src/security_scanner/scanners/gitleaks/filter.py @@ -0,0 +1,97 @@ +"""Noise filter implementation for Gitleaks findings.""" + +from __future__ import annotations + +import math +import re +from collections import Counter + +# Template placeholder patterns (e.g., ${VAR}, {{secret}}, , [VAR], %VAR%, __VAR__) +TEMPLATE_PATTERNS = [ + re.compile(r'^\$\{[a-zA-Z0-9_-]+\}$'), + re.compile(r'^\{\{[a-zA-Z0-9_-]+\}\}$'), + re.compile(r'^<[a-zA-Z0-9_-]+>$'), + re.compile(r'^\[[a-zA-Z0-9_-]+\]$'), + re.compile(r'^%[a-zA-Z0-9_-]+%$'), + re.compile(r'^__[a-zA-Z0-9_-]+__$'), +] + +# Known dummy values (case-insensitive) +KNOWN_DUMMY_VALUES = { + "your_api_key", + "changeme", + "insert-token-here", +} + +# False-negative prevention patterns (synthetic AWS/GitHub token shapes). +FALSE_NEGATIVE_PATTERNS = [ + re.compile(r'^AKIA[A-Z0-9]{16}$'), + re.compile(r'^ghp_[a-zA-Z0-9]{36,}$'), +] + + +def calculate_entropy(s: str) -> float: + """Calculate the Shannon Entropy of a string.""" + if not s: + return 0.0 + total_len = len(s) + counts = Counter(s) + entropy = 0.0 + for count in counts.values(): + p = count / total_len + entropy -= p * math.log2(p) + return entropy + + +def noise_reason(item: dict) -> str | None: + """Return the noise reason for a Gitleaks item, or None when it should pass. + + Parameters + ---------- + item : dict + A single Gitleaks JSON finding item (containing 'Secret', 'Match', etc.) + + Returns + ------- + str | None + A non-sensitive reason string when the item is classified as noise. + None when the item should not be filtered. + """ + secret = item.get("Secret", "") + if not secret: + return "empty-secret" + + # 1. False-Negative Prevention + for pattern in FALSE_NEGATIVE_PATTERNS: + if pattern.match(secret): + return None + + # 2. Template placeholders + for pattern in TEMPLATE_PATTERNS: + if pattern.match(secret): + return "template-placeholder" + + # 3. Known dummy values (case-insensitive) + if secret.lower() in KNOWN_DUMMY_VALUES: + return "known-dummy-value" + + # 4. Repeated characters + if len(secret) >= 1 and len(set(secret)) == 1: + return "repeated-character" + + # 5. Low entropy & short strings + if len(secret) <= 5: + return "short-secret" + + entropy = calculate_entropy(secret) + if len(secret) < 10 and entropy < 1.8: + return "low-entropy-short-secret" + if len(secret) >= 10 and entropy < 2.5: + return "low-entropy-secret" + + return None + + +def should_filter_item(item: dict) -> bool: + """Determine if a Gitleaks finding item should be filtered out as noise.""" + return noise_reason(item) is not None diff --git a/src/security_scanner/scanners/gitleaks/parser.py b/src/security_scanner/scanners/gitleaks/parser.py index bd8caaa..87f9586 100644 --- a/src/security_scanner/scanners/gitleaks/parser.py +++ b/src/security_scanner/scanners/gitleaks/parser.py @@ -7,6 +7,8 @@ from pathlib import Path from security_scanner.core.finding.model import Finding +from security_scanner.core.scan.options import ScanOptions +from security_scanner.scanners.gitleaks.filter import noise_reason from security_scanner.scanners.gitleaks.mapper import map_gitleaks_item @@ -25,6 +27,7 @@ def parse_gitleaks_report( rule_pack_version: str, source_root: Path | None = None, source_tool: str = "gitleaks", + scan_options: ScanOptions | None = None, ) -> list[Finding]: """Parse a Gitleaks JSON report string into core Finding objects.""" if not raw_json or not raw_json.strip(): @@ -47,10 +50,24 @@ def parse_gitleaks_report( return [] findings: list[Finding] = [] + enable_noise_filter = scan_options.enable_noise_filter if scan_options is not None else True + for index, item in enumerate(data): if not isinstance(item, dict): logger.warning("GitleaksParser: skipping non-dict item at index %d", index) continue + + reason = noise_reason(item) if enable_noise_filter else None + if reason is not None: + logger.debug( + "GitleaksParser: filtering out noise item at index %d " + "for rule %s: %s", + index, + item.get("RuleID", ""), + reason, + ) + continue + finding = map_gitleaks_item( item, repo_full_name=repo_full_name, diff --git a/src/security_scanner/scanners/gitleaks/scanner.py b/src/security_scanner/scanners/gitleaks/scanner.py index 2e958e5..91e5cdd 100644 --- a/src/security_scanner/scanners/gitleaks/scanner.py +++ b/src/security_scanner/scanners/gitleaks/scanner.py @@ -114,4 +114,5 @@ def scan( rule_pack_version=rule_pack_version, source_root=root, source_tool=self.name, + scan_options=scan_options, ) diff --git a/src/security_scanner/targets/manifest.py b/src/security_scanner/targets/manifest.py index 7e4fad6..0090068 100644 --- a/src/security_scanner/targets/manifest.py +++ b/src/security_scanner/targets/manifest.py @@ -111,11 +111,17 @@ def _parse_scan(raw: Any) -> ScanOptions: include_history = raw.get("include_history", True) exclude = raw.get("exclude", []) + enable_noise_filter = raw.get("enable_noise_filter", True) if not isinstance(include_history, bool): raise ManifestError( f"scan.include_history must be a boolean, got {include_history!r}" ) + if not isinstance(enable_noise_filter, bool): + raise ManifestError( + "scan.enable_noise_filter must be a boolean, " + f"got {enable_noise_filter!r}" + ) if not isinstance(exclude, list): raise ManifestError( f"scan.exclude must be a list, got {type(exclude).__name__}" @@ -126,7 +132,11 @@ def _parse_scan(raw: Any) -> ScanOptions: f"scan.exclude[{i}] must be a string, got {pattern!r}" ) - return ScanOptions(include_history=include_history, exclude=list(exclude)) + return ScanOptions( + include_history=include_history, + exclude=list(exclude), + enable_noise_filter=enable_noise_filter, + ) def load_manifest(path: str | Path) -> Manifest: diff --git a/tests/test_gitleaks_filter.py b/tests/test_gitleaks_filter.py new file mode 100644 index 0000000..96fe0bb --- /dev/null +++ b/tests/test_gitleaks_filter.py @@ -0,0 +1,94 @@ +"""Unit tests for Gitleaks noise filter.""" + +from __future__ import annotations + +from security_scanner.scanners.gitleaks.filter import ( + calculate_entropy, + noise_reason, + should_filter_item, +) + + +FAKE_AWS_ACCESS_KEY_ID = "AKIAFAKEEXAMPLE00000" +FAKE_GITHUB_TOKEN = "ghp_FAKEtoken123456789012345678901234567" + + +def test_template_placeholders(): + placeholders = [ + "${VAR}", + "{{secret}}", + "", + "[VAR]", + "%VAR%", + "__VAR__", + "${SOME_ENV_VARIABLE}", + "{{database_password}}", + ] + for ph in placeholders: + assert should_filter_item({"Secret": ph}) is True + + +def test_known_dummy_values(): + dummies = [ + "your_api_key", + "YOUR_API_KEY", + "CHANGEME", + "changeme", + "insert-token-here", + "Insert-Token-Here", + ] + for dummy in dummies: + assert should_filter_item({"Secret": dummy}) is True + + +def test_repeated_characters(): + repeated = [ + "xxxxxx", + "aaaaaa", + "11111", + "ZZZZZZZZ", + ] + for rep in repeated: + assert should_filter_item({"Secret": rep}) is True + + +def test_low_entropy_and_short_strings(): + # Length <= 5: always filtered + assert should_filter_item({"Secret": ""}) is True + assert should_filter_item({"Secret": "abcd"}) is True + assert should_filter_item({"Secret": "12345"}) is True + + # Length < 10 and entropy < 1.8: filtered + # "1231231" has length 7, entropy is 1.556 < 1.8 + assert should_filter_item({"Secret": "1231231"}) is True + + # Length < 10 and entropy >= 1.8: NOT filtered + # "abcdefg" has length 7, entropy is 2.807 >= 1.8 + assert should_filter_item({"Secret": "abcdefg"}) is False + + # Length >= 10 but low entropy: filtered + assert should_filter_item({"Secret": "aaaaabbbbb"}) is True + + # Length >= 10 with enough entropy: NOT filtered + assert should_filter_item({"Secret": "abcdefghi0"}) is False + + +def test_false_negatives_prevention(): + # AWS Access Key format (typically 20 chars, starting with AKIA) + assert should_filter_item({"Secret": FAKE_AWS_ACCESS_KEY_ID}) is False + # GitHub Token format (typically 40 chars, starting with ghp_) + assert should_filter_item({"Secret": FAKE_GITHUB_TOKEN}) is False + + +def test_noise_reason_does_not_include_secret_value(): + secret = "${DATABASE_PASSWORD}" + reason = noise_reason({"Secret": secret}) + + assert reason == "template-placeholder" + assert secret not in reason + + +def test_calculate_entropy(): + assert calculate_entropy("") == 0.0 + assert calculate_entropy("a") == 0.0 + assert abs(calculate_entropy("ab") - 1.0) < 1e-9 diff --git a/tests/test_gitleaks_filter_stress.py b/tests/test_gitleaks_filter_stress.py new file mode 100644 index 0000000..5c22208 --- /dev/null +++ b/tests/test_gitleaks_filter_stress.py @@ -0,0 +1,110 @@ +"""Stress and boundary tests for Gitleaks noise filter and parser integration.""" + +from __future__ import annotations + +import json + +from security_scanner.core.scan.options import ScanOptions +from security_scanner.scanners.gitleaks.filter import ( + calculate_entropy, + should_filter_item, +) +from security_scanner.scanners.gitleaks.parser import parse_gitleaks_report + + +def test_entropy_boundary_cases(): + # 1. Length <= 5: Always filtered regardless of entropy + # "abcde" has entropy ~2.321 but length 5 + assert should_filter_item({"Secret": "abcde"}) is True + + # 2. Length < 10, Entropy < 1.8: Should be filtered + # "aaabbb" -> length 6, entropy 1.0 < 1.8 + assert calculate_entropy("aaabbb") == 1.0 + assert should_filter_item({"Secret": "aaabbb"}) is True + + # "aabbcc" -> length 6, entropy ~1.585 < 1.8 + assert abs(calculate_entropy("aabbcc") - 1.5849625) < 1e-5 + assert should_filter_item({"Secret": "aabbcc"}) is True + + # 3. Length < 10, Entropy >= 1.8: Should NOT be filtered + # "aabbcd" -> length 6, entropy ~1.918 >= 1.8 + assert calculate_entropy("aabbcd") > 1.8 + assert should_filter_item({"Secret": "aabbcd"}) is False + + # 4. Length >= 10: low entropy is still noise + # "aaaaabbbbb" -> length 10, entropy 1.0 < 2.5 + assert calculate_entropy("aaaaabbbbb") == 1.0 + assert should_filter_item({"Secret": "aaaaabbbbb"}) is True + + +def test_extremely_long_strings_and_special_chars(): + # 1. Extremely long string (100,000 characters of 'a') -> Repeated chars condition -> filtered + long_repeated = "a" * 100000 + assert should_filter_item({"Secret": long_repeated}) is True + + # 2. Extremely long string (10,000 characters) with high entropy -> NOT filtered + # Creating a long pattern with high entropy (abcdefg...) + long_high_entropy = "".join(chr(97 + (i % 26)) for i in range(10000)) + assert should_filter_item({"Secret": long_high_entropy}) is False + + # 3. Special characters strings + # "!@#$%" -> length 5 -> filtered + assert should_filter_item({"Secret": "!@#$%"}) is True + # "!@#$%^&*" -> length 8, entropy 3.0 >= 1.8 -> NOT filtered + assert calculate_entropy("!@#$%^&*") == 3.0 + assert should_filter_item({"Secret": "!@#$%^&*"}) is False + + +def test_false_negatives_prevention_robustness(): + # 1. Public-safe AWS access key shape, but with low entropy after prefix. + # It matches the false negative regex, so it must not be filtered. + assert should_filter_item({"Secret": "AKIAAAAAAAAAAAAAAAAA"}) is False + + # 2. AWS-looking prefix, but too short for access key ID shape. + assert should_filter_item({"Secret": "AKIA1"}) is True + + # 3. GitHub-looking prefix, but too short for token shape. + assert should_filter_item({"Secret": "ghp_1"}) is True + + # 4. Full public-safe shapes pass through. + assert should_filter_item({"Secret": "AKIAFAKEEXAMPLE00000"}) is False + assert should_filter_item({"Secret": "ghp_FAKE000012345678901234567890123456"}) is False + + +def test_parser_integration_noise_filter_disabled(): + report_data = [ + # Template placeholder + {"RuleID": "r1", "File": "f1", "StartLine": 1, "Secret": "${DUMMY}"}, + # Known dummy value + {"RuleID": "r2", "File": "f1", "StartLine": 2, "Secret": "changeme"}, + # Repeated character + {"RuleID": "r3", "File": "f1", "StartLine": 3, "Secret": "zzzzzz"}, + # Low entropy < 1.8, len < 10 + {"RuleID": "r4", "File": "f1", "StartLine": 4, "Secret": "aaabbb"}, + # Short string <= 5 + {"RuleID": "r5", "File": "f1", "StartLine": 5, "Secret": "123"}, + ] + raw_json = json.dumps(report_data) + + # With enable_noise_filter = False, all 5 must be parsed as findings + findings = parse_gitleaks_report( + raw_json, + repo_full_name="org/repo", + scan_run_id="run1", + rule_pack_version="1.0", + scan_options=ScanOptions(enable_noise_filter=False), + ) + + assert len(findings) == 5 + parsed_secrets = {f.gitleaks.secret for f in findings} + assert parsed_secrets == {"${DUMMY}", "changeme", "zzzzzz", "aaabbb", "123"} + + # With enable_noise_filter = True, all 5 must be filtered out + findings_filtered = parse_gitleaks_report( + raw_json, + repo_full_name="org/repo", + scan_run_id="run1", + rule_pack_version="1.0", + scan_options=ScanOptions(enable_noise_filter=True), + ) + assert len(findings_filtered) == 0 diff --git a/tests/test_gitleaks_parser.py b/tests/test_gitleaks_parser.py index b8752bc..be62a00 100644 --- a/tests/test_gitleaks_parser.py +++ b/tests/test_gitleaks_parser.py @@ -3,9 +3,11 @@ from __future__ import annotations import json +import logging import pytest +from security_scanner.core.scan.options import ScanOptions from security_scanner.scanners.gitleaks.parser import ( GitleaksParseError, parse_gitleaks_report, @@ -81,3 +83,77 @@ def test_parse_gitleaks_report_ignores_non_array_json(): ) == [] ) + + +def test_parse_gitleaks_report_with_noise_filter_enabled_and_disabled(): + report = json.dumps( + [ + { + "RuleID": "aws-key", + "File": "config/settings.py", + "StartLine": 7, + "Secret": "AKIAFAKEEXAMPLE00000", + }, + { + "RuleID": "dummy-token", + "File": "config/settings.py", + "StartLine": 8, + "Secret": "CHANGEME", + }, + { + "RuleID": "placeholder-token", + "File": "config/settings.py", + "StartLine": 9, + "Secret": "${DB_PASSWORD}", + }, + ] + ) + + # 1. When enable_noise_filter is True + findings_filtered = parse_gitleaks_report( + report, + repo_full_name=REPO_FULL_NAME, + scan_run_id=SCAN_RUN_ID, + rule_pack_version=RULE_PACK, + scan_options=ScanOptions(enable_noise_filter=True), + ) + assert len(findings_filtered) == 1 + assert findings_filtered[0].gitleaks.secret == "AKIAFAKEEXAMPLE00000" + + # 2. When enable_noise_filter is False + findings_unfiltered = parse_gitleaks_report( + report, + repo_full_name=REPO_FULL_NAME, + scan_run_id=SCAN_RUN_ID, + rule_pack_version=RULE_PACK, + scan_options=ScanOptions(enable_noise_filter=False), + ) + assert len(findings_unfiltered) == 3 + + +def test_parse_gitleaks_report_noise_filter_debug_log_redacts_secret(caplog): + secret = "${DB_PASSWORD}" + report = json.dumps( + [ + { + "RuleID": "placeholder-token", + "File": "config/settings.py", + "StartLine": 9, + "Secret": secret, + }, + ] + ) + + with caplog.at_level(logging.DEBUG, logger="security_scanner.scanners.gitleaks.parser"): + findings = parse_gitleaks_report( + report, + repo_full_name=REPO_FULL_NAME, + scan_run_id=SCAN_RUN_ID, + rule_pack_version=RULE_PACK, + scan_options=ScanOptions(enable_noise_filter=True), + ) + + assert findings == [] + assert "template-placeholder" in caplog.text + assert "placeholder-token" in caplog.text + assert secret not in caplog.text diff --git a/tests/test_gitleaks_scanner.py b/tests/test_gitleaks_scanner.py index ba2b94b..790af95 100644 --- a/tests/test_gitleaks_scanner.py +++ b/tests/test_gitleaks_scanner.py @@ -139,6 +139,25 @@ def test_scan_forwards_root_and_options_to_runner(self): assert runner.calls == [(FAKE_ROOT, opts)] + def test_scan_forwards_options_to_parser_noise_filter(self): + report = json.dumps([ + { + "RuleID": "dummy-token", + "File": "config/settings.py", + "StartLine": 1, + "Secret": "CHANGEME", + }, + ]) + scanner = GitleaksScanner(runner=FakeRunner(report)) + + assert _scan( + scanner, + scan_options=ScanOptions(enable_noise_filter=True), + ) == [] + assert len( + _scan(scanner, scan_options=ScanOptions(enable_noise_filter=False)) + ) == 1 + def test_scan_forwards_scan_context_into_findings(self): """repo_full_name, scan_run_id, and rule_pack_version reach Finding.""" scanner = GitleaksScanner(runner=FakeRunner(FAKE_REPORT_TWO_FINDINGS)) diff --git a/tests/test_manifest.py b/tests/test_manifest.py index 6b02f5b..6249359 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -49,6 +49,7 @@ def write_yaml(tmp_path: Path, content: str) -> Path: scan: include_history: true + enable_noise_filter: true exclude: - "**/node_modules/**" - "**/.venv/**" @@ -77,6 +78,7 @@ def test_load_valid_manifest(tmp_path: Path) -> None: assert isinstance(m.scan, ScanOptions) assert m.scan.include_history is True + assert m.scan.enable_noise_filter is True assert "**/node_modules/**" in m.scan.exclude # Empty gitleaks_config string is normalised to None @@ -138,9 +140,27 @@ def test_scan_defaults_when_section_absent(tmp_path: Path) -> None: m = load_manifest(p) assert m.scan.include_history is True + assert m.scan.enable_noise_filter is True assert m.scan.exclude == [] +def test_scan_enable_noise_filter_can_be_disabled(tmp_path: Path) -> None: + yaml_content = """\ + version: 1 + targets: + - name: demo-noise-filter-off + path: /tmp/demo-noise-filter-off + scan: + include_history: false + enable_noise_filter: false + """ + p = write_yaml(tmp_path, yaml_content) + m = load_manifest(p) + + assert m.scan.include_history is False + assert m.scan.enable_noise_filter is False + + def test_gitleaks_config_non_empty(tmp_path: Path) -> None: """A non-empty gitleaks_config string is preserved.""" yaml_content = """\ @@ -281,6 +301,21 @@ def test_non_string_exclude_item_raises(tmp_path: Path) -> None: load_manifest(p) +def test_scan_enable_noise_filter_wrong_type_raises(tmp_path: Path) -> None: + """scan.enable_noise_filter must be a boolean when present.""" + yaml_content = """\ + version: 1 + targets: + - name: demo-bad-noise-filter + path: /tmp/demo-bad-noise-filter + scan: + enable_noise_filter: "false" + """ + p = write_yaml(tmp_path, yaml_content) + with pytest.raises(ManifestError, match="enable_noise_filter"): + load_manifest(p) + + # --------------------------------------------------------------------------- # Guard: the committed example file must load without error # --------------------------------------------------------------------------- From 101fe751e6ec372c45418d206d984c0a6a38bc9e Mon Sep 17 00:00:00 2001 From: pureliture Date: Fri, 12 Jun 2026 15:22:04 +0900 Subject: [PATCH 2/2] fix(scanner): address noise filter review comments Resolve PR review feedback for the Gitleaks noise filter by combining repeated regex checks into single compiled patterns and handling non-string Secret values defensively. Co-Authored-By: Codex GPT-5 --- .../scanners/gitleaks/filter.py | 35 +++++++++---------- tests/test_gitleaks_filter.py | 6 ++++ 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/src/security_scanner/scanners/gitleaks/filter.py b/src/security_scanner/scanners/gitleaks/filter.py index 38489d2..9e35e7f 100644 --- a/src/security_scanner/scanners/gitleaks/filter.py +++ b/src/security_scanner/scanners/gitleaks/filter.py @@ -7,14 +7,16 @@ from collections import Counter # Template placeholder patterns (e.g., ${VAR}, {{secret}}, , [VAR], %VAR%, __VAR__) -TEMPLATE_PATTERNS = [ - re.compile(r'^\$\{[a-zA-Z0-9_-]+\}$'), - re.compile(r'^\{\{[a-zA-Z0-9_-]+\}\}$'), - re.compile(r'^<[a-zA-Z0-9_-]+>$'), - re.compile(r'^\[[a-zA-Z0-9_-]+\]$'), - re.compile(r'^%[a-zA-Z0-9_-]+%$'), - re.compile(r'^__[a-zA-Z0-9_-]+__$'), -] +TEMPLATE_PATTERN = re.compile( + r"^(" + r"\$\{[a-zA-Z0-9_-]+\}" + r"|\{\{[a-zA-Z0-9_-]+\}\}" + r"|<[a-zA-Z0-9_-]+>" + r"|\[[a-zA-Z0-9_-]+\]" + r"|%[a-zA-Z0-9_-]+%" + r"|__[a-zA-Z0-9_-]+__" + r")$" +) # Known dummy values (case-insensitive) KNOWN_DUMMY_VALUES = { @@ -24,10 +26,7 @@ } # False-negative prevention patterns (synthetic AWS/GitHub token shapes). -FALSE_NEGATIVE_PATTERNS = [ - re.compile(r'^AKIA[A-Z0-9]{16}$'), - re.compile(r'^ghp_[a-zA-Z0-9]{36,}$'), -] +FALSE_NEGATIVE_PATTERN = re.compile(r"^(AKIA[A-Z0-9]{16}|ghp_[a-zA-Z0-9]{36,})$") def calculate_entropy(s: str) -> float: @@ -58,18 +57,16 @@ def noise_reason(item: dict) -> str | None: None when the item should not be filtered. """ secret = item.get("Secret", "") - if not secret: + if not isinstance(secret, str) or not secret: return "empty-secret" # 1. False-Negative Prevention - for pattern in FALSE_NEGATIVE_PATTERNS: - if pattern.match(secret): - return None + if FALSE_NEGATIVE_PATTERN.match(secret): + return None # 2. Template placeholders - for pattern in TEMPLATE_PATTERNS: - if pattern.match(secret): - return "template-placeholder" + if TEMPLATE_PATTERN.match(secret): + return "template-placeholder" # 3. Known dummy values (case-insensitive) if secret.lower() in KNOWN_DUMMY_VALUES: diff --git a/tests/test_gitleaks_filter.py b/tests/test_gitleaks_filter.py index 96fe0bb..e68873e 100644 --- a/tests/test_gitleaks_filter.py +++ b/tests/test_gitleaks_filter.py @@ -88,6 +88,12 @@ def test_noise_reason_does_not_include_secret_value(): assert secret not in reason +def test_noise_reason_handles_non_string_secret_values(): + for secret in (None, 123, True, [], {}): + assert noise_reason({"Secret": secret}) == "empty-secret" + assert should_filter_item({"Secret": secret}) is True + + def test_calculate_entropy(): assert calculate_entropy("") == 0.0 assert calculate_entropy("a") == 0.0