"""
Comprehensive pytest test suite for GreySec PHI Scanner.

Covers detection of SSN, NPI, MRN, Email, Phone, DOB, ZIP+4, IP, License/ID,
Account, URL; file extension filtering; severity assignment; JSON output schema;
and HTML report generation.
"""
|
||
|
|
from __future__ import annotations
|
||
|
|
|
||
|
|
import json
|
||
|
|
import sys
|
||
|
|
import tempfile
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
import pytest
|
||
|
|
|
||
|
|
# Ensure src is on path
|
||
|
|
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
|
||
|
|
|
||
|
|
from greysec_phi_scanner.scanner import (
|
||
|
|
PHIScanner,
|
||
|
|
PHI_PATTERNS,
|
||
|
|
SCANNABLE_EXTENSIONS,
|
||
|
|
Finding,
|
||
|
|
ScanResult,
|
||
|
|
Severity,
|
||
|
|
VERSION,
|
||
|
|
)
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
# Helpers: create temp files with given content/extension and scan them
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
def _make_file(tmp_path: Path, name: str, content: str, suffix: str) -> Path:
    """Write ``content`` to ``tmp_path / f"{name}{suffix}"`` and return that path.

    Args:
        tmp_path: Directory the fixture file is created in.
        name: File name without extension.
        content: Text written to the file.
        suffix: Extension including the leading dot (e.g. ``".txt"``).

    Returns:
        Path of the file that was written.
    """
    target = tmp_path / f"{name}{suffix}"
    # Several fixtures contain non-ASCII text (e.g. the em-dash SSN cases), so
    # the encoding is pinned instead of inheriting the platform locale default.
    # NOTE(review): assumes the scanner reads files as UTF-8 -- confirm.
    target.write_text(content, encoding="utf-8")
    return target
|
||
|
|
|
||
|
|
|
||
|
|
def _scan(tmp_path: Path, *files: tuple[str, str, str]) -> ScanResult:
    """Materialise fixture files under ``tmp_path`` and scan that directory.

    Each entry of ``files`` is a ``(name, content, suffix)`` triple that is
    forwarded to ``_make_file``. The scan runs with progress output disabled
    and its result is returned unchanged.
    """
    for spec in files:
        stem, text, extension = spec
        _make_file(tmp_path, stem, text, extension)
    return PHIScanner(str(tmp_path)).scan(show_progress=False)
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
# 1. SSN DETECTION
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
class TestSSNDetection:
    """SSN with hyphens, em-dashes, spaces, or no separators; false positives."""

    def test_ssn_hyphen_format(self, tmp_path: Path):
        """Plain xxx-xx-xxxx format is detected."""
        result = _scan(tmp_path, ("ssn_hyphen", "Patient SSN: 123-45-6789\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "ssn" in keys, f"Expected 'ssn' in {keys}"

    def test_ssn_em_dash_format(self, tmp_path: Path):
        """SSN with em-dash (—) is detected."""
        result = _scan(tmp_path, ("ssn_emdash", "Patient SSN: 123—45—6789\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "ssn" in keys, f"Expected 'ssn' in {keys}"

    def test_ssn_space_separated(self, tmp_path: Path):
        """SSN with spaces is detected."""
        result = _scan(tmp_path, ("ssn_space", "Patient SSN: 123 45 6789\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "ssn" in keys, f"Expected 'ssn' in {keys}"

    def test_ssn_no_dashes(self, tmp_path: Path):
        """9-digit unbroken SSN is detected via ssn_no_dashes pattern."""
        result = _scan(tmp_path, ("ssn_nodash", "SSN=123456789\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "ssn_no_dashes" in keys, f"Expected 'ssn_no_dashes' in {keys}"

    def test_ssn_false_positive_short_number(self, tmp_path: Path):
        """Numbers < 9 digits are not flagged as SSN."""
        result = _scan(tmp_path, ("ssn_short", "ID: 12345\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "ssn_no_dashes" not in keys, "Should not flag short numbers"

    def test_ssn_false_positive_price(self, tmp_path: Path):
        """A 9-digit number in price context is not flagged under any ssn* key."""
        result = _scan(tmp_path, ("ssn_price", "Price: $123456789\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        # The substring check rejects every key containing "ssn", i.e. both
        # "ssn" and "ssn_no_dashes"; the '$' prefix must suppress them all.
        assert not any("ssn" in k for k in keys), f"Unexpected SSN flag: {keys}"

    def test_ssn_multiple_on_same_line(self, tmp_path: Path):
        """Multiple SSNs on one line are each found."""
        result = _scan(
            tmp_path,
            ("ssn_multi", "SSN1: 111-22-3333 | SSN2: 444-55-6666\n", ".txt"),
        )
        # Exactly one 'ssn' finding is expected per SSN on the line.
        ssn_findings = [ff for ff in result.findings if ff.pattern_key == "ssn"]
        assert len(ssn_findings) == 2, f"Expected 2 SSN findings, got {len(ssn_findings)}: {ssn_findings}"
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
# 2. NPI DETECTION
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
class TestNPIDetection:
    """NPI (National Provider Identifier) — 10-digit, starts with 1 or 2.

    These tests only assert how such numbers interact with the 'phone'
    pattern key; they do not assert that a dedicated NPI finding is produced.
    """

    def test_npi_starts_with_1(self, tmp_path: Path):
        """NPI starting with 1 is NOT flagged as phone."""
        result = _scan(tmp_path, ("npi_1", "NPI: 1234567890\n", ".txt"))
        phone_findings = [ff for ff in result.findings if ff.pattern_key == "phone"]
        assert len(phone_findings) == 0, f"NPI should not be flagged as phone: {phone_findings}"

    def test_npi_starts_with_2(self, tmp_path: Path):
        """NPI starting with 2 is NOT flagged as phone."""
        result = _scan(tmp_path, ("npi_2", "NPI: 2234567890\n", ".txt"))
        phone_findings = [ff for ff in result.findings if ff.pattern_key == "phone"]
        assert len(phone_findings) == 0, f"NPI should not be flagged as phone: {phone_findings}"

    def test_phone_number_not_npi(self, tmp_path: Path):
        """Regular phone starting with 3 is flagged as phone."""
        result = _scan(tmp_path, ("phone_reg", "Phone: 312-555-1234\n", ".txt"))
        phone_findings = [ff for ff in result.findings if ff.pattern_key == "phone"]
        assert len(phone_findings) == 1, f"Expected 1 phone finding, got {len(phone_findings)}"

    def test_npi_with_label(self, tmp_path: Path):
        """NPI labelled 'NPI' is not flagged as phone."""
        result = _scan(tmp_path, ("npi_label", "Provider NPI: 1073721827\n", ".txt"))
        phone_findings = [ff for ff in result.findings if ff.pattern_key == "phone"]
        assert len(phone_findings) == 0
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
# 3. MRN DETECTION
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
class TestMRNDetection:
    """Medical Record Number detection via labelled patterns."""

    def test_mrn_label(self, tmp_path: Path):
        """'MRN: <digits>' is detected."""
        result = _scan(tmp_path, ("mrn_colon", "MRN: 77441\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "mrn" in keys, f"Expected 'mrn' in {keys}"

    def test_mrn_hyphen(self, tmp_path: Path):
        """MRN-xxxxx format is detected."""
        result = _scan(tmp_path, ("mrn_hyphen", "MRN-33018\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "mrn" in keys, f"Expected 'mrn' in {keys}"

    def test_mrn_medical_record(self, tmp_path: Path):
        """'Medical Record # <digits>' is detected."""
        result = _scan(tmp_path, ("mrn_medical", "Medical Record # 98765\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "mrn" in keys, f"Expected 'mrn' in {keys}"

    def test_mrn_patient_id(self, tmp_path: Path):
        """'Patient ID: <digits>' is detected."""
        result = _scan(tmp_path, ("mrn_patientid", "Patient ID: 55661\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "mrn" in keys, f"Expected 'mrn' in {keys}"

    def test_mrn_patid(self, tmp_path: Path):
        """'PATID<digits>' with no separator is detected."""
        result = _scan(tmp_path, ("mrn_patid", "PATID77882\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "mrn" in keys, f"Expected 'mrn' in {keys}"

    def test_mrn_too_short(self, tmp_path: Path):
        """MRN with fewer than 5 digits is not flagged."""
        result = _scan(tmp_path, ("mrn_short", "MRN: 1234\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "mrn" not in keys, "Should not flag MRN with < 5 digits"
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
# 4. EMAIL DETECTION
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
class TestEmailDetection:
    """Standard email detection across file types, plus a path-embedded case."""

    def test_email_simple(self, tmp_path: Path):
        """Standard email is detected."""
        result = _scan(tmp_path, ("email_simple", "Contact: john.doe@example.com\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "email" in keys, f"Expected 'email' in {keys}"

    def test_email_in_csv(self, tmp_path: Path):
        """Email in CSV is detected."""
        result = _scan(tmp_path, ("email_csv", "name,email\nAlice,alice@test.org\n", ".csv"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "email" in keys, f"Expected 'email' in {keys}"

    def test_email_in_json(self, tmp_path: Path):
        """Email in JSON is detected."""
        result = _scan(tmp_path, ("email_json", '{"email": "bob@healthcare.net"}\n', ".json"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "email" in keys, f"Expected 'email' in {keys}"

    def test_email_false_positive_filename(self, tmp_path: Path):
        """Email-like strings inside file paths are still caught (scanner context).

        Despite the test name, this pins that the match IS reported: the
        assertion requires an 'email' finding for the path-embedded address.
        """
        result = _scan(tmp_path, ("email_path", "File: /home/user@example.com/data\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "email" in keys, f"Expected 'email' in {keys}"
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
# 5. PHONE DETECTION
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
class TestPhoneDetection:
    """US phone number formats; false positives like policy numbers."""

    def test_phone_dashed(self, tmp_path: Path):
        """xxx-xxx-xxxx format is detected."""
        result = _scan(tmp_path, ("phone_dash", "Phone: 555-123-4567\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "phone" in keys, f"Expected 'phone' in {keys}"

    def test_phone_parentheses(self, tmp_path: Path):
        """(xxx) xxx-xxxx format is detected."""
        result = _scan(tmp_path, ("phone_parens", "Phone: (312) 555-9876\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "phone" in keys, f"Expected 'phone' in {keys}"

    def test_phone_dot_separated(self, tmp_path: Path):
        """xxx.xxx.xxxx format is detected."""
        result = _scan(tmp_path, ("phone_dot", "Phone: 555.123.4567\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "phone" in keys, f"Expected 'phone' in {keys}"

    def test_phone_with_country_code(self, tmp_path: Path):
        """+1-xxx-xxx-xxxx format is detected."""
        result = _scan(tmp_path, ("phone_intl", "Phone: +1-312-555-1234\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "phone" in keys, f"Expected 'phone' in {keys}"

    def test_phone_false_positive_policy_number(self, tmp_path: Path):
        """Smoke test: scanning an alpha-prefixed policy number completes.

        The original intent was to document how the scanner's phone filter
        treats values such as AET-772-441-0091, but no expectation was ever
        pinned. The test therefore only proves that the scan runs and that
        every finding exposes ``pattern_key``.
        """
        result = _scan(tmp_path, ("phone_policy", "Policy: AET-772-441-0091\n", ".txt"))
        # NOTE(review): no behavioural assertion -- decide whether 'phone' must
        # be absent for this input and assert it once scanner behaviour is
        # confirmed. Iterating still exercises the findings objects.
        for finding in result.findings:
            finding.pattern_key

    def test_phone_false_positive_repeating_digits(self, tmp_path: Path):
        """Numbers with highly repetitive digits are not flagged as phone."""
        result = _scan(tmp_path, ("phone_repeat", "ID: 555-555-5555\n", ".txt"))
        phone_findings = [ff for ff in result.findings if ff.pattern_key == "phone"]
        assert len(phone_findings) == 0, f"Repeating digit phone should not be flagged: {phone_findings}"

    def test_phone_false_positive_policy_json(self, tmp_path: Path):
        """Smoke test: scanning policy/account-like JSON values completes.

        Covers alpha-prefixed values (AET-, BCBS-, GRP-, MEM-). As with the
        plain-text policy test, no expected outcome was ever asserted, so
        this only proves the scan runs over the JSON fixture.
        """
        content = json.dumps({
            "policy": "AET-772-441-0091",
            "plan": "BCBS-991-773-442",
            "group_id": "GRP-112233445",
            "member_id": "MEM-998877665",
        })
        result = _scan(tmp_path, ("phone_policy_json", content + "\n", ".json"))
        # NOTE(review): no behavioural assertion -- pin the expected number of
        # 'phone' findings once the scanner's filtering rules are confirmed.
        for finding in result.findings:
            finding.pattern_key
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
# 6. DOB DETECTION
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
class TestDOBDetection:
    """Date-of-birth patterns."""

    def test_dob_label_mdy(self, tmp_path: Path):
        """'Date of Birth: MM/DD/YYYY' is detected by the dob pattern."""
        # The fixture uses the 'Date of Birth' label rather than the 'DOB'
        # abbreviation. NOTE(review): an earlier comment here claimed the
        # pattern needs a leading space before 'DOB'
        # (r'\b( DOB|Date of Birth|...)') -- confirm against the scanner.
        result = _scan(tmp_path, ("dob_mdy", "Date of Birth: 12/25/1985\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "dob" in keys, f"Expected 'dob' in {keys}"

    def test_dob_label_ymd(self, tmp_path: Path):
        """'Date of Birth: YYYY-MM-DD' is detected."""
        result = _scan(tmp_path, ("dob_ymd", "Date of Birth: 1985-12-25\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "dob" in keys, f"Expected 'dob' in {keys}"

    def test_dob_simple(self, tmp_path: Path):
        """Simple MM-DD-YYYY date is caught by dob_simple."""
        result = _scan(tmp_path, ("dob_simple", "Statement date: 01-15-2024\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "dob_simple" in keys, f"Expected 'dob_simple' in {keys}"

    def test_dob_birthdate_label(self, tmp_path: Path):
        """Birthdate label is detected."""
        result = _scan(tmp_path, ("dob_birth", "Birthdate: 05/30/1990\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "dob" in keys, f"Expected 'dob' in {keys}"
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
# 7. ZIP+4 DETECTION
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
class TestZIPDetection:
    """ZIP+4 postal code detection."""

    def test_zip4_hyphen(self, tmp_path: Path):
        """ZIP+4 format xxxxx-xxxx is detected."""
        result = _scan(tmp_path, ("zip4", "ZIP: 53202-1234\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "zip" in keys, f"Expected 'zip' in {keys}"

    def test_zip4_space(self, tmp_path: Path):
        """ZIP+4 with space separator is detected."""
        result = _scan(tmp_path, ("zip4_space", "ZIP: 53202 1234\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "zip" in keys, f"Expected 'zip' in {keys}"

    def test_zip5_only(self, tmp_path: Path):
        """5-digit ZIP is not flagged by zip pattern."""
        result = _scan(tmp_path, ("zip5", "ZIP: 53202\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "zip" not in keys, "5-digit ZIP should not trigger zip pattern"
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
# 8. IP ADDRESS DETECTION
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
class TestIPDetection:
    """IPv4 address detection."""

    def test_ip_private(self, tmp_path: Path):
        """Private IP 192.168.1.1 is detected."""
        result = _scan(tmp_path, ("ip_priv", "Server: 192.168.1.1\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "ip" in keys, f"Expected 'ip' in {keys}"

    def test_ip_public(self, tmp_path: Path):
        """Public IP 8.8.8.8 is detected."""
        result = _scan(tmp_path, ("ip_pub", "DNS: 8.8.8.8\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "ip" in keys, f"Expected 'ip' in {keys}"

    def test_ip_loopback(self, tmp_path: Path):
        """Loopback 127.0.0.1 is detected."""
        result = _scan(tmp_path, ("ip_loop", "Localhost: 127.0.0.1\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "ip" in keys, f"Expected 'ip' in {keys}"
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
# 9. DRIVER'S LICENSE / ACCOUNT / URL DETECTION
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
class TestLicenseAccountURL:
    """Driver's license, account numbers, and URLs."""

    def test_license(self, tmp_path: Path):
        """Driver's License format with keyword is detected."""
        # Pattern: r"\b(Driver'?s?\s*License|License\s*#|State\s*ID|DL#|SSN\s*#)[\s:#\-=]*[A-Z]{1,2}\d{5,}\b"
        result = _scan(tmp_path, ("license", "Driver's License: A1234567\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "license" in keys, f"Expected 'license' in {keys}"

    def test_license_drivers_keyword(self, tmp_path: Path):
        """'Drivers License' (no apostrophe) keyword is detected."""
        # The pattern quoted in test_license makes the apostrophe optional
        # (Driver'?s?), so this fixture drops it. Previously this test repeated
        # the apostrophe form and duplicated test_license exactly.
        result = _scan(tmp_path, ("license2", "Drivers License: A1234567\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "license" in keys, f"Expected 'license' in {keys}"

    def test_account_number(self, tmp_path: Path):
        """Account number pattern is detected."""
        result = _scan(tmp_path, ("account", "Account: 12345678901\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "account" in keys, f"Expected 'account' in {keys}"

    def test_account_acct(self, tmp_path: Path):
        """Acct abbreviation is detected."""
        result = _scan(tmp_path, ("account2", "Acct# 99887766\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "account" in keys, f"Expected 'account' in {keys}"

    def test_url(self, tmp_path: Path):
        """HTTP/HTTPS URLs are detected."""
        result = _scan(tmp_path, ("url", "Visit https://patient-portal.example.com\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "url" in keys, f"Expected 'url' in {keys}"
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
# 10. EM-DASH HANDLING
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
class TestEmDashHandling:
    """SSN with em-dash vs hyphen."""

    def test_emdash_vs_hyphen_both_detected(self, tmp_path: Path):
        """A line mixing a hyphen SSN and an em-dash SSN yields an 'ssn' finding.

        NOTE(review): despite the test name, only the presence of at least one
        'ssn' finding is asserted, not that both separators matched. Tighten
        to a count once it is confirmed whether identical digits are deduped.
        """
        content = "SSN hyphen: 123-45-6789 | SSN em-dash: 123—45—6789\n"
        result = _scan(tmp_path, ("emdash", content, ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "ssn" in keys, f"Expected 'ssn' in {keys}"

    def test_emdash_only(self, tmp_path: Path):
        """SSN with only em-dash is detected."""
        result = _scan(tmp_path, ("emdash_only", "Patient SSN: 999—88—7777\n", ".txt"))
        keys = [finding.pattern_key for finding in result.findings]
        assert "ssn" in keys, f"Expected 'ssn' in {keys}"
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
# 11. JSON FIELD-NAME AWARENESS
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
class TestJSONFieldAwareness:
|
||
|
|
"""Policy/plan/group_id/member_id values in JSON should not be flagged."""
|
||
|
|
|
||
|
|
def test_policy_field_not_phone(self, tmp_path: Path):
|
||
|
|
"""Policy/account-like values in JSON with excluded field names should not be flagged as phone."""
|
||
|
|
content = json.dumps({"policy": "UNH-772-441-0091", "group_id": "G-123-456-789"})
|
||
|
|
f = _make_file(tmp_path, "json_policy", content + "\n", ".json")
|
||
|
|
scanner = PHIScanner(str(tmp_path))
|
||
|
|
result = scanner.scan(show_progress=False)
|
||
|
|
phone_findings = [ff for ff in result.findings if ff.pattern_key == "phone"]
|
||
|
|
assert len(phone_findings) == 0, f"Policy field values should not be flagged as phone: {phone_findings}"
|
||
|
|
|
||
|
|
def test_plan_field_not_phone(self, tmp_path: Path):
|
||
|
|
"""JSON field named 'plan' is not a phone."""
|
||
|
|
content = json.dumps({"plan": "BCBS-992-771-442"})
|
||
|
|
f = _make_file(tmp_path, "json_plan", content + "\n", ".json")
|
||
|
|
scanner = PHIScanner(str(tmp_path))
|
||
|
|
result = scanner.scan(show_progress=False)
|
||
|
|
phone_findings = [ff for ff in result.findings if ff.pattern_key == "phone"]
|
||
|
|
assert len(phone_findings) == 0
|
||
|
|
|
||
|
|
def test_group_id_field_not_phone(self, tmp_path: Path):
|
||
|
|
"""JSON field named 'group_id' is not a phone."""
|
||
|
|
content = json.dumps({"group_id": "GRP-112233445"})
|
||
|
|
f = _make_file(tmp_path, "json_group", content + "\n", ".json")
|
||
|
|
scanner = PHIScanner(str(tmp_path))
|
||
|
|
result = scanner.scan(show_progress=False)
|
||
|
|
phone_findings = [ff for ff in result.findings if ff.pattern_key == "phone"]
|
||
|
|
assert len(phone_findings) == 0
|
||
|
|
|
||
|
|
def test_member_id_field_not_phone(self, tmp_path: Path):
|
||
|
|
"""JSON field named 'member_id' is not a phone."""
|
||
|
|
content = json.dumps({"member_id": "MEM-998877665"})
|
||
|
|
f = _make_file(tmp_path, "json_member", content + "\n", ".json")
|
||
|
|
scanner = PHIScanner(str(tmp_path))
|
||
|
|
result = scanner.scan(show_progress=False)
|
||
|
|
phone_findings = [ff for ff in result.findings if ff.pattern_key == "phone"]
|
||
|
|
assert len(phone_findings) == 0
|
||
|
|
|
||
|
|
def test_json_real_phone_not_filtered(self, tmp_path: Path):
|
||
|
|
"""Real phone in JSON is still detected even alongside policy fields."""
|
||
|
|
content = json.dumps({
|
||
|
|
"policy": "AET-772-441-0091",
|
||
|
|
"patient_phone": "312-555-1234"
|
||
|
|
})
|
||
|
|
f = _make_file(tmp_path, "json_real_phone", content + "\n", ".json")
|
||
|
|
scanner = PHIScanner(str(tmp_path))
|
||
|
|
result = scanner.scan(show_progress=False)
|
||
|
|
phone_findings = [ff for ff in result.findings if ff.pattern_key == "phone"]
|
||
|
|
assert len(phone_findings) >= 1, "Real phone should still be detected"
|
||
|
|
|
||
|
|
def test_policy_id_field_not_phone(self, tmp_path: Path):
|
||
|
|
"""JSON field named 'policy_id' is not a phone."""
|
||
|
|
content = json.dumps({"policy_id": "POL-987654321"})
|
||
|
|
f = _make_file(tmp_path, "json_policy_id", content + "\n", ".json")
|
||
|
|
scanner = PHIScanner(str(tmp_path))
|
||
|
|
result = scanner.scan(show_progress=False)
|
||
|
|
phone_findings = [ff for ff in result.findings if ff.pattern_key == "phone"]
|
||
|
|
assert len(phone_findings) == 0, f"policy_id field should not be flagged: {phone_findings}"
|
||
|
|
|
||
|
|
def test_plan_id_field_not_phone(self, tmp_path: Path):
|
||
|
|
"""JSON field named 'plan_id' is not a phone."""
|
||
|
|
content = json.dumps({"plan_id": "PLAN-123456789"})
|
||
|
|
f = _make_file(tmp_path, "json_plan_id", content + "\n", ".json")
|
||
|
|
scanner = PHIScanner(str(tmp_path))
|
||
|
|
result = scanner.scan(show_progress=False)
|
||
|
|
phone_findings = [ff for ff in result.findings if ff.pattern_key == "phone"]
|
||
|
|
assert len(phone_findings) == 0, f"plan_id field should not be flagged: {phone_findings}"
|
||
|
|
|
||
|
|
def test_subscriber_field_not_phone(self, tmp_path: Path):
|
||
|
|
"""JSON field named 'subscriber' is not a phone."""
|
||
|
|
content = json.dumps({"subscriber": "SUB-112233445566"})
|
||
|
|
f = _make_file(tmp_path, "json_subscriber", content + "\n", ".json")
|
||
|
|
scanner = PHIScanner(str(tmp_path))
|
||
|
|
result = scanner.scan(show_progress=False)
|
||
|
|
phone_findings = [ff for ff in result.findings if ff.pattern_key == "phone"]
|
||
|
|
assert len(phone_findings) == 0, f"subscriber field should not be flagged: {phone_findings}"
|
||
|
|
|
||
|
|
def test_insurance_field_not_phone(self, tmp_path: Path):
|
||
|
|
"""JSON field named 'insurance' is not a phone."""
|
||
|
|
content = json.dumps({"insurance": "INS-7766554433"})
|
||
|
|
f = _make_file(tmp_path, "json_insurance", content + "\n", ".json")
|
||
|
|
scanner = PHIScanner(str(tmp_path))
|
||
|
|
result = scanner.scan(show_progress=False)
|
||
|
|
phone_findings = [ff for ff in result.findings if ff.pattern_key == "phone"]
|
||
|
|
assert len(phone_findings) == 0, f"insurance field should not be flagged: {phone_findings}"
|
||
|
|
|
||
|
|
def test_payer_field_not_phone(self, tmp_path: Path):
|
||
|
|
"""JSON field named 'payer' is not a phone."""
|
||
|
|
content = json.dumps({"payer": "PAYER-9988776655"})
|
||
|
|
f = _make_file(tmp_path, "json_payer", content + "\n", ".json")
|
||
|
|
scanner = PHIScanner(str(tmp_path))
|
||
|
|
result = scanner.scan(show_progress=False)
|
||
|
|
phone_findings = [ff for ff in result.findings if ff.pattern_key == "phone"]
|
||
|
|
assert len(phone_findings) == 0, f"payer field should not be flagged: {phone_findings}"
|
||
|
|
|
||
|
|
def test_claim_field_not_phone(self, tmp_path: Path):
|
||
|
|
"""JSON field named 'claim' is not a phone."""
|
||
|
|
content = json.dumps({"claim": "CLM-12345-6789"})
|
||
|
|
f = _make_file(tmp_path, "json_claim", content + "\n", ".json")
|
||
|
|
scanner = PHIScanner(str(tmp_path))
|
||
|
|
result = scanner.scan(show_progress=False)
|
||
|
|
phone_findings = [ff for ff in result.findings if ff.pattern_key == "phone"]
|
||
|
|
assert len(phone_findings) == 0, f"claim field should not be flagged: {phone_findings}"
|
||
|
|
|
||
|
|
def test_nested_insurance_policy_number(self, tmp_path: Path) -> None:
    """A value nested at ``insurance.policy_number`` must not be flagged as a phone.

    The digit groups in ``AET-772-441-0091`` resemble a phone number; the test
    expects the scan to report no finding with ``pattern_key == "phone"``.
    """
    payload = json.dumps({
        "insurance": {
            "policy_number": "AET-772-441-0091",
        },
    })
    # _scan creates the file and runs the scanner; the file path itself is not needed.
    result = _scan(tmp_path, ("json_nested_insurance", payload + "\n", ".json"))
    phone_findings = [ff for ff in result.findings if ff.pattern_key == "phone"]
    assert not phone_findings, f"Nested policy_number should be excluded: {phone_findings}"
|
||
|
|
|
||
|
|
def test_nested_group_id(self, tmp_path: Path) -> None:
    """A value nested at ``group.id`` must not be flagged as a phone.

    Writes ``{"group": {"id": "GRP-99887766"}}`` to a ``.json`` file and expects
    the scan to report no finding with ``pattern_key == "phone"``.
    """
    payload = json.dumps({
        "group": {
            "id": "GRP-99887766",
        },
    })
    # _scan creates the file and runs the scanner; the file path itself is not needed.
    result = _scan(tmp_path, ("json_nested_group", payload + "\n", ".json"))
    phone_findings = [ff for ff in result.findings if ff.pattern_key == "phone"]
    assert not phone_findings, f"Nested group.id should be excluded: {phone_findings}"
|
||
|
|
|
||
|
|
def test_alpha_mixed_value_excluded_regardless_of_field(self, tmp_path: Path) -> None:
    """A letter-prefixed value must not be flagged as a phone, whatever the key.

    The key ``some_field`` is deliberately generic, so the exclusion being
    tested has to come from the value ``XYZ-123-456-7890`` rather than from
    the field name.
    """
    payload = json.dumps({"some_field": "XYZ-123-456-7890"})
    # _scan creates the file and runs the scanner; the file path itself is not needed.
    result = _scan(tmp_path, ("json_alpha_mixed", payload + "\n", ".json"))
    phone_findings = [ff for ff in result.findings if ff.pattern_key == "phone"]
    assert not phone_findings, f"Alpha-mixed values should be excluded: {phone_findings}"
|
||
|
|
|
||
|
|
def test_uppercase_field_keywords_excluded(self, tmp_path: Path) -> None:
    """Upper-case keys (POLICY, PLAN, GROUP_ID) must not yield phone findings.

    Exercises the expectation that field-name exclusion does not depend on
    the case of the key.
    """
    payload = json.dumps({
        "POLICY": "POL-123456789",
        "PLAN": "BCBS-987654321",
        "GROUP_ID": "GRP-11223344",
    })
    # _scan creates the file and runs the scanner; the file path itself is not needed.
    result = _scan(tmp_path, ("json_uppercase", payload + "\n", ".json"))
    phone_findings = [ff for ff in result.findings if ff.pattern_key == "phone"]
    assert not phone_findings, f"Uppercase keywords should be excluded: {phone_findings}"
|
||
|
|
|
||
|
|
def test_memberid_underscore_variant(self, tmp_path: Path) -> None:
    """The key ``memberid`` (written without an underscore) must not yield a phone finding.

    Despite the test name, the key used here is the variant *without* the
    underscore; the value is ``MEM-554433221``.
    """
    payload = json.dumps({"memberid": "MEM-554433221"})
    # _scan creates the file and runs the scanner; the file path itself is not needed.
    result = _scan(tmp_path, ("json_memberid", payload + "\n", ".json"))
    phone_findings = [ff for ff in result.findings if ff.pattern_key == "phone"]
    assert not phone_findings, f"memberid field should be excluded: {phone_findings}"
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# 12. FILE EXTENSION FILTERING
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
class TestFileExtensionFiltering:
    """Check which file extensions the scanner picks up.

    Each test drops a single file (two for the case-insensitivity test) into
    ``tmp_path`` via the module-level ``_scan`` helper and inspects
    ``total_files_scanned`` (and, for some formats, ``total_findings``).
    """

    def test_scannable_txt(self, tmp_path: Path) -> None:
        """.txt files are scanned and the SSN inside is reported."""
        result = _scan(tmp_path, ("ext_txt", "SSN: 123-45-6789\n", ".txt"))
        assert result.total_files_scanned == 1
        assert result.total_findings >= 1

    def test_scannable_csv(self, tmp_path: Path) -> None:
        """.csv files are scanned and the SSN inside is reported."""
        result = _scan(tmp_path, ("ext_csv", "ssn\n123-45-6789\n", ".csv"))
        assert result.total_files_scanned == 1
        assert result.total_findings >= 1

    def test_scannable_json(self, tmp_path: Path) -> None:
        """.json files are scanned and the SSN inside is reported."""
        result = _scan(tmp_path, ("ext_json", '{"ssn": "123-45-6789"}\n', ".json"))
        assert result.total_files_scanned == 1
        assert result.total_findings >= 1

    def test_scannable_xml(self, tmp_path: Path) -> None:
        """.xml files are scanned and the SSN inside is reported."""
        result = _scan(tmp_path, ("ext_xml", "<ssn>123-45-6789</ssn>\n", ".xml"))
        assert result.total_files_scanned == 1
        assert result.total_findings >= 1

    # The remaining "scannable" tests only check that the file was counted;
    # they intentionally make no claim about findings for these formats.

    def test_scannable_sql(self, tmp_path: Path) -> None:
        """.sql files are scanned."""
        result = _scan(
            tmp_path,
            ("ext_sql", "-- SSN: 123-45-6789\nSELECT * FROM patients;\n", ".sql"),
        )
        assert result.total_files_scanned == 1

    def test_scannable_log(self, tmp_path: Path) -> None:
        """.log files are scanned."""
        result = _scan(tmp_path, ("ext_log", "User login SSN=123-45-6789\n", ".log"))
        assert result.total_files_scanned == 1

    def test_scannable_py(self, tmp_path: Path) -> None:
        """.py files are scanned."""
        result = _scan(tmp_path, ("ext_py", "# SSN: 123-45-6789\n", ".py"))
        assert result.total_files_scanned == 1

    def test_scannable_md(self, tmp_path: Path) -> None:
        """.md files are scanned."""
        result = _scan(tmp_path, ("ext_md", "## Patient SSN: 123-45-6789\n", ".md"))
        assert result.total_files_scanned == 1

    def test_scannable_yaml(self, tmp_path: Path) -> None:
        """.yaml files are scanned."""
        result = _scan(tmp_path, ("ext_yaml", "ssn: '123-45-6789'\n", ".yaml"))
        assert result.total_files_scanned == 1

    def test_scannable_yml(self, tmp_path: Path) -> None:
        """.yml files are scanned."""
        result = _scan(tmp_path, ("ext_yml", "ssn: '123-45-6789'\n", ".yml"))
        assert result.total_files_scanned == 1

    def test_unsupported_extension_skipped(self, tmp_path: Path) -> None:
        """.png files are not counted as scanned, even when they hold text."""
        result = _scan(tmp_path, ("ext_png", "SSN: 123-45-6789\n", ".png"))
        assert result.total_files_scanned == 0, "PNG should be skipped"

    def test_unsupported_extension_bin_skipped(self, tmp_path: Path) -> None:
        """.bin files are not counted as scanned."""
        result = _scan(tmp_path, ("ext_bin", "SSN: 123-45-6789\n", ".bin"))
        assert result.total_files_scanned == 0

    def test_extension_case_insensitive(self, tmp_path: Path) -> None:
        """Upper-case .TXT and .CSV suffixes are scanned like their lower-case forms."""
        result = _scan(
            tmp_path,
            ("test", "SSN: 123-45-6789\n", ".TXT"),
            ("data", "SSN,123-45-6789\n", ".CSV"),
        )
        assert result.total_files_scanned == 2
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# 13. SEVERITY ASSIGNMENT
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
class TestSeverityAssignment:
    """Severity expected for each pattern, as asserted by the tests below.

    SSN/MRN = HIGH; email/phone/account = MEDIUM; DOB/zip/ip/url = LOW.

    NOTE(review): an earlier version of this docstring listed DOB as MED and
    NPI as HIGH. The DOB test in this class asserts LOW, and NPI is not
    exercised here -- confirm both against PHI_PATTERNS.
    """

    def test_ssn_severity_high(self, tmp_path: Path) -> None:
        """An SSN finding is reported with Severity.HIGH."""
        result = _scan(tmp_path, ("sev_ssn", "SSN: 123-45-6789\n", ".txt"))
        ssn = [ff for ff in result.findings if ff.pattern_key == "ssn"]
        assert ssn
        assert ssn[0].severity == Severity.HIGH, "SSN should be HIGH severity"

    def test_mrn_severity_high(self, tmp_path: Path) -> None:
        """An MRN finding is reported with Severity.HIGH."""
        result = _scan(tmp_path, ("sev_mrn", "MRN: 77441\n", ".txt"))
        mrn = [ff for ff in result.findings if ff.pattern_key == "mrn"]
        assert mrn
        assert mrn[0].severity == Severity.HIGH, "MRN should be HIGH severity"

    def test_email_severity_medium(self, tmp_path: Path) -> None:
        """An email finding is reported with Severity.MEDIUM."""
        result = _scan(tmp_path, ("sev_email", "Email: test@example.com\n", ".txt"))
        email = [ff for ff in result.findings if ff.pattern_key == "email"]
        assert email
        assert email[0].severity == Severity.MEDIUM, "Email should be MEDIUM severity"

    def test_phone_severity_medium(self, tmp_path: Path) -> None:
        """A phone finding is reported with Severity.MEDIUM."""
        result = _scan(tmp_path, ("sev_phone", "Phone: 312-555-1234\n", ".txt"))
        phone = [ff for ff in result.findings if ff.pattern_key == "phone"]
        assert phone
        assert phone[0].severity == Severity.MEDIUM, "Phone should be MEDIUM severity"

    def test_dob_severity_low(self, tmp_path: Path) -> None:
        """A date-of-birth finding is reported with Severity.LOW."""
        result = _scan(tmp_path, ("sev_dob", "DOB: 12/25/1985\n", ".txt"))
        # Substring match: any pattern key containing "dob" counts as a DOB finding.
        dob = [ff for ff in result.findings if "dob" in ff.pattern_key]
        assert dob
        assert dob[0].severity == Severity.LOW, "DOB should be LOW severity"

    def test_zip_severity_low(self, tmp_path: Path) -> None:
        """A ZIP+4 finding is reported with Severity.LOW."""
        result = _scan(tmp_path, ("sev_zip", "ZIP: 53202-1234\n", ".txt"))
        zip_f = [ff for ff in result.findings if ff.pattern_key == "zip"]
        assert zip_f
        assert zip_f[0].severity == Severity.LOW, "ZIP+4 should be LOW severity"

    def test_ip_severity_low(self, tmp_path: Path) -> None:
        """An IP address finding is reported with Severity.LOW."""
        result = _scan(tmp_path, ("sev_ip", "Server: 192.168.1.1\n", ".txt"))
        ip = [ff for ff in result.findings if ff.pattern_key == "ip"]
        assert ip
        assert ip[0].severity == Severity.LOW, "IP should be LOW severity"

    def test_url_severity_low(self, tmp_path: Path) -> None:
        """A URL finding is reported with Severity.LOW."""
        result = _scan(tmp_path, ("sev_url", "URL: https://example.com\n", ".txt"))
        url = [ff for ff in result.findings if ff.pattern_key == "url"]
        assert url
        assert url[0].severity == Severity.LOW, "URL should be LOW severity"

    def test_account_severity_medium(self, tmp_path: Path) -> None:
        """An account-number finding is reported with Severity.MEDIUM."""
        result = _scan(tmp_path, ("sev_acct", "Account: 12345678901\n", ".txt"))
        acct = [ff for ff in result.findings if ff.pattern_key == "account"]
        assert acct
        assert acct[0].severity == Severity.MEDIUM, "Account should be MEDIUM severity"
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# 14. JSON OUTPUT FORMAT
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
class TestJSONOutputFormat:
    """Verify the dictionary returned by ``ScanResult.to_dict()``."""

    def test_json_output_schema(self, tmp_path: Path) -> None:
        """``to_dict()`` exposes every top-level key and a list of findings."""
        result = _scan(tmp_path, ("json_schema", "SSN: 123-45-6789\n", ".txt"))
        d = result.to_dict()

        expected_keys = (
            "scanner_version",
            "started_at",
            "completed_at",
            "root_path",
            "total_files_scanned",
            "files_with_findings",
            "total_findings",
        )
        # Report every absent key at once instead of stopping at the first.
        missing = [key for key in expected_keys if key not in d]
        assert not missing, f"to_dict() is missing keys: {missing}"
        assert isinstance(d["findings"], list)

    def test_json_finding_fields(self, tmp_path: Path) -> None:
        """Every serialized finding carries the full set of per-finding keys."""
        result = _scan(tmp_path, ("finding_fields", "MRN: 77441\n", ".txt"))
        d = result.to_dict()

        # Without this guard the loop below would pass vacuously if the scan
        # produced no findings at all.
        assert d["findings"], "Expected at least one finding to validate"

        expected_fields = (
            "file",
            "pattern_key",
            "label",
            "severity",
            "line_number",
            "line_content",
            "context",
            "offset",
        )
        for finding in d["findings"]:
            missing = [field for field in expected_fields if field not in finding]
            assert not missing, f"Finding is missing fields {missing}: {finding}"

    def test_json_report_includes_metadata(self, tmp_path: Path) -> None:
        """Version and timing metadata are populated, and the file is counted."""
        result = _scan(tmp_path, ("json_meta", "Email: test@example.com\n", ".txt"))
        d = result.to_dict()

        assert d["scanner_version"] == VERSION
        assert d["started_at"] != ""
        assert d["completed_at"] != ""
        assert d["total_files_scanned"] >= 1
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# 15. HTML REPORT GENERATION
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
class TestHTMLReportGeneration:
    """Exercise ``generate_html_report`` with hand-built scan result dicts."""

    def test_html_report_created(self, tmp_path: Path):
        """A report with one SSN finding is written, non-empty, and contains an <html tag."""
        from greysec_phi_scanner.reporting.html_report import generate_html_report

        ssn_finding = {
            "type": "SSN",
            "label": "SSN",
            "severity": 3,
            "text": "123-45-6789",
            "file": str(tmp_path / "patient.txt"),
            "line": 1,
            "offset": 5,
        }
        single_source = {
            "source": str(tmp_path),
            "source_type": "filesystem",
            "files_scanned": 1,
            "findings": [ssn_finding],
            "errors": [],
            "scan_duration_secs": 0.01,
        }
        report_file = tmp_path / "test_report.html"

        generate_html_report(
            scan_results=[single_source],
            output_path=report_file,
            client_name="Test Client",
            engagement="PHI Test Assessment",
        )

        assert report_file.exists(), "HTML report file should be created"
        html = report_file.read_text()
        assert len(html) > 0, "HTML report should not be empty"
        assert "<html" in html.lower(), "Output should be valid HTML"

    def test_html_report_contains_summary(self, tmp_path: Path):
        """The rendered report mentions HIGH, a medium label, and the client name."""
        from greysec_phi_scanner.reporting.html_report import generate_html_report

        two_findings = [
            {"type": "SSN", "severity": 3, "text": "123-45-6789", "file": "p.txt", "line": 1, "offset": 0},
            {"type": "Email", "severity": 2, "text": "a@b.com", "file": "p.txt", "line": 2, "offset": 0},
        ]
        single_source = {
            "source": str(tmp_path),
            "source_type": "filesystem",
            "files_scanned": 1,
            "findings": two_findings,
            "errors": [],
            "scan_duration_secs": 0.01,
        }
        report_file = tmp_path / "test_report2.html"

        generate_html_report(
            scan_results=[single_source],
            output_path=report_file,
            client_name="Acme Health",
            engagement="PHI Audit",
        )

        html = report_file.read_text()
        # Severity labels for both findings plus the client name must be rendered.
        assert "HIGH" in html, "Report should contain HIGH severity label"
        assert "MEDIUM" in html or "MED" in html, "Report should contain MEDIUM label"
        assert "Acme Health" in html, "Report should contain client name"

    def test_html_report_empty_findings(self, tmp_path: Path):
        """A source with no findings still produces a non-empty report file."""
        from greysec_phi_scanner.reporting.html_report import generate_html_report

        empty_source = {
            "source": str(tmp_path),
            "source_type": "filesystem",
            "files_scanned": 1,
            "findings": [],
            "errors": [],
            "scan_duration_secs": 0.01,
        }
        report_file = tmp_path / "test_report3.html"

        generate_html_report(
            scan_results=[empty_source],
            output_path=report_file,
            client_name="Empty Corp",
            engagement="PHI Scan",
        )

        assert report_file.exists()
        html = report_file.read_text()
        assert len(html) > 0
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# 16. FINDING CONTEXT (surrounding text, line number, offset)
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
class TestFindingContext:
    """Check the positional metadata on a finding: context, line number, offset."""

    def test_context_radius(self, tmp_path: Path) -> None:
        """The SSN finding's context contains the label or the matched value."""
        result = _scan(
            tmp_path,
            ("ctx", "Patient name: John | SSN: 123-45-6789 | DOB: 01-01-1980\n", ".txt"),
        )
        ssn_f = [ff for ff in result.findings if ff.pattern_key == "ssn"]
        assert ssn_f
        assert "SSN" in ssn_f[0].context or "123-45-6789" in ssn_f[0].context

    def test_line_number_correct(self, tmp_path: Path) -> None:
        """An SSN on the second line of the file is reported with line_number 2."""
        content = "Line 1 no data\nLine 2 SSN: 123-45-6789 here\nLine 3 done\n"
        result = _scan(tmp_path, ("linenum", content, ".txt"))
        ssn_f = [ff for ff in result.findings if ff.pattern_key == "ssn"]
        assert ssn_f
        assert ssn_f[0].line_number == 2, f"Expected line 2, got {ssn_f[0].line_number}"

    def test_finding_offset(self, tmp_path: Path) -> None:
        """The SSN finding exposes a non-negative ``offset``."""
        result = _scan(tmp_path, ("offset", "Prefix SSN: 123-45-6789 suffix\n", ".txt"))
        ssn_f = [ff for ff in result.findings if ff.pattern_key == "ssn"]
        assert ssn_f
        assert ssn_f[0].offset >= 0, "Offset should be non-negative"
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# 17. EDGE CASES
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
class TestEdgeCases:
    """Empty and whitespace-only files, ignore patterns, clean files, multiple files, subdirectories."""

    def test_empty_file(self, tmp_path: Path) -> None:
        """A zero-byte file is counted as scanned and yields no findings."""
        result = _scan(tmp_path, ("empty", "", ".txt"))
        assert result.total_files_scanned == 1
        assert result.total_findings == 0

    def test_whitespace_only_file(self, tmp_path: Path) -> None:
        """A file holding only spaces and newlines is scanned and yields no findings."""
        result = _scan(tmp_path, ("ws_only", "   \n\n  \n", ".txt"))
        assert result.total_files_scanned == 1
        assert result.total_findings == 0

    def test_ignore_pattern_excludes_file(self, tmp_path: Path) -> None:
        """A file matching an ignore pattern is not scanned; the other file is."""
        _make_file(tmp_path, "keep_data", "SSN: 111-11-1111\n", ".txt")
        _make_file(tmp_path, "skip_me", "SSN: 999-88-7777\n", ".txt")
        scanner = PHIScanner(str(tmp_path), ignore_patterns={"skip_me"})
        result = scanner.scan(show_progress=False)

        # skip_me.txt should be excluded; keep_data.txt should be scanned.
        assert result.total_files_scanned == 1, f"Expected 1, got {result.total_files_scanned}"
        # Guard before inspecting findings so a regression fails with a clear
        # message rather than an IndexError.
        assert result.findings, "keep_data.txt should still produce findings"
        stray = [ff.file for ff in result.findings if not ff.file.endswith("keep_data.txt")]
        assert not stray, f"Findings reported for files other than keep_data.txt: {stray}"

    def test_no_findings_clean_file(self, tmp_path: Path) -> None:
        """A file without PHI produces zero findings and zero files with findings."""
        result = _scan(tmp_path, ("clean", "This file contains no PHI data.\n", ".txt"))
        assert result.total_findings == 0
        assert result.files_with_findings == 0

    def test_multiple_files_scanned(self, tmp_path: Path) -> None:
        """Three files are all scanned; the two containing an SSN yield two SSN findings."""
        result = _scan(
            tmp_path,
            ("f1", "SSN: 111-11-1111\n", ".txt"),
            ("f2", "SSN: 222-22-2222\n", ".txt"),
            ("f3", "No PHI here\n", ".txt"),
        )
        assert result.total_files_scanned == 3
        ssn_count = sum(1 for ff in result.findings if ff.pattern_key == "ssn")
        assert ssn_count == 2

    def test_nested_directories_scanned(self, tmp_path: Path) -> None:
        """Files in a subdirectory are scanned along with top-level files."""
        sub = tmp_path / "subdir"
        sub.mkdir()
        _make_file(tmp_path, "top", "Email: top@example.com\n", ".txt")
        _make_file(sub, "nested", "Email: nested@example.com\n", ".txt")
        scanner = PHIScanner(str(tmp_path))
        result = scanner.scan(show_progress=False)

        assert result.total_files_scanned == 2
        email_count = sum(1 for ff in result.findings if ff.pattern_key == "email")
        assert email_count == 2
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# 18. SCANNABLE_EXTENSIONS CONSTANT
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
class TestScannableExtensionsConstant:
    """SCANNABLE_EXTENSIONS contains every extension the suite relies on."""

    def test_required_extensions_present(self) -> None:
        """Each required suffix is a member of SCANNABLE_EXTENSIONS."""
        required = {".txt", ".csv", ".json", ".xml", ".sql", ".log", ".py", ".md", ".yaml", ".yml"}
        # Convert to a set first so the difference (and therefore the failure
        # message) works whatever container type the constant uses.
        missing = required - set(SCANNABLE_EXTENSIONS)
        assert not missing, f"Missing: {missing}"
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# 19. FINDING DISPLAY
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
class TestFindingDisplay:
    """``Finding.display()`` returns a non-empty string."""

    def test_display_returns_string(self, tmp_path: Path) -> None:
        """Calling display() on an MRN finding yields a non-empty ``str``."""
        result = _scan(tmp_path, ("disp", "MRN: 77441\n", ".txt"))
        mrn_f = [ff for ff in result.findings if ff.pattern_key == "mrn"]
        assert mrn_f
        disp = mrn_f[0].display()
        assert isinstance(disp, str)
        assert disp
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# 20. PATTERN KEYS AND LABELS
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
class TestPatternDefinitions:
    """Sanity checks on the PHI_PATTERNS registry."""

    def test_all_patterns_have_required_fields(self):
        """Every pattern entry carries the full set of metadata keys."""
        required_meta = {"label", "severity", "description", "pattern", "context_radius"}
        for key in PHI_PATTERNS:
            cfg = PHI_PATTERNS[key]
            missing = required_meta.difference(cfg.keys())
            assert not missing, f"Pattern '{key}' missing fields: {missing}"

    def test_ssn_pattern_compiled(self):
        """The 'ssn' entry has a non-None pattern."""
        ssn_cfg = PHI_PATTERNS["ssn"]
        assert ssn_cfg["pattern"] is not None

    def test_email_pattern_compiled(self):
        """The 'email' entry has a non-None pattern."""
        email_cfg = PHI_PATTERNS["email"]
        assert email_cfg["pattern"] is not None

    def test_phone_pattern_compiled(self):
        """The 'phone' entry has a non-None pattern."""
        phone_cfg = PHI_PATTERNS["phone"]
        assert phone_cfg["pattern"] is not None
|
||
|
|
|
||
|
|
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
# 21. REPORT FORMATS
|
||
|
|
# ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
class TestReportFormats:
|
||
|
|
"""Scanner report() method supports table and json formats."""
|
||
|
|
|
||
|
|
def test_report_table_format(self, tmp_path: Path) -> None:
    """``report(format="table")`` returns a string mentioning the SSN finding.

    The scanner instance is kept (rather than using ``_scan``) because
    ``report()`` is called on it after the scan.
    """
    _make_file(tmp_path, "rep_table", "SSN: 123-45-6789\n", ".txt")
    scanner = PHIScanner(str(tmp_path))
    scanner.scan(show_progress=False)
    report = scanner.report(format="table")
    assert isinstance(report, str)
    assert "SSN" in report or "HIGH" in report
|
||
|
|
|
||
|
|
def test_report_json_format(self, tmp_path: Path):
|
||
|
|
f = _make_file(tmp_path, "rep_json", "SSN: 123-45-6789\n", ".txt")
|
||
|
|
scanner = PHIScanner(str(tmp_path))
|
||
|
|
scanner.scan(show_progress=False)
|
||
|
|
report = scanner.report(format="json")
|
||
|
|
d = json.loads(report)
|
||
|
|
assert "findings" in d
|
||
|
|
assert isinstance(d["findings"], list)
|