GDPRScanner/tests/test_document_scanner.py
2026-04-11 04:38:11 +02:00

225 lines
9.6 KiB
Python

"""
test_document_scanner.py — Tests for CPR detection in document_scanner.py.
Covers:
- extract_matches: context-gated CPR detection
- is_valid_cpr: date validation and modulo-11
- scan_docx: CPR detection in Word documents (including table cells)
- scan_xlsx: CPR detection in Excel cells with context
- False-positive suppression (invoices, phone numbers, account numbers)
- cpr_detector._scan_bytes: binary garbage, empty input, unsupported extensions
"""
import sys
import tempfile
from pathlib import Path
import pytest
sys.path.insert(0, str(Path(__file__).parent.parent))
import document_scanner as ds
# ─────────────────────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────────────────────
def _cprs(text: str) -> list:
    """Run ``ds.extract_matches`` on *text* and return its CPR matches.

    ``extract_matches`` returns a pair; only the first element (the CPR
    entries, which the tests index like dicts) is handed back. The second
    element is discarded.

    NOTE(review): the fixed positional arguments ``1`` and ``"test"`` are
    presumably a page/line number and a source label -- confirm against
    document_scanner.
    """
    cpr_hits, _ignored = ds.extract_matches(text, 1, "test")
    return cpr_hits
def _has_cpr(text: str) -> bool:
    """Tell whether ``_cprs`` reports at least one match for *text*."""
    if _cprs(text):
        return True
    return False
# ─────────────────────────────────────────────────────────────────────────────
# 1. Date validation — is_valid_cpr
# ─────────────────────────────────────────────────────────────────────────────
class TestIsValidCpr:
    """Checks on the two-element result of ``ds.is_valid_cpr``.

    The function is called with four string arguments; judging by the
    cases below they are day, month, two-digit year and the trailing four
    digits. The first result element is the date-validity flag, the second
    is the modulo-11 flag.
    """

    @staticmethod
    def _date_flag(day, month, year, serial):
        """Return only the first element of the ``is_valid_cpr`` pair."""
        date_ok, _ = ds.is_valid_cpr(day, month, year, serial)
        return date_ok

    def test_valid_date_returns_true(self):
        # Day 29 of month 04 exists.
        date_ok = self._date_flag("29", "04", "72", "1234")
        assert date_ok is True

    def test_invalid_month_returns_false(self):
        # There is no month 13.
        date_ok = self._date_flag("01", "13", "70", "1234")
        assert date_ok is False

    def test_invalid_day_zero_returns_false(self):
        # Day numbering starts at 1.
        date_ok = self._date_flag("00", "01", "70", "1234")
        assert date_ok is False

    def test_invalid_day_32_returns_false(self):
        # No month has 32 days.
        date_ok = self._date_flag("32", "01", "70", "1234")
        assert date_ok is False

    def test_february_31_invalid(self):
        # The day must be valid for the given month, not merely <= 31.
        date_ok = self._date_flag("31", "02", "90", "1234")
        assert date_ok is False

    def test_returns_tuple_of_two(self):
        outcome = ds.is_valid_cpr("01", "01", "70", "1234")
        assert isinstance(outcome, tuple)
        assert len(outcome) == 2

    def test_mod11_field_is_bool(self):
        # Only the type of the second element is checked, not its value.
        _, mod11_flag = ds.is_valid_cpr("01", "01", "70", "1234")
        assert isinstance(mod11_flag, bool)
# ─────────────────────────────────────────────────────────────────────────────
# 2. extract_matches — context-gated detection
# ─────────────────────────────────────────────────────────────────────────────
class TestExtractMatches:
    """Context-gated CPR detection, exercised through ``_cprs`` / ``_has_cpr``.

    The first group feeds text that must yield a match; the second group
    feeds text that must yield none.
    """

    # ── Should detect ─────────────────────────────────────────────────────────
    def test_detects_cpr_with_label(self):
        sample = "CPR: 290472-1234"
        assert _has_cpr(sample)

    def test_detects_cpr_uppercase_label(self):
        sample = "CPR-nummer: 290472-1234"
        assert _has_cpr(sample)

    def test_detects_personnummer_keyword(self):
        sample = "personnummer 010185-4321"
        assert _has_cpr(sample)

    def test_detects_no_separator(self):
        # Ten digits in a row, no hyphen or space between date and serial.
        sample = "cpr nummer 2904721234"
        assert _has_cpr(sample)

    def test_detects_space_separator(self):
        sample = "CPR 290472 1234"
        assert _has_cpr(sample)

    def test_result_contains_formatted_field(self):
        first_hit = _cprs("CPR: 290472-1234")[0]
        assert first_hit["formatted"] == "290472-1234"

    def test_result_contains_raw_field(self):
        first_hit = _cprs("CPR: 290472-1234")[0]
        assert "raw" in first_hit

    def test_multiple_cprs_returned(self):
        two_numbers = "CPR: 290472-1234 og personnummer 010185-4321"
        hits = _cprs(two_numbers)
        assert len(hits) == 2

    # ── Should NOT detect ─────────────────────────────────────────────────────
    def test_rejects_naked_number_without_context(self):
        # No context keyword and no mod-11 — should be suppressed
        bare = "2904721234"
        assert not _has_cpr(bare)

    def test_rejects_phone_number_8_digits(self):
        phone_line = "ring 12345678 for info"
        assert not _has_cpr(phone_line)

    def test_rejects_invoice_context(self):
        invoice_line = "faktura nr 290472-1234"
        assert not _has_cpr(invoice_line)

    def test_rejects_part_number_context(self):
        part_line = "del nr. 290472-1234"
        assert not _has_cpr(part_line)

    def test_rejects_invalid_date(self):
        # Month 13 — date invalid, should not appear
        bad_date = "CPR: 011370-1234"
        assert not _has_cpr(bad_date)

    def test_empty_string(self):
        nothing = ""
        assert not _has_cpr(nothing)

    def test_plain_prose_no_numbers(self):
        prose = "Ingen personoplysninger i denne tekst."
        assert not _has_cpr(prose)
# ─────────────────────────────────────────────────────────────────────────────
# 3. scan_docx
# ─────────────────────────────────────────────────────────────────────────────
class TestScanDocx:
    """``ds.scan_docx`` on a document with CPRs and on a clean document.

    ``docx_with_cpr`` and ``docx_no_cpr`` are pytest fixtures defined outside
    this file. The scan result is used as a mapping with a ``"cprs"`` entry
    and an optional ``"error"`` entry.
    """

    def test_detects_cpr_in_paragraph(self, docx_with_cpr):
        hits = ds.scan_docx(docx_with_cpr)["cprs"]
        assert len(hits) >= 1

    def test_detects_multiple_cprs(self, docx_with_cpr):
        hits = ds.scan_docx(docx_with_cpr)["cprs"]
        assert len(hits) >= 2

    def test_detects_cpr_in_table_cell(self, docx_with_cpr):
        hits = ds.scan_docx(docx_with_cpr)["cprs"]
        # Fixture: 2 CPRs in paragraphs + 1 in a table cell (with context)
        assert len(hits) >= 3

    def test_no_false_positive_on_clean_doc(self, docx_no_cpr):
        hits = ds.scan_docx(docx_no_cpr)["cprs"]
        assert hits == []

    def test_returns_cprs_key(self, docx_with_cpr):
        scan = ds.scan_docx(docx_with_cpr)
        assert "cprs" in scan

    def test_no_error_on_clean_doc(self, docx_no_cpr):
        scan = ds.scan_docx(docx_no_cpr)
        # A missing "error" key and an explicit None both pass.
        assert scan.get("error") is None
# ─────────────────────────────────────────────────────────────────────────────
# 4. scan_xlsx
# ─────────────────────────────────────────────────────────────────────────────
class TestScanXlsx:
    """``ds.scan_xlsx`` on a workbook with CPRs and on one without.

    ``xlsx_with_cpr`` and ``xlsx_no_cpr`` are pytest fixtures defined outside
    this file.
    """

    def test_detects_cpr_in_cell_with_context(self, xlsx_with_cpr):
        hits = ds.scan_xlsx(xlsx_with_cpr)["cprs"]
        assert len(hits) >= 1

    def test_no_false_positive_on_account_numbers(self, xlsx_no_cpr):
        hits = ds.scan_xlsx(xlsx_no_cpr)["cprs"]
        assert hits == []

    def test_returns_cprs_key(self, xlsx_with_cpr):
        scan = ds.scan_xlsx(xlsx_with_cpr)
        assert "cprs" in scan
# ─────────────────────────────────────────────────────────────────────────────
# 5. Binary / edge cases via cpr_detector._scan_bytes
# ─────────────────────────────────────────────────────────────────────────────
class TestScanBytes:
    """Edge cases for ``cpr_detector._scan_bytes`` plus one ``extract_matches`` check.

    ``_scan_bytes`` is called with raw bytes and a file name; the result is
    used as a dict with a ``"cprs"`` entry. ``cpr_detector`` is imported
    inside each test that uses it rather than at module level. All fixtures
    (``binary_garbage``, ``txt_with_art9``, ``docx_with_cpr``,
    ``xlsx_with_cpr``) are defined outside this file and are used as
    path-like objects.
    """

    def test_binary_garbage_does_not_crash(self, binary_garbage):
        import cpr_detector

        data = binary_garbage.read_bytes()
        result = cpr_detector._scan_bytes(data, "sample.bin")
        # Only the result shape is checked; the content of "cprs" is not.
        assert isinstance(result, dict)
        assert "cprs" in result

    def test_empty_bytes_returns_empty(self):
        import cpr_detector

        result = cpr_detector._scan_bytes(b"", "empty.txt")
        assert result["cprs"] == []

    def test_txt_with_cpr_detected(self, txt_with_art9):
        # scan_text in document_scanner calls undefined extract_cpr_and_dates;
        # test the underlying extract_matches directly on the file content.
        # The module-level ``ds`` alias is used, so this test needs no local
        # imports (cpr_detector is not exercised here).
        text = txt_with_art9.read_text(encoding='utf-8')
        cprs, _ = ds.extract_matches(text, 1, 'test')
        assert len(cprs) >= 1

    def test_docx_with_cpr_via_scan_bytes(self, docx_with_cpr):
        import cpr_detector

        data = docx_with_cpr.read_bytes()
        result = cpr_detector._scan_bytes(data, "sample.docx")
        assert len(result["cprs"]) >= 1

    def test_xlsx_with_cpr_via_scan_bytes(self, xlsx_with_cpr):
        import cpr_detector

        data = xlsx_with_cpr.read_bytes()
        result = cpr_detector._scan_bytes(data, "sample.xlsx")
        assert len(result["cprs"]) >= 1

    def test_unsupported_extension_does_not_crash(self):
        import cpr_detector

        result = cpr_detector._scan_bytes(b"some bytes", "file.xyz")
        # An unknown extension must still produce a dict rather than raise.
        assert isinstance(result, dict)