225 lines
9.6 KiB
Python
225 lines
9.6 KiB
Python
"""
|
|
test_document_scanner.py — Tests for CPR detection in document_scanner.py.
|
|
|
|
Covers:
|
|
- extract_matches: context-gated CPR detection
|
|
- is_valid_cpr: date validation and modulo-11
|
|
- scan_docx: CPR detection in Word documents (including table cells)
|
|
- scan_xlsx: CPR detection in Excel cells with context
|
|
- False-positive suppression (invoices, phone numbers, account numbers)
|
|
"""
|
|
import sys
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
import document_scanner as ds
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# Helpers
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
def _cprs(text: str) -> list:
    """Return the CPR matches that ``ds.extract_matches`` finds in *text*.

    ``extract_matches`` is called with the fixed extra arguments ``1`` and
    ``"test"`` and is expected to return a two-item result. Only the first
    item (the CPR matches) is handed back; the second item is discarded.
    """
    cpr_matches, _ignored = ds.extract_matches(text, 1, "test")
    return cpr_matches
|
|
|
|
|
|
def _has_cpr(text: str) -> bool:
    """Tell whether ``_cprs`` returns a non-empty (truthy) result for *text*."""
    if _cprs(text):
        return True
    return False
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# 1. Date validation — is_valid_cpr
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
class TestIsValidCpr:
    """Checks on ``ds.is_valid_cpr``: the date-validity flag and result shape.

    Each call passes four digit strings. Going by the test names, the first
    is the day and the second the month; the last two are presumably the
    two-digit year and the four-digit serial — confirm against
    ``is_valid_cpr``.
    """

    @staticmethod
    def _date_flag(day, month, year, serial):
        """Return the first element of the pair produced by ``is_valid_cpr``."""
        date_ok, _mod11 = ds.is_valid_cpr(day, month, year, serial)
        return date_ok

    def test_valid_date_returns_true(self):
        """Day 29 / month 04 must yield a first element that is ``True``."""
        assert self._date_flag("29", "04", "72", "1234") is True

    def test_invalid_month_returns_false(self):
        """Month ``13`` must yield a first element that is ``False``."""
        assert self._date_flag("01", "13", "70", "1234") is False

    def test_invalid_day_zero_returns_false(self):
        """Day ``00`` must yield a first element that is ``False``."""
        assert self._date_flag("00", "01", "70", "1234") is False

    def test_invalid_day_32_returns_false(self):
        """Day ``32`` must yield a first element that is ``False``."""
        assert self._date_flag("32", "01", "70", "1234") is False

    def test_february_31_invalid(self):
        """Day ``31`` in month ``02`` must yield ``False``."""
        assert self._date_flag("31", "02", "90", "1234") is False

    def test_returns_tuple_of_two(self):
        """The return value is a ``tuple`` holding exactly two items."""
        outcome = ds.is_valid_cpr("01", "01", "70", "1234")
        assert isinstance(outcome, tuple)
        assert len(outcome) == 2

    def test_mod11_field_is_bool(self):
        """The second item of the returned pair is a ``bool``."""
        _date_ok, mod11_flag = ds.is_valid_cpr("01", "01", "70", "1234")
        assert isinstance(mod11_flag, bool)
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# 2. extract_matches — context-gated detection
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
class TestExtractMatches:
    """Detection and suppression cases run through ``_cprs`` / ``_has_cpr``."""

    # ── Should detect ─────────────────────────────────────────────────────────

    def test_detects_cpr_with_label(self):
        """A ``CPR:`` label followed by a hyphenated number is reported."""
        sample = "CPR: 290472-1234"
        assert _has_cpr(sample)

    def test_detects_cpr_uppercase_label(self):
        """A ``CPR-nummer:`` label followed by a hyphenated number is reported."""
        sample = "CPR-nummer: 290472-1234"
        assert _has_cpr(sample)

    def test_detects_personnummer_keyword(self):
        """The keyword ``personnummer`` in front of a number is reported."""
        sample = "personnummer 010185-4321"
        assert _has_cpr(sample)

    def test_detects_no_separator(self):
        """Ten consecutive digits after ``cpr nummer`` are reported."""
        sample = "cpr nummer 2904721234"
        assert _has_cpr(sample)

    def test_detects_space_separator(self):
        """A space between the two digit groups is accepted."""
        sample = "CPR 290472 1234"
        assert _has_cpr(sample)

    def test_result_contains_formatted_field(self):
        """The first match carries ``formatted`` equal to ``290472-1234``."""
        first_match = _cprs("CPR: 290472-1234")[0]
        assert first_match["formatted"] == "290472-1234"

    def test_result_contains_raw_field(self):
        """The first match carries a ``raw`` key."""
        first_match = _cprs("CPR: 290472-1234")[0]
        assert "raw" in first_match

    def test_multiple_cprs_returned(self):
        """Two labelled numbers in one string produce exactly two matches."""
        matches = _cprs("CPR: 290472-1234 og personnummer 010185-4321")
        assert len(matches) == 2

    # ── Should NOT detect ─────────────────────────────────────────────────────

    def test_rejects_naked_number_without_context(self):
        """A bare ten-digit number is not reported."""
        # Neither a context keyword nor mod-11 backs this number up, so it
        # is expected to be suppressed.
        sample = "2904721234"
        assert not _has_cpr(sample)

    def test_rejects_phone_number_8_digits(self):
        """An eight-digit number inside a sentence is not reported."""
        sample = "ring 12345678 for info"
        assert not _has_cpr(sample)

    def test_rejects_invoice_context(self):
        """A number introduced by ``faktura nr`` is not reported."""
        sample = "faktura nr 290472-1234"
        assert not _has_cpr(sample)

    def test_rejects_part_number_context(self):
        """A number introduced by ``del nr.`` is not reported."""
        sample = "del nr. 290472-1234"
        assert not _has_cpr(sample)

    def test_rejects_invalid_date(self):
        """A labelled number whose date part is impossible is not reported."""
        # Month 13 makes the date invalid, so no match should appear.
        sample = "CPR: 011370-1234"
        assert not _has_cpr(sample)

    def test_empty_string(self):
        """The empty string produces no match."""
        sample = ""
        assert not _has_cpr(sample)

    def test_plain_prose_no_numbers(self):
        """Prose without any digits produces no match."""
        sample = "Ingen personoplysninger i denne tekst."
        assert not _has_cpr(sample)
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# 3. scan_docx
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
class TestScanDocx:
    """``ds.scan_docx`` run against the ``docx_with_cpr`` / ``docx_no_cpr`` fixtures.

    Both fixtures are defined outside this file (presumably in conftest.py).
    """

    def test_detects_cpr_in_paragraph(self, docx_with_cpr):
        """At least one CPR is reported for the CPR-bearing document."""
        hits = ds.scan_docx(docx_with_cpr)["cprs"]
        assert len(hits) >= 1

    def test_detects_multiple_cprs(self, docx_with_cpr):
        """At least two CPRs are reported for the CPR-bearing document."""
        hits = ds.scan_docx(docx_with_cpr)["cprs"]
        assert len(hits) >= 2

    def test_detects_cpr_in_table_cell(self, docx_with_cpr):
        """At least three CPRs are reported, which requires the table hit."""
        hits = ds.scan_docx(docx_with_cpr)["cprs"]
        # The fixture holds 2 CPRs in paragraphs plus 1 in a table cell
        # (with context), so reaching three means the table was scanned.
        assert len(hits) >= 3

    def test_no_false_positive_on_clean_doc(self, docx_no_cpr):
        """The CPR-free document yields an empty ``cprs`` list."""
        report = ds.scan_docx(docx_no_cpr)
        assert report["cprs"] == []

    def test_returns_cprs_key(self, docx_with_cpr):
        """The scan result exposes a ``cprs`` key."""
        report = ds.scan_docx(docx_with_cpr)
        assert "cprs" in report

    def test_no_error_on_clean_doc(self, docx_no_cpr):
        """The scan result has no ``error`` entry, or it is ``None``."""
        report = ds.scan_docx(docx_no_cpr)
        assert report.get("error") is None
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# 4. scan_xlsx
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
class TestScanXlsx:
    """``ds.scan_xlsx`` run against the ``xlsx_with_cpr`` / ``xlsx_no_cpr`` fixtures.

    Both fixtures are defined outside this file (presumably in conftest.py).
    """

    def test_detects_cpr_in_cell_with_context(self, xlsx_with_cpr):
        """At least one CPR is reported for the CPR-bearing workbook."""
        hits = ds.scan_xlsx(xlsx_with_cpr)["cprs"]
        assert len(hits) >= 1

    def test_no_false_positive_on_account_numbers(self, xlsx_no_cpr):
        """The CPR-free workbook yields an empty ``cprs`` list."""
        report = ds.scan_xlsx(xlsx_no_cpr)
        assert report["cprs"] == []

    def test_returns_cprs_key(self, xlsx_with_cpr):
        """The scan result exposes a ``cprs`` key."""
        report = ds.scan_xlsx(xlsx_with_cpr)
        assert "cprs" in report
|
|
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
# 5. Binary / edge cases via cpr_detector._scan_bytes
|
|
# ─────────────────────────────────────────────────────────────────────────────
|
|
|
|
class TestScanBytes:
    """Smoke and detection tests around ``cpr_detector._scan_bytes``.

    ``cpr_detector`` is imported inside each test that uses it, so a failing
    import shows up in those tests rather than during collection of the
    whole module. The fixtures used here are defined outside this file
    (presumably in conftest.py).
    """

    def test_binary_garbage_does_not_crash(self, binary_garbage):
        """Arbitrary binary input still yields a dict with a ``cprs`` key."""
        import cpr_detector

        data = binary_garbage.read_bytes()
        result = cpr_detector._scan_bytes(data, "sample.bin")
        assert isinstance(result, dict)
        assert "cprs" in result

    def test_empty_bytes_returns_empty(self):
        """Empty input yields an empty ``cprs`` list."""
        import cpr_detector

        result = cpr_detector._scan_bytes(b"", "empty.txt")
        assert result["cprs"] == []

    def test_txt_with_cpr_detected(self, txt_with_art9):
        """The text fixture's content produces at least one CPR match.

        Per the original note, ``scan_text`` in document_scanner calls the
        undefined ``extract_cpr_and_dates``; this test therefore reads the
        fixture file itself and exercises ``extract_matches`` directly.
        """
        # Uses the module-level ``ds`` import; no per-test import is needed.
        text = txt_with_art9.read_text(encoding="utf-8")
        cprs, _ = ds.extract_matches(text, 1, "test")
        assert len(cprs) >= 1

    def test_docx_with_cpr_via_scan_bytes(self, docx_with_cpr):
        """Bytes named ``sample.docx`` yield at least one CPR."""
        import cpr_detector

        data = docx_with_cpr.read_bytes()
        result = cpr_detector._scan_bytes(data, "sample.docx")
        assert len(result["cprs"]) >= 1

    def test_xlsx_with_cpr_via_scan_bytes(self, xlsx_with_cpr):
        """Bytes named ``sample.xlsx`` yield at least one CPR."""
        import cpr_detector

        data = xlsx_with_cpr.read_bytes()
        result = cpr_detector._scan_bytes(data, "sample.xlsx")
        assert len(result["cprs"]) >= 1

    def test_unsupported_extension_does_not_crash(self):
        """A file name with an unknown extension still yields a dict."""
        import cpr_detector

        result = cpr_detector._scan_bytes(b"some bytes", "file.xyz")
        assert isinstance(result, dict)
|