""" test_document_scanner.py — Tests for CPR detection in document_scanner.py. Covers: - extract_matches: context-gated CPR detection - is_valid_cpr: date validation and modulo-11 - scan_docx: CPR detection in Word documents (including table cells) - scan_xlsx: CPR detection in Excel cells with context - False-positive suppression (invoices, phone numbers, account numbers) """ import sys import tempfile from pathlib import Path import pytest sys.path.insert(0, str(Path(__file__).parent.parent)) import document_scanner as ds # ───────────────────────────────────────────────────────────────────────────── # Helpers # ───────────────────────────────────────────────────────────────────────────── def _cprs(text: str) -> list: """Return list of CPR dicts found in text via extract_matches.""" found, _ = ds.extract_matches(text, 1, "test") return found def _has_cpr(text: str) -> bool: return bool(_cprs(text)) # ───────────────────────────────────────────────────────────────────────────── # 1. Date validation — is_valid_cpr # ───────────────────────────────────────────────────────────────────────────── class TestIsValidCpr: def test_valid_date_returns_true(self): valid, _ = ds.is_valid_cpr("29", "04", "72", "1234") assert valid is True def test_invalid_month_returns_false(self): valid, _ = ds.is_valid_cpr("01", "13", "70", "1234") assert valid is False def test_invalid_day_zero_returns_false(self): valid, _ = ds.is_valid_cpr("00", "01", "70", "1234") assert valid is False def test_invalid_day_32_returns_false(self): valid, _ = ds.is_valid_cpr("32", "01", "70", "1234") assert valid is False def test_february_31_invalid(self): valid, _ = ds.is_valid_cpr("31", "02", "90", "1234") assert valid is False def test_returns_tuple_of_two(self): result = ds.is_valid_cpr("01", "01", "70", "1234") assert isinstance(result, tuple) assert len(result) == 2 def test_mod11_field_is_bool(self): _, mod11 = ds.is_valid_cpr("01", "01", "70", "1234") assert isinstance(mod11, bool) # ───────────────────────────────────────────────────────────────────────────── # 2. extract_matches — context-gated detection # ───────────────────────────────────────────────────────────────────────────── class TestExtractMatches: # ── Should detect ───────────────────────────────────────────────────────── def test_detects_cpr_with_label(self): assert _has_cpr("CPR: 290472-1234") def test_detects_cpr_uppercase_label(self): assert _has_cpr("CPR-nummer: 290472-1234") def test_detects_personnummer_keyword(self): assert _has_cpr("personnummer 010185-4321") def test_detects_no_separator(self): assert _has_cpr("cpr nummer 2904721234") def test_detects_space_separator(self): assert _has_cpr("CPR 290472 1234") def test_result_contains_formatted_field(self): cprs = _cprs("CPR: 290472-1234") assert cprs[0]["formatted"] == "290472-1234" def test_result_contains_raw_field(self): cprs = _cprs("CPR: 290472-1234") assert "raw" in cprs[0] def test_multiple_cprs_returned(self): text = "CPR: 290472-1234 og personnummer 010185-4321" cprs = _cprs(text) assert len(cprs) == 2 # ── Should NOT detect ───────────────────────────────────────────────────── def test_rejects_naked_number_without_context(self): # No context keyword and no mod-11 — should be suppressed assert not _has_cpr("2904721234") def test_rejects_phone_number_8_digits(self): assert not _has_cpr("ring 12345678 for info") def test_rejects_invoice_context(self): assert not _has_cpr("faktura nr 290472-1234") def test_rejects_part_number_context(self): assert not _has_cpr("del nr. 290472-1234") def test_rejects_invalid_date(self): # Month 13 — date invalid, should not appear assert not _has_cpr("CPR: 011370-1234") def test_empty_string(self): assert not _has_cpr("") def test_plain_prose_no_numbers(self): assert not _has_cpr("Ingen personoplysninger i denne tekst.") # ───────────────────────────────────────────────────────────────────────────── # 3. scan_docx # ───────────────────────────────────────────────────────────────────────────── class TestScanDocx: def test_detects_cpr_in_paragraph(self, docx_with_cpr): result = ds.scan_docx(docx_with_cpr) assert len(result["cprs"]) >= 1 def test_detects_multiple_cprs(self, docx_with_cpr): result = ds.scan_docx(docx_with_cpr) assert len(result["cprs"]) >= 2 def test_detects_cpr_in_table_cell(self, docx_with_cpr): result = ds.scan_docx(docx_with_cpr) # Fixture: 2 CPRs in paragraphs + 1 in a table cell (with context) assert len(result["cprs"]) >= 3 def test_no_false_positive_on_clean_doc(self, docx_no_cpr): result = ds.scan_docx(docx_no_cpr) assert result["cprs"] == [] def test_returns_cprs_key(self, docx_with_cpr): result = ds.scan_docx(docx_with_cpr) assert "cprs" in result def test_no_error_on_clean_doc(self, docx_no_cpr): result = ds.scan_docx(docx_no_cpr) assert result.get("error") is None # ───────────────────────────────────────────────────────────────────────────── # 4. scan_xlsx # ───────────────────────────────────────────────────────────────────────────── class TestScanXlsx: def test_detects_cpr_in_cell_with_context(self, xlsx_with_cpr): result = ds.scan_xlsx(xlsx_with_cpr) assert len(result["cprs"]) >= 1 def test_no_false_positive_on_account_numbers(self, xlsx_no_cpr): result = ds.scan_xlsx(xlsx_no_cpr) assert result["cprs"] == [] def test_returns_cprs_key(self, xlsx_with_cpr): result = ds.scan_xlsx(xlsx_with_cpr) assert "cprs" in result # ───────────────────────────────────────────────────────────────────────────── # 5. Binary / edge cases via cpr_detector._scan_bytes # ───────────────────────────────────────────────────────────────────────────── class TestScanBytes: def test_binary_garbage_does_not_crash(self, binary_garbage): import cpr_detector data = binary_garbage.read_bytes() result = cpr_detector._scan_bytes(data, "sample.bin") assert isinstance(result, dict) assert "cprs" in result def test_empty_bytes_returns_empty(self): import cpr_detector result = cpr_detector._scan_bytes(b"", "empty.txt") assert result["cprs"] == [] def test_txt_with_cpr_detected(self, txt_with_art9): import cpr_detector, document_scanner as ds # scan_text in document_scanner calls undefined extract_cpr_and_dates; # test the underlying extract_matches directly on the file content. text = txt_with_art9.read_text(encoding='utf-8') cprs, _ = ds.extract_matches(text, 1, 'test') assert len(cprs) >= 1 def test_docx_with_cpr_via_scan_bytes(self, docx_with_cpr): import cpr_detector data = docx_with_cpr.read_bytes() result = cpr_detector._scan_bytes(data, "sample.docx") assert len(result["cprs"]) >= 1 def test_xlsx_with_cpr_via_scan_bytes(self, xlsx_with_cpr): import cpr_detector data = xlsx_with_cpr.read_bytes() result = cpr_detector._scan_bytes(data, "sample.xlsx") assert len(result["cprs"]) >= 1 def test_unsupported_extension_does_not_crash(self): import cpr_detector result = cpr_detector._scan_bytes(b"some bytes", "file.xyz") assert isinstance(result, dict)