From 2254e00481d16774d50e348c18dc241bd020915a Mon Sep 17 00:00:00 2001
From: StyxX65 <150797939+StyxX65@users.noreply.github.com>
Date: Sat, 25 Apr 2026 19:33:28 +0200
Subject: [PATCH] =?UTF-8?q?recap:=20Added=20email=20and=20phone=20number?=
 =?UTF-8?q?=20detection=20as=20opt-in=20scan=20options=20across=20all=20th?=
 =?UTF-8?q?ree=20engines,=20plus=20translation=20=20=20=20=20=20=20=20fixe?=
 =?UTF-8?q?s.=20Both=20CHANGELOG=20and=20SUGGESTIONS=20are=20updated=20?=
 =?UTF-8?q?=E2=80=94=20everything=20is=20committed=20and=20ready=20to=20te?=
 =?UTF-8?q?st.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 CHANGELOG.md          |   2 +
 SUGGESTIONS.md        |  11 ++++
 cpr_detector.py       | 130 +++++++++++++++++++++++++++++++++++-------
 gdpr_db.py            |   9 ++-
 lang/da.json          |   6 ++
 lang/de.json          |   6 ++
 lang/en.json          |   6 ++
 routes/google_scan.py |  18 +++++-
 scan_engine.py        |  60 ++++++++++++++-----
 static/js/profiles.js |  14 +++++
 static/js/results.js  |   8 ++-
 static/js/scan.js     |   4 ++
 static/style.css      |   6 ++
 templates/index.html  |  16 ++++++
 14 files changed, 254 insertions(+), 42 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 864696e..12b0a89 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,8 @@ Version numbers follow [Semantic Versioning](https://semver.org/spec/v2.0.0.html
 
 ### Added
 
+- **Email address and Danish phone number detection** — all three scan engines (M365, Google Workspace, local/SMB/SFTP) can now flag files and messages containing email addresses or Danish phone numbers in addition to CPR numbers. Detection is opt-in per profile: two new toggle options **Scan for email addresses** and **Scan for phone numbers** (default off) appear in the scan options panel and profile editor. When enabled, matches are stored as `email_count` / `phone_count` on each DB row and surfaced as colour-coded badges in list view, grid view, and the preview panel. Email regex requires a structurally valid address (`local@domain.tld`); phone regex covers 8-digit Danish numbers with optional `+45`/`0045` prefix and common spacing patterns. Both are deduplicated before counting. Requires DB migration (adds two INTEGER columns to `flagged_items`; applied automatically on first startup via `_MIGRATIONS`).
+
 - **SFTP as a 4th file connector** — SFTP servers can now be added as file sources alongside local folders, SMB shares, and cloud sources. A new `SFTPScanner` class in `sftp_connector.py` implements the same `iter_files()` interface as `FileScanner`, so `run_file_scan()`, SSE broadcasting, DB persistence, card building, scheduled scans, and exports work without changes. Supports password auth and SSH private key auth (RSA, Ed25519, ECDSA, DSS); passphrases stored in the OS keychain. Key files uploaded via `POST /api/file_sources/upload_key` and stored in `~/.gdprscanner/sftp_keys/` with `chmod 600`. SFTP sources appear with a 🔒 icon in the sources panel. Requires `paramiko>=3.4` (optional — scanner falls back gracefully if not installed). New source-type selector (Local / Network (SMB) / SFTP) replaces the SMB path-prefix auto-detection in the add-source form.
 
 - **`POST /api/file_sources/upload_key`** — new endpoint that validates and stores an SSH private key file, returning a `key_path` for use in the source definition.
diff --git a/SUGGESTIONS.md b/SUGGESTIONS.md
index 5e078d4..98a6ad5 100644
--- a/SUGGESTIONS.md
+++ b/SUGGESTIONS.md
@@ -350,3 +350,14 @@ Write redacted copies of flagged files with CPR numbers replaced by `XXX XXXX-XX
 ### Email notification on scan completion (non-scheduled) ✅
 
 Auto-email now fires on manual scans when **Email report after manual scan** is enabled in Settings → Email report. Toggle stored as `auto_email_manual` in `smtp.json`. Implemented in `routes/scan.py` — `_maybe_send_auto_email()` is called from the `_run()` thread after `run_scan()` returns. Same Graph-first → SMTP-fallback pattern as scheduled scans. Only fires when there are flagged items and at least one recipient is configured.
+
+### Phase 2 PII: name-based roster lookup
+
+Flag documents containing the full names of students or staff — even when no CPR is present. Implementation outline:
+
+1. **Roster source** — pull names from the M365 directory (`/users?$select=displayName`), the GWS directory (`admin.list_users`), or a user-uploaded CSV. Store them as a flat list of `(first, last)` pairs, applying a minimum length threshold (~5 chars per part) to suppress common first-name noise.
+2. **Multi-pattern search** — build an Aho-Corasick automaton from the roster at scan start (`pyahocorasick`, ~50 KB, optional dep). Run each extracted text through the automaton; a hit qualifies only when the match falls on a word boundary and both the first and last name appear within a configurable window of each other (e.g. at most 100 characters apart).
+3. **Integration** — add a `_find_emails_phones`-style helper to `cpr_detector.py`; the roster is loaded once per scan run and passed in as a parameter. New `name_count` column in `flagged_items` (DB migration). New `name-badge` in the UI. Opt-in profile toggle like `scan_emails`.
+4. **NER fallback** — optionally run `spaCy` `da_core_news_sm` (~200 MB) when no roster is available to detect PERSON entities. Much higher false-positive rate; only useful as a discovery tool.
+
+**Why deferred:** requires a roster-management UI (upload CSV, choose directory source, refresh cadence), and false-positive rate depends heavily on roster quality. Name-only matches also carry lower legal weight than CPR hits. Implement after a school explicitly requests it.
diff --git a/cpr_detector.py b/cpr_detector.py
index 2095948..1b60645 100644
--- a/cpr_detector.py
+++ b/cpr_detector.py
@@ -22,6 +22,7 @@ from __future__ import annotations
 import base64
 import hashlib
 import io
+import re
 import tempfile
 import threading
 from pathlib import Path
@@ -505,55 +506,139 @@ def _detect_photo_faces(content: bytes, filename: str) -> int:
         return 0
 
 
+_EMAIL_RE = re.compile(
+    r'\b[a-zA-Z0-9][a-zA-Z0-9._%+\-]*@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}\b'
+)
+_PHONE_RE = re.compile(
+    r'(?:'
+    r'(?:\+45|0045)[\s\-]?[2-9]\d{3}[\s\-]?\d{4}'      # +45/0045 DDDD DDDD
+    r'|(?:\+45|0045)[\s\-]?[2-9]\d(?:[\s\-]\d{2}){3}'  # +45/0045 DD DD DD DD
+    r'|\b[2-9]\d{7}\b'                                    # 8 consecutive digits
+    r'|\b[2-9]\d{3}[\s\-]\d{4}\b'                        # DDDD DDDD
+    r'|\b[2-9]\d(?:[\s\-]\d{2}){3}\b'                    # DD DD DD DD
+    r')'
+)
+
+
+def _extract_text_from_bytes(content: bytes, filename: str) -> str:
+    """Extract plain text from file bytes for email/phone pattern matching.
+
+    Returns an empty string for binary media files (photos, video, audio). On a
+    parse error it falls back to a lossy UTF-8 decode; this function never raises.
+    """
+    ext = Path(filename).suffix.lower()
+    try:
+        if ext in {".txt", ".csv", ".eml", ".msg"}:
+            return content.decode("utf-8", errors="replace")
+        if ext in {".docx", ".doc"}:
+            from docx import Document as _Doc
+            doc = _Doc(io.BytesIO(content))
+            parts = [p.text for p in doc.paragraphs]
+            for tbl in doc.tables:
+                for row in tbl.rows:
+                    for cell in row.cells:
+                        parts.append(cell.text)
+            return "\n".join(parts)
+        if ext in {".xlsx", ".xlsm"}:
+            import openpyxl as _xl
+            wb = _xl.load_workbook(io.BytesIO(content), read_only=True, data_only=True)
+            parts = [
+                str(cell.value)
+                for ws in wb.worksheets
+                for row in ws.iter_rows()
+                for cell in row
+                if cell.value is not None
+            ]
+            wb.close()
+            return " ".join(parts)
+        if ext == ".pdf":
+            import pdfplumber as _pp
+            with _pp.open(io.BytesIO(content)) as pdf:
+                parts = [p.extract_text() or "" for p in pdf.pages]
+            return "\n".join(parts)
+    except Exception:
+        pass
+    if ext not in PHOTO_EXTS | VIDEO_EXTS | AUDIO_EXTS:
+        try:
+            return content.decode("utf-8", errors="replace")
+        except Exception:
+            pass
+    return ""
+
+
+def _find_emails_phones(text: str) -> dict:
+    """Extract unique email addresses and Danish phone numbers from text.
+
+    Returns {"emails": [{"formatted": str}, ...], "phones": [{"formatted": str}, ...]}.
+    Phones are normalised to digit-only strings (preserving a leading '+').
+    """
+    if not text:
+        return {"emails": [], "phones": []}
+    emails = list(dict.fromkeys(m.group(0).lower() for m in _EMAIL_RE.finditer(text)))
+    phones = list(dict.fromkeys(
+        ('+' + re.sub(r'[\s\-]', '', m.group(0)[1:]) if m.group(0).lstrip().startswith('+')
+         else re.sub(r'[\s\-]', '', m.group(0)))
+        for m in _PHONE_RE.finditer(text)
+    ))
+    return {
+        "emails": [{"formatted": e} for e in emails],
+        "phones": [{"formatted": p} for p in phones],
+    }
+
+
 def _scan_bytes(content: bytes, filename: str, poppler_path=None) -> dict:
-    """Scan raw bytes for CPRs. Returns scanner result dict."""
+    """Scan raw bytes for CPRs, emails, and phone numbers. Returns result dict."""
     if not SCANNER_OK:
-        return {"cprs": [], "dates": [], "error": "scanner not available"}
+        return {"cprs": [], "dates": [], "emails": [], "phones": [], "error": "scanner not available"}
     ext = Path(filename).suffix.lower()
     with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
         tmp.write(content)
         tmp_path = Path(tmp.name)
+    result: dict = {"cprs": [], "dates": []}
     try:
         if ext == ".pdf":
             # Check if the PDF has a text layer before running full scan_pdf.
             # Image-only PDFs (scanned documents) have no text and would trigger
             # Tesseract OCR subprocesses that hang indefinitely on some files.
             try:
-                import pdfplumber as _pp, io as _io
-                with _pp.open(_io.BytesIO(content)) as _pdf:
+                import pdfplumber as _pp
+                with _pp.open(io.BytesIO(content)) as _pdf:
                     has_text = any(ds.is_text_page(p) for p in _pdf.pages)
                 if not has_text:
-                    return {"cprs": [], "dates": []}  # image-only PDF — no CPRs possible
+                    return {"cprs": [], "dates": [], "emails": [], "phones": []}
             except Exception:
                 pass  # if pdfplumber fails, fall through to full scan_pdf
-            return ds.scan_pdf(tmp_path, poppler_path=poppler_path)
+            result = ds.scan_pdf(tmp_path, poppler_path=poppler_path)
         elif ext in {".docx", ".doc"}:
-            return ds.scan_docx(tmp_path)
+            result = ds.scan_docx(tmp_path)
         elif ext in {".xlsx", ".xlsm"}:
-            return ds.scan_xlsx(tmp_path)
+            result = ds.scan_xlsx(tmp_path)
         elif ext == ".csv":
-            return ds.scan_csv(tmp_path)
+            result = ds.scan_csv(tmp_path)
         elif ext == ".txt":
             text = content.decode("utf-8", errors="replace")
             cprs, dates = ds.extract_matches(text, 1, "text")
-            return {"cprs": cprs, "dates": dates}
+            result = {"cprs": cprs, "dates": dates}
         elif ext in {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp"}:
-            return ds.scan_image(tmp_path)
+            result = ds.scan_image(tmp_path)
         else:
-            # Try plain text
             try:
                 text = content.decode("utf-8", errors="replace")
                 cprs, dates = ds.extract_matches(text, 1, "text")
-                return {"cprs": cprs, "dates": dates}
+                result = {"cprs": cprs, "dates": dates}
             except Exception:
-                return {"cprs": [], "dates": []}
+                pass
     except Exception as e:
-        return {"cprs": [], "dates": [], "error": str(e)}
+        result = {"cprs": [], "dates": [], "error": str(e)}
     finally:
         try:
             tmp_path.unlink()
         except Exception:
             pass
+    ep = _find_emails_phones(_extract_text_from_bytes(content, filename))
+    result["emails"] = ep["emails"]
+    result["phones"] = ep["phones"]
+    return result
 
 def _worker_scan_pdf(pdf_path_str: str, result_q) -> None:
     """Worker executed in a spawned subprocess — must be a module-level function."""
@@ -607,19 +692,22 @@ def _scan_bytes_timeout(content: bytes, filename: str, timeout: int = 60) -> dic
 
 
 def _scan_text_direct(text: str) -> dict:
-    """Scan a plain text string for CPRs using extract_matches.
-    
+    """Scan a plain text string for CPRs, emails, and phone numbers.
+
     Uses ds.extract_matches() directly rather than ds.scan_text() because
     scan_text() calls extract_cpr_and_dates() which is not defined in
     document_scanner.py (pre-existing bug).
     """
-    if not SCANNER_OK or not text:
-        return {"cprs": [], "dates": []}
+    if not text:
+        return {"cprs": [], "dates": [], "emails": [], "phones": []}
+    ep = _find_emails_phones(text)
+    if not SCANNER_OK:
+        return {"cprs": [], "dates": [], **ep}
     try:
         cprs, dates = ds.extract_matches(text, 1, "text")
-        return {"cprs": cprs, "dates": dates}
+        return {"cprs": cprs, "dates": dates, **ep}
     except Exception:
-        return {"cprs": [], "dates": []}
+        return {"cprs": [], "dates": [], **ep}
 
 def _html_esc(s: str) -> str:
     """HTML-escape a string for safe inline embedding."""
diff --git a/gdpr_db.py b/gdpr_db.py
index aa4ce59..dd67545 100644
--- a/gdpr_db.py
+++ b/gdpr_db.py
@@ -200,6 +200,8 @@ _MIGRATIONS: list[tuple[int, str]] = [
     (4, "ALTER TABLE flagged_items ADD COLUMN face_count INTEGER NOT NULL DEFAULT 0"),
     (5, "ALTER TABLE flagged_items ADD COLUMN exif_json TEXT NOT NULL DEFAULT '{}'"),
     (6, "ALTER TABLE flagged_items ADD COLUMN full_path TEXT NOT NULL DEFAULT ''"),
+    (8, "ALTER TABLE flagged_items ADD COLUMN email_count INTEGER NOT NULL DEFAULT 0"),
+    (9, "ALTER TABLE flagged_items ADD COLUMN phone_count INTEGER NOT NULL DEFAULT 0"),
     (7, """CREATE TABLE IF NOT EXISTS schedule_runs (
         id          INTEGER PRIMARY KEY AUTOINCREMENT,
         started_at  REAL    NOT NULL,
@@ -311,8 +313,9 @@ class ScanDB:
                (id, scan_id, name, source, source_type, account_id, folder,
                 url, drive_id, size_kb, modified, cpr_count, risk,
                 thumb_b64, thumb_mime, attachments, user_role, transfer_risk,
-                special_category, face_count, exif_json, full_path, scanned_at)
-               VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""",
+                special_category, face_count, exif_json, full_path,
+                email_count, phone_count, scanned_at)
+               VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""",
             (
                 card.get("id", ""),
                 scan_id,
@@ -336,6 +339,8 @@ class ScanDB:
                 card.get("face_count", 0),
                 json.dumps(card.get("exif", {})),
                 card.get("full_path", ""),
+                card.get("email_count", 0),
+                card.get("phone_count", 0),
                 now,
             ),
         )
diff --git a/lang/da.json b/lang/da.json
index 26d0ad3..a09e30b 100644
--- a/lang/da.json
+++ b/lang/da.json
@@ -570,6 +570,12 @@
   "m365_opt_skip_gps": "Ignorer GPS i billeder",
   "m365_opt_skip_gps_hint": "Billeder med GPS-koordinater flagges ikke — nyttigt ved elevscanninger, hvor smartphones indlejrer placering i alle fotos.",
   "m365_opt_min_cpr": "Min. CPR-antal pr. fil",
+  "m365_opt_scan_emails": "Søg efter e-mailadresser",
+  "m365_opt_scan_emails_hint": "Flagger filer med e-mailadresser. Slået fra som standard — e-mailadresser er meget almindelige og kan give mange resultater.",
+  "m365_opt_scan_phones": "Søg efter telefonnumre",
+  "m365_opt_scan_phones_hint": "Flagger filer med danske telefonnumre (8 cifre). Nyttigt til at finde kontaktlister og forældrekorrespondance.",
+  "m365_badge_emails": "e-mail",
+  "m365_badge_phones": "tlf.",
   "m365_opt_min_cpr_hint": "Filer med færre distinkte CPR-numre end denne tærskel rapporteres ikke. Sæt til 2 for at undgå falske positive, når elever har egne CPR-numre i filer.",
   "m365_filter_photo_only": "📷 Billeder / biometrisk",
   "m365_filter_all_roles": "Alle roller",
diff --git a/lang/de.json b/lang/de.json
index f647239..3e15813 100644
--- a/lang/de.json
+++ b/lang/de.json
@@ -570,6 +570,12 @@
   "m365_opt_skip_gps": "GPS in Bildern ignorieren",
   "m365_opt_skip_gps_hint": "Bilder mit GPS-Koordinaten werden nicht markiert — nützlich beim Scannen von Schüler-Konten, deren Smartphones Standort in jedes Foto einbetten.",
   "m365_opt_min_cpr": "Min. CPR-Anzahl pro Datei",
+  "m365_opt_scan_emails": "E-Mail-Adressen scannen",
+  "m365_opt_scan_emails_hint": "Markiert Dateien mit E-Mail-Adressen. Standardmäßig deaktiviert — E-Mail-Adressen sind sehr häufig und können viele Treffer erzeugen.",
+  "m365_opt_scan_phones": "Telefonnummern scannen",
+  "m365_opt_scan_phones_hint": "Markiert Dateien mit dänischen Telefonnummern (8 Ziffern). Nützlich zum Auffinden von Kontaktlisten und Elternkorrespondenz.",
+  "m365_badge_emails": "E-Mail",
+  "m365_badge_phones": "Tel.",
   "m365_opt_min_cpr_hint": "Dateien mit weniger eindeutigen CPR-Nummern als dieser Schwellenwert werden nicht gemeldet. Auf 2 setzen, um Falsch-Positive zu vermeiden, wenn Schüler eigene CPR-Nummern in Dateien haben.",
   "m365_filter_photo_only": "📷 Fotos / biometrisch",
   "m365_filter_all_roles": "Alle Rollen",
diff --git a/lang/en.json b/lang/en.json
index 98e6c04..c12ae78 100644
--- a/lang/en.json
+++ b/lang/en.json
@@ -570,6 +570,12 @@
   "m365_opt_skip_gps": "Ignore GPS in images",
   "m365_opt_skip_gps_hint": "Images with GPS coordinates are not flagged — useful when scanning students whose smartphones embed location in every photo.",
   "m365_opt_min_cpr": "Min. CPR count per file",
+  "m365_opt_scan_emails": "Scan for email addresses",
+  "m365_opt_scan_emails_hint": "Flags files that contain email addresses. Off by default — email addresses are very common and may produce many results.",
+  "m365_opt_scan_phones": "Scan for phone numbers",
+  "m365_opt_scan_phones_hint": "Flags files containing Danish phone numbers (8 digits). Useful for finding contact lists and parent correspondence.",
+  "m365_badge_emails": "email",
+  "m365_badge_phones": "phone",
   "m365_opt_min_cpr_hint": "Files with fewer distinct CPR numbers than this threshold are not reported. Set to 2 to avoid false positives when students have their own CPR in documents.",
   "m365_filter_photo_only": "📷 Photos / biometric",
   "m365_filter_all_roles": "All roles",
diff --git a/routes/google_scan.py b/routes/google_scan.py
index 0577085..80da589 100644
--- a/routes/google_scan.py
+++ b/routes/google_scan.py
@@ -141,6 +141,8 @@ def _run_google_scan(options: dict):
     scan_body     = bool(scan_opts.get("scan_body",        True))
     scan_att      = bool(scan_opts.get("scan_attachments", True))
     delta_enabled = bool(scan_opts.get("delta", False))
+    scan_emails   = bool(scan_opts.get("scan_emails",  False))
+    scan_phones   = bool(scan_opts.get("scan_phones",  False))
 
     from checkpoint import _load_delta_tokens, _save_delta_tokens
     _drive_delta_tokens: dict = _load_delta_tokens() if delta_enabled else {}
@@ -212,6 +214,8 @@ def _run_google_scan(options: dict):
             "source":       item_meta.get("_source", ""),
             "source_type":  item_meta.get("_source_type", ""),
             "cpr_count":    len(cprs),
+            "email_count":  item_meta.get("_email_count", 0),
+            "phone_count":  item_meta.get("_phone_count", 0),
             "url":          item_meta.get("_url", ""),
             "size_kb":      round(item_meta.get("size", 0) / 1024, 1),
             "modified":     (item_meta.get("lastModifiedDateTime") or item_meta.get("receivedDateTime") or "")[:10],
@@ -276,9 +280,13 @@ def _run_google_scan(options: dict):
                     except Exception as e:
                         broadcast("scan_error", {"file": meta.get("name", ""), "error": str(e)})
                         continue
-                    cprs      = result.get("cprs", [])
+                    cprs       = result.get("cprs", [])
                     pii_counts = result.get("pii_counts")
-                    if cprs or (pii_counts and any(pii_counts.values())):
+                    _em = list(dict.fromkeys(e["formatted"] for e in result.get("emails", []))) if scan_emails else []
+                    _ph = list(dict.fromkeys(p["formatted"] for p in result.get("phones", []))) if scan_phones else []
+                    if cprs or (pii_counts and any(pii_counts.values())) or _em or _ph:
+                        meta["_email_count"] = len(_em)
+                        meta["_phone_count"] = len(_ph)
                         _broadcast_card(meta, cprs, pii_counts)
             except GoogleError as e:
                 broadcast("scan_error", {"file": f"Gmail/{user_email}", "error": str(e)})
@@ -336,7 +344,11 @@ def _run_google_scan(options: dict):
                         continue
                     cprs       = result.get("cprs", [])
                     pii_counts = result.get("pii_counts")
-                    if cprs or (pii_counts and any(pii_counts.values())):
+                    _em = list(dict.fromkeys(e["formatted"] for e in result.get("emails", []))) if scan_emails else []
+                    _ph = list(dict.fromkeys(p["formatted"] for p in result.get("phones", []))) if scan_phones else []
+                    if cprs or (pii_counts and any(pii_counts.values())) or _em or _ph:
+                        meta["_email_count"] = len(_em)
+                        meta["_phone_count"] = len(_ph)
                         _broadcast_card(meta, cprs, pii_counts)
             except GoogleError as e:
                 broadcast("scan_error", {"file": f"Drive/{user_email}", "error": str(e)})
diff --git a/scan_engine.py b/scan_engine.py
index ee9c786..080b61f 100644
--- a/scan_engine.py
+++ b/scan_engine.py
@@ -182,6 +182,8 @@ def run_file_scan(source: dict):
     scan_photos     = bool(source.get("scan_photos", False))
     skip_gps_images = bool(source.get("skip_gps_images", False))
     min_cpr_count   = max(1, int(source.get("min_cpr_count", 1)))
+    scan_emails     = bool(source.get("scan_emails",  False))
+    scan_phones     = bool(source.get("scan_phones",  False))
     max_mb          = int(source.get("max_file_mb", 50))
 
     if source_kind == "sftp":
@@ -268,7 +270,9 @@ def run_file_scan(source: dict):
                     broadcast("scan_error", {"file": rel_path, "error": str(e)})
                     continue
 
-            cprs = result.get("cprs", [])
+            cprs   = result.get("cprs", [])
+            emails = result.get("emails", []) if scan_emails else []
+            phones = result.get("phones", []) if scan_phones else []
 
             # Photo / biometric scan + EXIF/video/audio metadata extraction
             _face_count = 0
@@ -283,13 +287,15 @@ def run_file_scan(source: dict):
                 _exif = _extract_audio_metadata(content, rel_path)
 
             # Apply filters: distinct CPR threshold and GPS suppression
-            _distinct_cprs = list(dict.fromkeys(c["formatted"] for c in cprs))
-            _cpr_qualifies = len(_distinct_cprs) >= min_cpr_count
-            _exif_has_pii  = _exif.get("has_pii") and (
+            _distinct_cprs   = list(dict.fromkeys(c["formatted"] for c in cprs))
+            _cpr_qualifies   = len(_distinct_cprs) >= min_cpr_count
+            _distinct_emails = list(dict.fromkeys(e["formatted"] for e in emails))
+            _distinct_phones = list(dict.fromkeys(p["formatted"] for p in phones))
+            _exif_has_pii    = _exif.get("has_pii") and (
                 not skip_gps_images or bool(_exif.get("pii_fields") or _exif.get("author"))
             )
 
-            if not (_cpr_qualifies and cprs) and _face_count == 0 and not _exif_has_pii:
+            if not (_cpr_qualifies and cprs) and not _distinct_emails and not _distinct_phones and _face_count == 0 and not _exif_has_pii:
                 continue
 
             # Build card metadata
@@ -325,6 +331,8 @@ def run_file_scan(source: dict):
                 "source":       label,
                 "source_type":  source_type,
                 "cpr_count":    len(cprs),
+                "email_count":  len(_distinct_emails),
+                "phone_count":  len(_distinct_phones),
                 "url":          "",
                 "size_kb":      meta["size_kb"],
                 "modified":     meta["modified"],
@@ -437,6 +445,8 @@ def run_scan(options: dict):
     scan_photos    = bool(scan_opts.get("scan_photos", False))  # biometric photo scan (#9)
     skip_gps_images= bool(scan_opts.get("skip_gps_images", False))
     min_cpr_count  = max(1, int(scan_opts.get("min_cpr_count", 1)))
+    scan_emails    = bool(scan_opts.get("scan_emails",  False))
+    scan_phones    = bool(scan_opts.get("scan_phones",  False))
 
     # Delta token state — loaded once, updated per-source, saved on completion
     delta_tokens:     dict = _load_delta_tokens() if delta_enabled else {}
@@ -490,6 +500,8 @@ def run_scan(options: dict):
             "source":       item_meta.get("_source", ""),
             "source_type":  item_meta.get("_source_type", ""),
             "cpr_count":    len(cprs),
+            "email_count":  item_meta.get("_email_count", 0),
+            "phone_count":  item_meta.get("_phone_count", 0),
             "url":          item_meta.get("webUrl", "") or item_meta.get("_url", ""),
             "size_kb":      round(item_meta.get("size", 0) / 1024, 1),
             "modified":     (item_meta.get("lastModifiedDateTime") or item_meta.get("receivedDateTime") or "")[:10],
@@ -1056,12 +1068,18 @@ def run_scan(options: dict):
 
                 # Scan body — use pre-extracted text (body HTML was stripped at
                 # collection time to keep work_items memory footprint small)
-                all_cprs = []
-                body_text = ""
+                all_cprs   = []
+                all_emails = []
+                all_phones = []
+                body_text  = ""
                 if scan_email_body:
-                    body_text = meta.pop("_precomputed_body", "")
+                    body_text   = meta.pop("_precomputed_body", "")
                     body_result = _scan_text_direct(body_text)
-                    all_cprs = list(body_result.get("cprs", []))
+                    all_cprs    = list(body_result.get("cprs", []))
+                    if scan_emails:
+                        all_emails = list(body_result.get("emails", []))
+                    if scan_phones:
+                        all_phones = list(body_result.get("phones", []))
 
                 # <span data-i18n="m365_opt_attachments" data-i18n="m365_opt_attachments">Scan attachments</span>
                 uid = meta.get("_account_id", "me")
@@ -1084,14 +1102,22 @@ def run_scan(options: dict):
                             att_result = _scan_bytes(att_bytes, att_name)
                             att_cprs   = att_result.get("cprs", [])
                             all_cprs.extend(att_cprs)
+                            if scan_emails:
+                                all_emails.extend(att_result.get("emails", []))
+                            if scan_phones:
+                                all_phones.extend(att_result.get("phones", []))
                             att_results.append({"name": att_name, "cpr_count": len(att_cprs)})
                         except Exception as att_err:
                             broadcast("scan_error", {"file": att_name, "error": str(att_err)})
 
-                if all_cprs:
+                _distinct_emails = list(dict.fromkeys(e["formatted"] for e in all_emails))
+                _distinct_phones = list(dict.fromkeys(p["formatted"] for p in all_phones))
+                if all_cprs or _distinct_emails or _distinct_phones:
                     meta["_thumb"]         = _placeholder_svg(".eml", subject)
                     meta["_thumb_is_jpeg"] = False
                     meta["_attachments"]   = att_results
+                    meta["_email_count"]   = len(_distinct_emails)
+                    meta["_phone_count"]   = len(_distinct_phones)
                     _email_pii = _get_pii_counts(body_text) if scan_email_body else {}
                     meta["_transfer_risk"]    = _check_transfer_risk(meta)
                     meta["_special_category"] = _check_special_category(
@@ -1121,10 +1147,12 @@ def run_scan(options: dict):
                 else:
                     content = conn.download_item(meta)
 
-                # CPR scan — skip for video and audio (metadata-only; no text layer)
+                # CPR/email/phone scan — skip for video and audio (metadata-only; no text layer)
                 _media_only = ext in VIDEO_EXTS or ext in AUDIO_EXTS
-                result = {"cprs": [], "dates": []} if _media_only else _scan_bytes(content, name)
+                result = {"cprs": [], "dates": [], "emails": [], "phones": []} if _media_only else _scan_bytes(content, name)
                 cprs   = result.get("cprs", [])
+                emails = result.get("emails", []) if scan_emails else []
+                phones = result.get("phones", []) if scan_phones else []
 
                 # ── Biometric photo scan (#9) + EXIF/video/audio metadata (#18) ─
                 _face_count = 0
@@ -1141,12 +1169,14 @@ def run_scan(options: dict):
                 # Apply filters: distinct CPR threshold and GPS suppression
                 _distinct_cprs   = list(dict.fromkeys(c["formatted"] for c in cprs))
                 _cpr_qualifies   = len(_distinct_cprs) >= min_cpr_count
+                _distinct_emails = list(dict.fromkeys(e["formatted"] for e in emails))
+                _distinct_phones = list(dict.fromkeys(p["formatted"] for p in phones))
                 _exif_has_pii    = _exif.get("has_pii") and (
                     not skip_gps_images or bool(_exif.get("pii_fields") or _exif.get("author"))
                 )
 
-                # Flag item if CPRs found (above threshold), faces detected, or EXIF PII found
-                if (_cpr_qualifies and cprs) or _face_count > 0 or _exif_has_pii:
+                # Flag item if CPRs/emails/phones found, faces detected, or EXIF PII found
+                if (_cpr_qualifies and cprs) or _distinct_emails or _distinct_phones or _face_count > 0 or _exif_has_pii:
                     # Make thumbnail
                     if ext in {".jpg", ".jpeg", ".png"} and PIL_OK:
                         thumb = _make_thumb(content, name)
@@ -1182,6 +1212,8 @@ def run_scan(options: dict):
                     meta["_special_category"] = _sc
                     meta["_face_count"]        = _face_count
                     meta["_exif"]              = _exif
+                    meta["_email_count"]       = len(_distinct_emails)
+                    meta["_phone_count"]       = len(_distinct_phones)
                     _broadcast_card(meta, cprs, pii_counts=_file_pii)
                 else:
                     del content  # no hits — free raw bytes immediately
diff --git a/static/js/profiles.js b/static/js/profiles.js
index 873311a..9db357c 100644
--- a/static/js/profiles.js
+++ b/static/js/profiles.js
@@ -137,6 +137,16 @@ function _applyProfile(profile) {
     if (el) el.value = opts.min_cpr_count;
   }
 
+  if (opts.scan_emails !== undefined) {
+    const el = document.getElementById('optScanEmails');
+    if (el) el.checked = opts.scan_emails;
+  }
+
+  if (opts.scan_phones !== undefined) {
+    const el = document.getElementById('optScanPhones');
+    if (el) el.checked = opts.scan_phones;
+  }
+
   // ── Date filter ───────────────────────────────────────────────────────────
   const days = opts.older_than_days;
   if (days !== undefined) {
@@ -417,6 +427,8 @@ function _openEditorForProfile(profile) {
           <div class="pmgmt-opt-row"><span>${t('m365_opt_scan_photos','Søg efter ansigter i billeder')}</span><label class="toggle"><input type="checkbox" id="peOptPhotos" ${opts.scan_photos ? 'checked' : ''}><span class="toggle-slider"></span></label></div>
           <div class="pmgmt-opt-row"><span>${t('m365_opt_skip_gps','Ignorer GPS i billeder')}</span><label class="toggle"><input type="checkbox" id="peOptSkipGps" ${opts.skip_gps_images ? 'checked' : ''}><span class="toggle-slider"></span></label></div>
           <div class="pmgmt-opt-row"><span style="color:var(--muted)">${t('m365_opt_min_cpr','Min. CPR-antal pr. fil')}</span><input type="number" id="peOptMinCpr" value="${opts.min_cpr_count || 1}" min="1" max="50" style="width:46px;padding:3px 6px;font-size:11px;text-align:right"></div>
+          <div class="pmgmt-opt-row"><span>${t('m365_opt_scan_emails','Søg efter e-mailadresser')}</span><label class="toggle"><input type="checkbox" id="peOptEmails" ${opts.scan_emails ? 'checked' : ''}><span class="toggle-slider"></span></label></div>
+          <div class="pmgmt-opt-row"><span>${t('m365_opt_scan_phones','Søg efter telefonnumre')}</span><label class="toggle"><input type="checkbox" id="peOptPhones" ${opts.scan_phones ? 'checked' : ''}><span class="toggle-slider"></span></label></div>
           <hr style="border:none;border-top:1px solid var(--pmgmt-divider);margin:2px 0">
           <div class="pmgmt-opt-row"><span>${t('m365_opt_retention','Opbevaringspolitik')}</span><label class="toggle"><input type="checkbox" id="peOptRetention" ${profile.retention_years ? 'checked' : ''}><span class="toggle-slider"></span></label></div>
           <div style="padding:7px 8px;background:var(--bg);border-radius:6px">
@@ -633,6 +645,8 @@ async function _pmgmtSaveFullEdit() {
       scan_photos:     document.getElementById('peOptPhotos')?.checked ?? false,
       skip_gps_images: document.getElementById('peOptSkipGps')?.checked ?? false,
       min_cpr_count:   parseInt(document.getElementById('peOptMinCpr')?.value) || 1,
+      scan_emails:     document.getElementById('peOptEmails')?.checked ?? false,
+      scan_phones:     document.getElementById('peOptPhones')?.checked ?? false,
     },
     retention_years:  document.getElementById('peOptRetention')?.checked ? (parseInt(document.getElementById('peOptRetYears')?.value) || 5) : null,
     fiscal_year_end:  document.getElementById('peOptRetention')?.checked ? (document.getElementById('peOptFiscalYearEnd')?.value || '') : '',
diff --git a/static/js/results.js b/static/js/results.js
index 11e9dc9..eadb520 100644
--- a/static/js/results.js
+++ b/static/js/results.js
@@ -46,6 +46,8 @@ function appendCard(f) {
         <div class="card-source"><span class="source-badge ${badgeCls}">${label}</span> ${f.source || ''}${f.account_name ? ' · <span class="account-pill" title="' + f.account_name + '">' + (f.user_role === 'student' ? '<span class="role-badge">' + t('role_student','Elev') + '</span>' : f.user_role === 'staff' ? '<span class="role-badge">' + t('role_staff','Ansat') + '</span>' : '') + f.account_name + '</span>' : ''}${f.transfer_risk === 'external-recipient' ? ' <span class="role-pill" style="background:#7B2D00;color:#FFD0B0">⚠ Ext.</span>' : f.transfer_risk ? ' <span class="role-pill" style="background:#003D7B;color:#B0D4FF">🔗</span>' : ''}</div>
       </div>
       <span class="cpr-badge">${f.cpr_count} CPR</span>
+      ${f.email_count > 0 ? '<span class="email-badge">' + f.email_count + ' ' + t('m365_badge_emails', 'e-mail') + '</span> ' : ''}
+      ${f.phone_count > 0 ? '<span class="phone-badge">' + f.phone_count + ' ' + t('m365_badge_phones', 'tlf.') + '</span> ' : ''}
       ${f.face_count > 0 ? '<span class="photo-face-badge">' + f.face_count + ' ' + t('m365_badge_faces', f.face_count === 1 ? 'face' : 'faces') + '</span> ' : ''}
       ${f.exif && f.exif.gps ? '<span class="photo-face-badge" style="background:#0a3a5a;color:#7ec8d0">🌍 GPS</span> ' : ''}
       ${f.special_category && f.special_category.length ? '<span class="special-cat-badge">⚠ Art.9 — ' + f.special_category.filter(function(s){return s !== 'gps_location' && s !== 'exif_pii';}).join(', ') + '</span> ' : ''}${f.overdue ? '<span class="overdue-badge">🗓 Overdue</span>' : ''}
@@ -58,7 +60,7 @@ function appendCard(f) {
         <div class="card-meta">${f.size_kb} KB · ${f.modified || ''}</div>
         ${f.folder ? `<div class="card-meta" style="font-size:10px" title="${f.folder}">📂 ${f.folder}</div>` : ''}
         <div class="card-source"><span class="source-badge ${badgeCls}">${label}</span>${f.account_name ? ' <span class="account-pill" title="' + f.account_name + '">' + (f.user_role === "student" ? '<span class="role-badge">' + t("role_student","Elev") + "</span>" : f.user_role === "staff" ? '<span class="role-badge">' + t("role_staff","Ansat") + "</span>" : "") + f.account_name + '</span>' : ''}${f.transfer_risk === "external-recipient" ? ' <span class="role-pill" style="background:#7B2D00;color:#FFD0B0">⚠ Ext.</span>' : f.transfer_risk ? ' <span class="role-pill" style="background:#003D7B;color:#B0D4FF">🔗</span>' : ''}</div>
-        <span class="cpr-badge">${f.cpr_count} CPR</span>${f.face_count > 0 ? ' <span class="photo-face-badge">' + f.face_count + ' ' + t('m365_badge_faces', f.face_count === 1 ? 'face' : 'faces') + '</span>' : ''}${f.exif && f.exif.gps ? ' <span class="photo-face-badge" style="background:#0a3a5a;color:#7ec8d0">🌍 GPS</span>' : ''}${f.overdue ? ' <span class="overdue-badge">🗓 Overdue</span>' : ''}
+        <span class="cpr-badge">${f.cpr_count} CPR</span>${f.email_count > 0 ? ' <span class="email-badge">' + f.email_count + ' ' + t('m365_badge_emails', 'e-mail') + '</span>' : ''}${f.phone_count > 0 ? ' <span class="phone-badge">' + f.phone_count + ' ' + t('m365_badge_phones', 'tlf.') + '</span>' : ''}${f.face_count > 0 ? ' <span class="photo-face-badge">' + f.face_count + ' ' + t('m365_badge_faces', f.face_count === 1 ? 'face' : 'faces') + '</span>' : ''}${f.exif && f.exif.gps ? ' <span class="photo-face-badge" style="background:#0a3a5a;color:#7ec8d0">🌍 GPS</span>' : ''}${f.overdue ? ' <span class="overdue-badge">🗓 Overdue</span>' : ''}
       </div>
       ${delBtn}`;
   }
@@ -101,7 +103,9 @@ async function openPreview(f) {
     f.source   ? `<span>${f.source}</span>` : '',
     f.size_kb  ? `<span>${f.size_kb} KB</span>` : '',
     f.modified ? `<span>${f.modified}</span>` : '',
-    f.cpr_count ? `<span style="color:var(--danger)">${f.cpr_count} CPR</span>` : '',
+    f.cpr_count   ? `<span style="color:var(--danger)">${f.cpr_count} CPR</span>` : '',
+    f.email_count ? `<span style="color:#7ec8f0">${f.email_count} ${t('m365_badge_emails','e-mail')}</span>` : '',
+    f.phone_count ? `<span style="color:#7eeac0">${f.phone_count} ${t('m365_badge_phones','tlf.')}</span>` : '',
     f.url ? `<button class="preview-open-btn" onclick="window.open('${f.url}','_blank')">${t("m365_preview_open","Open in M365 ↗")}</button>` : '',
   ].filter(Boolean).join('');
 
diff --git a/static/js/scan.js b/static/js/scan.js
index deb0411..092ada3 100644
--- a/static/js/scan.js
+++ b/static/js/scan.js
@@ -127,6 +127,8 @@ function buildScanPayload() {
     scan_photos:      document.getElementById('optScanPhotos') ? document.getElementById('optScanPhotos').checked : false,
     skip_gps_images:  document.getElementById('optSkipGps') ? document.getElementById('optSkipGps').checked : false,
     min_cpr_count:    document.getElementById('optMinCpr') ? (parseInt(document.getElementById('optMinCpr').value) || 1) : 1,
+    scan_emails:      document.getElementById('optScanEmails') ? document.getElementById('optScanEmails').checked : false,
+    scan_phones:      document.getElementById('optScanPhones') ? document.getElementById('optScanPhones').checked : false,
     retention_enabled: document.getElementById('optRetention') ? document.getElementById('optRetention').checked : false,
     retention_years:  parseInt(document.getElementById('optRetentionYears')?.value) || 5,
     fiscal_year_end:  document.getElementById('optFiscalYearEnd')?.value || '',
@@ -588,6 +590,8 @@ function startScan(resume) {
           scan_photos:      options.scan_photos     || false,
           skip_gps_images:  options.skip_gps_images || false,
           min_cpr_count:    options.min_cpr_count   || 1,
+          scan_emails:      options.scan_emails      || false,
+          scan_phones:      options.scan_phones      || false,
         }))
       }).catch(e => { log('File scan error: ' + e, 'err'); });
     });
diff --git a/static/style.css b/static/style.css
index 9db62fd..17578b3 100644
--- a/static/style.css
+++ b/static/style.css
@@ -491,6 +491,12 @@
   .overdue-badge { font-size: 9px; padding: 1px 5px; border-radius: 10px;
     background: #7c3200; color: #ffb347; font-weight: 600; white-space: nowrap; }
   [data-theme="light"] .overdue-badge { background: #fff3e0; color: #c55a00; }
+  .email-badge { font-size: 9px; padding: 1px 5px; border-radius: 10px;
+    background: #1a3a5c; color: #7ec8f0; font-weight: 500; white-space: nowrap; }
+  [data-theme="light"] .email-badge { background: #d0eaff; color: #004a80; }
+  .phone-badge { font-size: 9px; padding: 1px 5px; border-radius: 10px;
+    background: #1a4030; color: #7eeac0; font-weight: 500; white-space: nowrap; }
+  [data-theme="light"] .phone-badge { background: #d0f5ea; color: #005a3a; }
   .badge-email { background: rgba(139,68,173,.2); color: #b87fd8; }
   .badge-onedrive { background: rgba(0,120,212,.2); color: #5ba4e8; }
   .badge-sharepoint { background: rgba(0,160,100,.2); color: #2ecc71; }
diff --git a/templates/index.html b/templates/index.html
index 444be16..87fa578 100644
--- a/templates/index.html
+++ b/templates/index.html
@@ -137,6 +137,22 @@ document.addEventListener('DOMContentLoaded', applyI18n);
                  style="width:46px;padding:3px 6px;font-size:11px;text-align:right">
         </div>
 
+        <!-- Scan for email addresses -->
+        <div class="toggle-row">
+          <span class="toggle-label" style="flex:1">
+            <span data-i18n="m365_opt_scan_emails">Scan for email addresses</span><span class="hint-wrap"><span class="hint-icon" onclick="toggleHint(this)">?</span><span class="hint-bubble" data-i18n="m365_opt_scan_emails_hint">Flags files that contain email addresses. Off by default — email addresses are very common and may produce many results.</span></span>
+          </span>
+          <label class="toggle"><input type="checkbox" id="optScanEmails"><span class="toggle-slider"></span></label>
+        </div>
+
+        <!-- Scan for phone numbers -->
+        <div class="toggle-row">
+          <span class="toggle-label" style="flex:1">
+            <span data-i18n="m365_opt_scan_phones">Scan for phone numbers</span><span class="hint-wrap"><span class="hint-icon" onclick="toggleHint(this)">?</span><span class="hint-bubble" data-i18n="m365_opt_scan_phones_hint">Flags files containing Danish phone numbers (8 digits). Useful for finding contact lists and parent correspondence.</span></span>
+          </span>
+          <label class="toggle"><input type="checkbox" id="optScanPhones"><span class="toggle-slider"></span></label>
+        </div>
+
         <!-- Retention policy (suggestion #1) -->
         <div class="toggle-row">
           <span class="toggle-label" style="flex:1">