From 2254e00481d16774d50e348c18dc241bd020915a Mon Sep 17 00:00:00 2001 From: StyxX65 <150797939+StyxX65@users.noreply.github.com> Date: Sat, 25 Apr 2026 19:33:28 +0200 Subject: [PATCH] =?UTF-8?q?recap:=20Added=20email=20and=20phone=20number?= =?UTF-8?q?=20detection=20as=20opt-in=20scan=20options=20across=20all=20th?= =?UTF-8?q?ree=20engines,=20plus=20translation=20=20=20=20=20=20=20=20fixe?= =?UTF-8?q?s.=20Both=20CHANGELOG=20and=20SUGGESTIONS=20are=20updated=20?= =?UTF-8?q?=E2=80=94=20everything=20is=20committed=20and=20ready=20to=20te?= =?UTF-8?q?st.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 2 + SUGGESTIONS.md | 11 ++++ cpr_detector.py | 130 +++++++++++++++++++++++++++++++++++------- gdpr_db.py | 9 ++- lang/da.json | 6 ++ lang/de.json | 6 ++ lang/en.json | 6 ++ routes/google_scan.py | 18 +++++- scan_engine.py | 60 ++++++++++++++----- static/js/profiles.js | 14 +++++ static/js/results.js | 8 ++- static/js/scan.js | 4 ++ static/style.css | 6 ++ templates/index.html | 16 ++++++ 14 files changed, 254 insertions(+), 42 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 864696e..12b0a89 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ Version numbers follow [Semantic Versioning](https://semver.org/spec/v2.0.0.html ### Added +- **Email address and Danish phone number detection** — all three scan engines (M365, Google Workspace, local/SMB/SFTP) can now flag files and messages containing email addresses or Danish phone numbers in addition to CPR numbers. Detection is opt-in per profile: two new toggle options **Scan for email addresses** and **Scan for phone numbers** (default off) appear in the scan options panel and profile editor. When enabled, matches are stored as `email_count` / `phone_count` on each DB row and surfaced as colour-coded badges in list view, grid view, and the preview panel. 
Email regex requires a structurally valid address (`local@domain.tld`); phone regex covers 8-digit Danish numbers with optional `+45`/`0045` prefix and common spacing patterns. Both are deduplicated before counting. Requires DB migration (adds two INTEGER columns to `flagged_items`; applied automatically on first startup via `_MIGRATIONS`). + - **SFTP as a 4th file connector** — SFTP servers can now be added as file sources alongside local folders, SMB shares, and cloud sources. A new `SFTPScanner` class in `sftp_connector.py` implements the same `iter_files()` interface as `FileScanner`, so `run_file_scan()`, SSE broadcasting, DB persistence, card building, scheduled scans, and exports work without changes. Supports password auth and SSH private key auth (RSA, Ed25519, ECDSA, DSS); passphrases stored in the OS keychain. Key files uploaded via `POST /api/file_sources/upload_key` and stored in `~/.gdprscanner/sftp_keys/` with `chmod 600`. SFTP sources appear with a 🔒 icon in the sources panel. Requires `paramiko>=3.4` (optional — scanner falls back gracefully if not installed). New source-type selector (Local / Network (SMB) / SFTP) replaces the SMB path-prefix auto-detection in the add-source form. - **`POST /api/file_sources/upload_key`** — new endpoint that validates and stores an SSH private key file, returning a `key_path` for use in the source definition. diff --git a/SUGGESTIONS.md b/SUGGESTIONS.md index 5e078d4..98a6ad5 100644 --- a/SUGGESTIONS.md +++ b/SUGGESTIONS.md @@ -350,3 +350,14 @@ Write redacted copies of flagged files with CPR numbers replaced by `XXX XXXX-XX ### Email notification on scan completion (non-scheduled) ✅ Auto-email now fires on manual scans when **Email report after manual scan** is enabled in Settings → Email report. Toggle stored as `auto_email_manual` in `smtp.json`. Implemented in `routes/scan.py` — `_maybe_send_auto_email()` is called from the `_run()` thread after `run_scan()` returns. 
Same Graph-first → SMTP-fallback pattern as scheduled scans. Only fires when there are flagged items and at least one recipient is configured. + +### Phase 2 PII: name-based roster lookup + +Flag documents containing the full names of students or staff — even when no CPR is present. Implementation outline: + +1. **Roster source** — pull names from the M365 directory (`/users?$select=displayName`), the GWS directory (`admin.list_users`), or a user-uploaded CSV. Store them as a flat list of `(first, last)` pairs and apply a minimum length threshold (~5 characters per name part) to suppress common first-name noise. +2. **Multi-pattern search** — build an Aho-Corasick automaton from the roster at scan start (`pyahocorasick`, ~50 KB, optional dep). Run each extracted text through the automaton; a hit qualifies only when the match falls on a word boundary and both the first and last name appear within a configurable window (e.g. within 100 characters of each other). +3. **Integration** — add a `_find_emails_phones`-style helper to `cpr_detector.py`; the roster is loaded once per scan run and passed as a parameter. New `name_count` column in `flagged_items` (DB migration). New `name-badge` in the UI. Opt-in profile toggle like `scan_emails`. +4. **NER fallback** — optionally run the `spaCy` model `da_core_news_sm` (~200 MB) to detect PERSON entities when no roster is available. This has a much higher false-positive rate and is only useful as a discovery tool. + +**Why deferred:** this requires a roster-management UI (upload CSV, choose directory source, refresh cadence), and the false-positive rate depends heavily on roster quality. Name-only matches also carry lower legal weight than CPR hits. Implement after a school explicitly requests it. 
diff --git a/cpr_detector.py b/cpr_detector.py index 2095948..1b60645 100644 --- a/cpr_detector.py +++ b/cpr_detector.py @@ -22,6 +22,7 @@ from __future__ import annotations import base64 import hashlib import io +import re import tempfile import threading from pathlib import Path @@ -505,55 +506,139 @@ def _detect_photo_faces(content: bytes, filename: str) -> int: return 0 +_EMAIL_RE = re.compile( + r'\b[a-zA-Z0-9][a-zA-Z0-9._%+\-]*@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}\b' +) +_PHONE_RE = re.compile( + r'(?:' + r'(?:\+45|0045)[\s\-]?[2-9]\d{3}[\s\-]?\d{4}' # +45/0045 DDDD DDDD + r'|(?:\+45|0045)[\s\-]?[2-9]\d(?:[\s\-]\d{2}){3}' # +45/0045 DD DD DD DD + r'|\b[2-9]\d{7}\b' # 8 consecutive digits + r'|\b[2-9]\d{3}[\s\-]\d{4}\b' # DDDD DDDD + r'|\b[2-9]\d(?:[\s\-]\d{2}){3}\b' # DD DD DD DD + r')' +) + + +def _extract_text_from_bytes(content: bytes, filename: str) -> str: + """Extract plain text from file bytes for email/phone pattern matching. + + Returns empty string for binary media files (photos, video, audio) and + on any parse error — callers must never raise from this function. 
+ """ + ext = Path(filename).suffix.lower() + try: + if ext in {".txt", ".csv", ".eml", ".msg"}: + return content.decode("utf-8", errors="replace") + if ext in {".docx", ".doc"}: + from docx import Document as _Doc + doc = _Doc(io.BytesIO(content)) + parts = [p.text for p in doc.paragraphs] + for tbl in doc.tables: + for row in tbl.rows: + for cell in row.cells: + parts.append(cell.text) + return "\n".join(parts) + if ext in {".xlsx", ".xlsm"}: + import openpyxl as _xl + wb = _xl.load_workbook(io.BytesIO(content), read_only=True, data_only=True) + parts = [ + str(cell.value) + for ws in wb.worksheets + for row in ws.iter_rows() + for cell in row + if cell.value is not None + ] + wb.close() + return " ".join(parts) + if ext == ".pdf": + import pdfplumber as _pp + with _pp.open(io.BytesIO(content)) as pdf: + parts = [p.extract_text() or "" for p in pdf.pages] + return "\n".join(parts) + except Exception: + pass + if ext not in PHOTO_EXTS | VIDEO_EXTS | AUDIO_EXTS: + try: + return content.decode("utf-8", errors="replace") + except Exception: + pass + return "" + + +def _find_emails_phones(text: str) -> dict: + """Extract unique email addresses and Danish phone numbers from text. + + Returns {"emails": [{"formatted": str}, ...], "phones": [{"formatted": str}, ...]}. + Phones are normalised to digit-only strings (preserving a leading '+'). + """ + if not text: + return {"emails": [], "phones": []} + emails = list(dict.fromkeys(m.group(0).lower() for m in _EMAIL_RE.finditer(text))) + phones = list(dict.fromkeys( + ('+' + re.sub(r'[\s\-]', '', m.group(0)[1:]) if m.group(0).lstrip().startswith('+') + else re.sub(r'[\s\-]', '', m.group(0))) + for m in _PHONE_RE.finditer(text) + )) + return { + "emails": [{"formatted": e} for e in emails], + "phones": [{"formatted": p} for p in phones], + } + + def _scan_bytes(content: bytes, filename: str, poppler_path=None) -> dict: - """Scan raw bytes for CPRs. 
Returns scanner result dict.""" + """Scan raw bytes for CPRs, emails, and phone numbers. Returns result dict.""" if not SCANNER_OK: - return {"cprs": [], "dates": [], "error": "scanner not available"} + return {"cprs": [], "dates": [], "emails": [], "phones": [], "error": "scanner not available"} ext = Path(filename).suffix.lower() with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp: tmp.write(content) tmp_path = Path(tmp.name) + result: dict = {"cprs": [], "dates": []} try: if ext == ".pdf": # Check if the PDF has a text layer before running full scan_pdf. # Image-only PDFs (scanned documents) have no text and would trigger # Tesseract OCR subprocesses that hang indefinitely on some files. try: - import pdfplumber as _pp, io as _io - with _pp.open(_io.BytesIO(content)) as _pdf: + import pdfplumber as _pp + with _pp.open(io.BytesIO(content)) as _pdf: has_text = any(ds.is_text_page(p) for p in _pdf.pages) if not has_text: - return {"cprs": [], "dates": []} # image-only PDF — no CPRs possible + return {"cprs": [], "dates": [], "emails": [], "phones": []} except Exception: pass # if pdfplumber fails, fall through to full scan_pdf - return ds.scan_pdf(tmp_path, poppler_path=poppler_path) + result = ds.scan_pdf(tmp_path, poppler_path=poppler_path) elif ext in {".docx", ".doc"}: - return ds.scan_docx(tmp_path) + result = ds.scan_docx(tmp_path) elif ext in {".xlsx", ".xlsm"}: - return ds.scan_xlsx(tmp_path) + result = ds.scan_xlsx(tmp_path) elif ext == ".csv": - return ds.scan_csv(tmp_path) + result = ds.scan_csv(tmp_path) elif ext == ".txt": text = content.decode("utf-8", errors="replace") cprs, dates = ds.extract_matches(text, 1, "text") - return {"cprs": cprs, "dates": dates} + result = {"cprs": cprs, "dates": dates} elif ext in {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp"}: - return ds.scan_image(tmp_path) + result = ds.scan_image(tmp_path) else: - # Try plain text try: text = content.decode("utf-8", errors="replace") cprs, dates = 
ds.extract_matches(text, 1, "text") - return {"cprs": cprs, "dates": dates} + result = {"cprs": cprs, "dates": dates} except Exception: - return {"cprs": [], "dates": []} + pass except Exception as e: - return {"cprs": [], "dates": [], "error": str(e)} + result = {"cprs": [], "dates": [], "error": str(e)} finally: try: tmp_path.unlink() except Exception: pass + ep = _find_emails_phones(_extract_text_from_bytes(content, filename)) + result["emails"] = ep["emails"] + result["phones"] = ep["phones"] + return result def _worker_scan_pdf(pdf_path_str: str, result_q) -> None: """Worker executed in a spawned subprocess — must be a module-level function.""" @@ -607,19 +692,22 @@ def _scan_bytes_timeout(content: bytes, filename: str, timeout: int = 60) -> dic def _scan_text_direct(text: str) -> dict: - """Scan a plain text string for CPRs using extract_matches. - + """Scan a plain text string for CPRs, emails, and phone numbers. + Uses ds.extract_matches() directly rather than ds.scan_text() because scan_text() calls extract_cpr_and_dates() which is not defined in document_scanner.py (pre-existing bug). 
""" - if not SCANNER_OK or not text: - return {"cprs": [], "dates": []} + if not text: + return {"cprs": [], "dates": [], "emails": [], "phones": []} + ep = _find_emails_phones(text) + if not SCANNER_OK: + return {"cprs": [], "dates": [], **ep} try: cprs, dates = ds.extract_matches(text, 1, "text") - return {"cprs": cprs, "dates": dates} + return {"cprs": cprs, "dates": dates, **ep} except Exception: - return {"cprs": [], "dates": []} + return {"cprs": [], "dates": [], **ep} def _html_esc(s: str) -> str: """HTML-escape a string for safe inline embedding.""" diff --git a/gdpr_db.py b/gdpr_db.py index aa4ce59..dd67545 100644 --- a/gdpr_db.py +++ b/gdpr_db.py @@ -200,6 +200,8 @@ _MIGRATIONS: list[tuple[int, str]] = [ (4, "ALTER TABLE flagged_items ADD COLUMN face_count INTEGER NOT NULL DEFAULT 0"), (5, "ALTER TABLE flagged_items ADD COLUMN exif_json TEXT NOT NULL DEFAULT '{}'"), (6, "ALTER TABLE flagged_items ADD COLUMN full_path TEXT NOT NULL DEFAULT ''"), + (8, "ALTER TABLE flagged_items ADD COLUMN email_count INTEGER NOT NULL DEFAULT 0"), + (9, "ALTER TABLE flagged_items ADD COLUMN phone_count INTEGER NOT NULL DEFAULT 0"), (7, """CREATE TABLE IF NOT EXISTS schedule_runs ( id INTEGER PRIMARY KEY AUTOINCREMENT, started_at REAL NOT NULL, @@ -311,8 +313,9 @@ class ScanDB: (id, scan_id, name, source, source_type, account_id, folder, url, drive_id, size_kb, modified, cpr_count, risk, thumb_b64, thumb_mime, attachments, user_role, transfer_risk, - special_category, face_count, exif_json, full_path, scanned_at) - VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""", + special_category, face_count, exif_json, full_path, + email_count, phone_count, scanned_at) + VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""", ( card.get("id", ""), scan_id, @@ -336,6 +339,8 @@ class ScanDB: card.get("face_count", 0), json.dumps(card.get("exif", {})), card.get("full_path", ""), + card.get("email_count", 0), + card.get("phone_count", 0), now, ), ) diff --git a/lang/da.json 
b/lang/da.json index 26d0ad3..a09e30b 100644 --- a/lang/da.json +++ b/lang/da.json @@ -570,6 +570,12 @@ "m365_opt_skip_gps": "Ignorer GPS i billeder", "m365_opt_skip_gps_hint": "Billeder med GPS-koordinater flagges ikke — nyttigt ved elevscanninger, hvor smartphones indlejrer placering i alle fotos.", "m365_opt_min_cpr": "Min. CPR-antal pr. fil", + "m365_opt_scan_emails": "Søg efter e-mailadresser", + "m365_opt_scan_emails_hint": "Flagger filer med e-mailadresser. Slået fra som standard — e-mailadresser er meget almindelige og kan give mange resultater.", + "m365_opt_scan_phones": "Søg efter telefonnumre", + "m365_opt_scan_phones_hint": "Flagger filer med danske telefonnumre (8 cifre). Nyttigt til at finde kontaktlister og forældrekorrespondance.", + "m365_badge_emails": "e-mail", + "m365_badge_phones": "tlf.", "m365_opt_min_cpr_hint": "Filer med færre distinkte CPR-numre end denne tærskel rapporteres ikke. Sæt til 2 for at undgå falske positive, når elever har egne CPR-numre i filer.", "m365_filter_photo_only": "📷 Billeder / biometrisk", "m365_filter_all_roles": "Alle roller", diff --git a/lang/de.json b/lang/de.json index f647239..3e15813 100644 --- a/lang/de.json +++ b/lang/de.json @@ -570,6 +570,12 @@ "m365_opt_skip_gps": "GPS in Bildern ignorieren", "m365_opt_skip_gps_hint": "Bilder mit GPS-Koordinaten werden nicht markiert — nützlich beim Scannen von Schüler-Konten, deren Smartphones Standort in jedes Foto einbetten.", "m365_opt_min_cpr": "Min. CPR-Anzahl pro Datei", + "m365_opt_scan_emails": "E-Mail-Adressen scannen", + "m365_opt_scan_emails_hint": "Markiert Dateien mit E-Mail-Adressen. Standardmäßig deaktiviert — E-Mail-Adressen sind sehr häufig und können viele Treffer erzeugen.", + "m365_opt_scan_phones": "Telefonnummern scannen", + "m365_opt_scan_phones_hint": "Markiert Dateien mit dänischen Telefonnummern (8 Ziffern). 
Nützlich zum Auffinden von Kontaktlisten.", + "m365_badge_emails": "E-Mail", + "m365_badge_phones": "Tel.", "m365_opt_min_cpr_hint": "Dateien mit weniger eindeutigen CPR-Nummern als dieser Schwellenwert werden nicht gemeldet. Auf 2 setzen, um Falsch-Positive zu vermeiden, wenn Schüler eigene CPR-Nummern in Dateien haben.", "m365_filter_photo_only": "📷 Fotos / biometrisch", "m365_filter_all_roles": "Alle Rollen", diff --git a/lang/en.json b/lang/en.json index 98e6c04..c12ae78 100644 --- a/lang/en.json +++ b/lang/en.json @@ -570,6 +570,12 @@ "m365_opt_skip_gps": "Ignore GPS in images", "m365_opt_skip_gps_hint": "Images with GPS coordinates are not flagged — useful when scanning students whose smartphones embed location in every photo.", "m365_opt_min_cpr": "Min. CPR count per file", + "m365_opt_scan_emails": "Scan for email addresses", + "m365_opt_scan_emails_hint": "Flags files that contain email addresses. Off by default — email addresses are very common and may produce many results.", + "m365_opt_scan_phones": "Scan for phone numbers", + "m365_opt_scan_phones_hint": "Flags files containing Danish phone numbers (8 digits). Useful for finding contact lists and parent correspondence.", + "m365_badge_emails": "email", + "m365_badge_phones": "phone", "m365_opt_min_cpr_hint": "Files with fewer distinct CPR numbers than this threshold are not reported. 
Set to 2 to avoid false positives when students have their own CPR in documents.", "m365_filter_photo_only": "📷 Photos / biometric", "m365_filter_all_roles": "All roles", diff --git a/routes/google_scan.py b/routes/google_scan.py index 0577085..80da589 100644 --- a/routes/google_scan.py +++ b/routes/google_scan.py @@ -141,6 +141,8 @@ def _run_google_scan(options: dict): scan_body = bool(scan_opts.get("scan_body", True)) scan_att = bool(scan_opts.get("scan_attachments", True)) delta_enabled = bool(scan_opts.get("delta", False)) + scan_emails = bool(scan_opts.get("scan_emails", False)) + scan_phones = bool(scan_opts.get("scan_phones", False)) from checkpoint import _load_delta_tokens, _save_delta_tokens _drive_delta_tokens: dict = _load_delta_tokens() if delta_enabled else {} @@ -212,6 +214,8 @@ def _run_google_scan(options: dict): "source": item_meta.get("_source", ""), "source_type": item_meta.get("_source_type", ""), "cpr_count": len(cprs), + "email_count": item_meta.get("_email_count", 0), + "phone_count": item_meta.get("_phone_count", 0), "url": item_meta.get("_url", ""), "size_kb": round(item_meta.get("size", 0) / 1024, 1), "modified": (item_meta.get("lastModifiedDateTime") or item_meta.get("receivedDateTime") or "")[:10], @@ -276,9 +280,13 @@ def _run_google_scan(options: dict): except Exception as e: broadcast("scan_error", {"file": meta.get("name", ""), "error": str(e)}) continue - cprs = result.get("cprs", []) + cprs = result.get("cprs", []) pii_counts = result.get("pii_counts") - if cprs or (pii_counts and any(pii_counts.values())): + _em = list(dict.fromkeys(e["formatted"] for e in result.get("emails", []))) if scan_emails else [] + _ph = list(dict.fromkeys(p["formatted"] for p in result.get("phones", []))) if scan_phones else [] + if cprs or (pii_counts and any(pii_counts.values())) or _em or _ph: + meta["_email_count"] = len(_em) + meta["_phone_count"] = len(_ph) _broadcast_card(meta, cprs, pii_counts) except GoogleError as e: broadcast("scan_error", 
{"file": f"Gmail/{user_email}", "error": str(e)}) @@ -336,7 +344,11 @@ def _run_google_scan(options: dict): continue cprs = result.get("cprs", []) pii_counts = result.get("pii_counts") - if cprs or (pii_counts and any(pii_counts.values())): + _em = list(dict.fromkeys(e["formatted"] for e in result.get("emails", []))) if scan_emails else [] + _ph = list(dict.fromkeys(p["formatted"] for p in result.get("phones", []))) if scan_phones else [] + if cprs or (pii_counts and any(pii_counts.values())) or _em or _ph: + meta["_email_count"] = len(_em) + meta["_phone_count"] = len(_ph) _broadcast_card(meta, cprs, pii_counts) except GoogleError as e: broadcast("scan_error", {"file": f"Drive/{user_email}", "error": str(e)}) diff --git a/scan_engine.py b/scan_engine.py index ee9c786..080b61f 100644 --- a/scan_engine.py +++ b/scan_engine.py @@ -182,6 +182,8 @@ def run_file_scan(source: dict): scan_photos = bool(source.get("scan_photos", False)) skip_gps_images = bool(source.get("skip_gps_images", False)) min_cpr_count = max(1, int(source.get("min_cpr_count", 1))) + scan_emails = bool(source.get("scan_emails", False)) + scan_phones = bool(source.get("scan_phones", False)) max_mb = int(source.get("max_file_mb", 50)) if source_kind == "sftp": @@ -268,7 +270,9 @@ def run_file_scan(source: dict): broadcast("scan_error", {"file": rel_path, "error": str(e)}) continue - cprs = result.get("cprs", []) + cprs = result.get("cprs", []) + emails = result.get("emails", []) if scan_emails else [] + phones = result.get("phones", []) if scan_phones else [] # Photo / biometric scan + EXIF/video/audio metadata extraction _face_count = 0 @@ -283,13 +287,15 @@ def run_file_scan(source: dict): _exif = _extract_audio_metadata(content, rel_path) # Apply filters: distinct CPR threshold and GPS suppression - _distinct_cprs = list(dict.fromkeys(c["formatted"] for c in cprs)) - _cpr_qualifies = len(_distinct_cprs) >= min_cpr_count - _exif_has_pii = _exif.get("has_pii") and ( + _distinct_cprs = 
list(dict.fromkeys(c["formatted"] for c in cprs)) + _cpr_qualifies = len(_distinct_cprs) >= min_cpr_count + _distinct_emails = list(dict.fromkeys(e["formatted"] for e in emails)) + _distinct_phones = list(dict.fromkeys(p["formatted"] for p in phones)) + _exif_has_pii = _exif.get("has_pii") and ( not skip_gps_images or bool(_exif.get("pii_fields") or _exif.get("author")) ) - if not (_cpr_qualifies and cprs) and _face_count == 0 and not _exif_has_pii: + if not (_cpr_qualifies and cprs) and not _distinct_emails and not _distinct_phones and _face_count == 0 and not _exif_has_pii: continue # Build card metadata @@ -325,6 +331,8 @@ def run_file_scan(source: dict): "source": label, "source_type": source_type, "cpr_count": len(cprs), + "email_count": len(_distinct_emails), + "phone_count": len(_distinct_phones), "url": "", "size_kb": meta["size_kb"], "modified": meta["modified"], @@ -437,6 +445,8 @@ def run_scan(options: dict): scan_photos = bool(scan_opts.get("scan_photos", False)) # biometric photo scan (#9) skip_gps_images= bool(scan_opts.get("skip_gps_images", False)) min_cpr_count = max(1, int(scan_opts.get("min_cpr_count", 1))) + scan_emails = bool(scan_opts.get("scan_emails", False)) + scan_phones = bool(scan_opts.get("scan_phones", False)) # Delta token state — loaded once, updated per-source, saved on completion delta_tokens: dict = _load_delta_tokens() if delta_enabled else {} @@ -490,6 +500,8 @@ def run_scan(options: dict): "source": item_meta.get("_source", ""), "source_type": item_meta.get("_source_type", ""), "cpr_count": len(cprs), + "email_count": item_meta.get("_email_count", 0), + "phone_count": item_meta.get("_phone_count", 0), "url": item_meta.get("webUrl", "") or item_meta.get("_url", ""), "size_kb": round(item_meta.get("size", 0) / 1024, 1), "modified": (item_meta.get("lastModifiedDateTime") or item_meta.get("receivedDateTime") or "")[:10], @@ -1056,12 +1068,18 @@ def run_scan(options: dict): # Scan body — use pre-extracted text (body HTML was 
stripped at # collection time to keep work_items memory footprint small) - all_cprs = [] - body_text = "" + all_cprs = [] + all_emails = [] + all_phones = [] + body_text = "" if scan_email_body: - body_text = meta.pop("_precomputed_body", "") + body_text = meta.pop("_precomputed_body", "") body_result = _scan_text_direct(body_text) - all_cprs = list(body_result.get("cprs", [])) + all_cprs = list(body_result.get("cprs", [])) + if scan_emails: + all_emails = list(body_result.get("emails", [])) + if scan_phones: + all_phones = list(body_result.get("phones", [])) # Scan attachments uid = meta.get("_account_id", "me") @@ -1084,14 +1102,22 @@ def run_scan(options: dict): att_result = _scan_bytes(att_bytes, att_name) att_cprs = att_result.get("cprs", []) all_cprs.extend(att_cprs) + if scan_emails: + all_emails.extend(att_result.get("emails", [])) + if scan_phones: + all_phones.extend(att_result.get("phones", [])) att_results.append({"name": att_name, "cpr_count": len(att_cprs)}) except Exception as att_err: broadcast("scan_error", {"file": att_name, "error": str(att_err)}) - if all_cprs: + _distinct_emails = list(dict.fromkeys(e["formatted"] for e in all_emails)) + _distinct_phones = list(dict.fromkeys(p["formatted"] for p in all_phones)) + if all_cprs or _distinct_emails or _distinct_phones: meta["_thumb"] = _placeholder_svg(".eml", subject) meta["_thumb_is_jpeg"] = False meta["_attachments"] = att_results + meta["_email_count"] = len(_distinct_emails) + meta["_phone_count"] = len(_distinct_phones) _email_pii = _get_pii_counts(body_text) if scan_email_body else {} meta["_transfer_risk"] = _check_transfer_risk(meta) meta["_special_category"] = _check_special_category( @@ -1121,10 +1147,12 @@ def run_scan(options: dict): else: content = conn.download_item(meta) - # CPR scan — skip for video and audio (metadata-only; no text layer) + # CPR/email/phone scan — skip for video and audio (metadata-only; no text layer) _media_only = ext in VIDEO_EXTS or ext in AUDIO_EXTS - result 
= {"cprs": [], "dates": []} if _media_only else _scan_bytes(content, name) + result = {"cprs": [], "dates": [], "emails": [], "phones": []} if _media_only else _scan_bytes(content, name) cprs = result.get("cprs", []) + emails = result.get("emails", []) if scan_emails else [] + phones = result.get("phones", []) if scan_phones else [] # ── Biometric photo scan (#9) + EXIF/video/audio metadata (#18) ─ _face_count = 0 @@ -1141,12 +1169,14 @@ def run_scan(options: dict): # Apply filters: distinct CPR threshold and GPS suppression _distinct_cprs = list(dict.fromkeys(c["formatted"] for c in cprs)) _cpr_qualifies = len(_distinct_cprs) >= min_cpr_count + _distinct_emails = list(dict.fromkeys(e["formatted"] for e in emails)) + _distinct_phones = list(dict.fromkeys(p["formatted"] for p in phones)) _exif_has_pii = _exif.get("has_pii") and ( not skip_gps_images or bool(_exif.get("pii_fields") or _exif.get("author")) ) - # Flag item if CPRs found (above threshold), faces detected, or EXIF PII found - if (_cpr_qualifies and cprs) or _face_count > 0 or _exif_has_pii: + # Flag item if CPRs/emails/phones found, faces detected, or EXIF PII found + if (_cpr_qualifies and cprs) or _distinct_emails or _distinct_phones or _face_count > 0 or _exif_has_pii: # Make thumbnail if ext in {".jpg", ".jpeg", ".png"} and PIL_OK: thumb = _make_thumb(content, name) @@ -1182,6 +1212,8 @@ def run_scan(options: dict): meta["_special_category"] = _sc meta["_face_count"] = _face_count meta["_exif"] = _exif + meta["_email_count"] = len(_distinct_emails) + meta["_phone_count"] = len(_distinct_phones) _broadcast_card(meta, cprs, pii_counts=_file_pii) else: del content # no hits — free raw bytes immediately diff --git a/static/js/profiles.js b/static/js/profiles.js index 873311a..9db357c 100644 --- a/static/js/profiles.js +++ b/static/js/profiles.js @@ -137,6 +137,16 @@ function _applyProfile(profile) { if (el) el.value = opts.min_cpr_count; } + if (opts.scan_emails !== undefined) { + const el = 
document.getElementById('optScanEmails'); + if (el) el.checked = opts.scan_emails; + } + + if (opts.scan_phones !== undefined) { + const el = document.getElementById('optScanPhones'); + if (el) el.checked = opts.scan_phones; + } + // ── Date filter ─────────────────────────────────────────────────────────── const days = opts.older_than_days; if (days !== undefined) { @@ -417,6 +427,8 @@ function _openEditorForProfile(profile) {