From 29d9168643ed48cd9be5b7788288852e48487459 Mon Sep 17 00:00:00 2001 From: StyxX65 <150797939+StyxX65@users.noreply.github.com> Date: Mon, 22 Jun 2026 09:51:22 +0200 Subject: [PATCH] Recover unfinished scans so their items aren't stranded MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit get_session_items / get_open_items / latest_scan_id all require finished_at IS NOT NULL, but the M365 and Google engines return early on abort (skipping finish_scan) and a process kill mid-scan (deploy, OOM, crash) never reaches it either. Result on prod: 41/42 scans had finished_at NULL, so 291 already-saved flagged items were invisible — the grid showed nothing. - finalize_orphan_scans(): finalises every finished_at-NULL scan; runs once at startup before the scheduler (nothing is scanning at boot, so any unfinished scan is dead). Recovers existing stranded items and guards against future mid-scan restarts. - run_scan: finalise the DB scan on the abort early-return too, so a stopped scan's items stay visible without waiting for a restart. Co-Authored-By: Claude Opus 4.8 --- CLAUDE.md | 1 + gdpr_db.py | 30 ++++++++++++++++++++++++++++++ gdpr_scanner.py | 13 +++++++++++++ scan_engine.py | 8 ++++++++ tests/test_db.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 100 insertions(+) diff --git a/CLAUDE.md b/CLAUDE.md index e80e1cd..327c32b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -93,6 +93,7 @@ All options live in the profile `options` dict and apply to **all three scan eng - **`get_sessions(limit=50, window_seconds=300)`** — groups `scans` rows by 300 s window. Groups built ascending, returned descending. `ref_scan_id` is the highest `scan_id` in each group. Do not change window size independently of `get_session_items`. - **`get_session_items(ref_scan_id=N)`** — anchors 300 s window to that scan's `started_at`. Window is **symmetric**: `started_at BETWEEN ref.started_at - 300 AND ref.started_at + 300`. 
Do not revert to a one-sided lower bound. - **`get_related_items(item_id, ref_scan_id, window_seconds=300)`** — self-joins `cpr_index` to find items sharing ≥1 CPR hash. Uses same 300 s symmetric window — do not change independently. +- **Scans must be finalised or their items are invisible** — `get_session_items`, `get_open_items`, and `latest_scan_id` all filter on `finished_at IS NOT NULL`. The file scan finalises in a `finally`; M365 (`run_scan`) and Google (`_run_google_scan`) `return` early on abort, so the abort path must call `finish_scan` before that abort-return (`run_scan` now does; verify `_run_google_scan` does too). A process kill (deploy/OOM/crash) mid-scan still strands a scan → **`finalize_orphan_scans()`** runs once at server startup (`gdpr_scanner.py` `__main__`, before the scheduler) and finalises every `finished_at IS NULL` scan (safe because nothing is scanning at boot). Do not add a scan-results query that ignores `finished_at` instead of fixing finalisation. - **`get_open_items()`** — returns every flagged item with **no action taken**, across **all** scans (not just the latest session window). "Open" = no `dispositions` row, or one whose `status='unreviewed'`. Because `flagged_items` PK is `(id, scan_id)`, the same item recurs per scan; the query dedupes by `id`, keeping the row from the highest finished `scan_id`. This powers the **default landing view** so items don't drop out of sight once a newer scan opens a fresh session. - **`GET /api/db/flagged`** — **with `?ref=N`** → `get_session_items(ref_scan_id=N)` (history mode); **without ref** → `get_open_items()` (default + viewer). Viewer scope enforcement applies to both. Do not change the no-ref `get_session_items()` default elsewhere (`export.py`, `scan_scheduler.py` still rely on latest-session for the current scan's report/email). - See `static/js/CLAUDE.md` for the frontend history browser behaviour and `sse_replay_done` retry fix. 
diff --git a/gdpr_db.py b/gdpr_db.py index 3eb275b..d823681 100644 --- a/gdpr_db.py +++ b/gdpr_db.py @@ -29,11 +29,14 @@ Usage (from gdpr_scanner.py) import hashlib import json +import logging import sqlite3 import time from pathlib import Path from typing import Iterator +logger = logging.getLogger(__name__) + from pathlib import Path as _P _DATA_DIR = _P.home() / ".gdprscanner" _DATA_DIR.mkdir(exist_ok=True) @@ -432,6 +435,33 @@ class ScanDB: c.commit() + def finalize_orphan_scans(self) -> int: + """Finalise scans left unfinished by a crash, kill, or mid-scan restart. + + After a fresh process start nothing is scanning, so any scan still + carrying finished_at IS NULL is dead — the process that owned it is gone. + Its already-saved flagged_items were stranded: both get_session_items + and get_open_items require finished_at, so those items are invisible and + effectively lost. Finalising the orphans on startup makes them show up + and prevents permanent data loss from interrupted scans (the M365 and + Google engines return early on abort and never reach finish_scan; only + the file scan finalises in a finally block). + + Safe to call only when no scan is running (i.e. at startup). Returns the + number of scans finalised. 
+ """ + rows = self._connect().execute( + "SELECT id, total_scanned FROM scans WHERE finished_at IS NULL" + ).fetchall() + count = 0 + for sid, total in rows: + try: + self.finish_scan(sid, total or 0) + count += 1 + except Exception as e: + logger.warning("[db] finalize_orphan_scans: scan %s failed: %s", sid, e) + return count + # ── Query helpers ───────────────────────────────────────────────────────── def latest_scan_id(self) -> int | None: diff --git a/gdpr_scanner.py b/gdpr_scanner.py index 948d7fc..2edfcf2 100644 --- a/gdpr_scanner.py +++ b/gdpr_scanner.py @@ -2305,6 +2305,19 @@ Example --settings file with SMTP: print(f"\n GDPRScanner\n ──────────────────────────────") print(f" Open: http://{args.host}:{args.port}") + # Recover scans left unfinished by a crash / kill / mid-scan restart. + # Nothing is scanning at startup, so any scan with finished_at IS NULL is + # dead; finalising it makes its already-saved items visible again instead + # of stranding them (both get_session_items and get_open_items require a + # finished scan). Must run before the scheduler can start a new scan. + try: + if DB_OK: + _recovered = _get_db().finalize_orphan_scans() + if _recovered: + print(f" Recovered {_recovered} unfinished scan(s) from a prior restart") + except Exception as _orphan_err: + print(f" Orphan-scan recovery: failed ({_orphan_err})") + # Start in-process scheduler (#19) try: import scan_scheduler as _sched_mod diff --git a/scan_engine.py b/scan_engine.py index bdc0e48..a56a33f 100644 --- a/scan_engine.py +++ b/scan_engine.py @@ -1078,6 +1078,14 @@ def run_scan(options: dict): if _check_abort(): # Save checkpoint so scan can be resumed later _save_checkpoint(ck_key, scanned_ids, _state.flagged_items, _state.scan_meta) + # Finalise the DB scan record so items found before the stop stay + # visible — this early return otherwise skips finish_scan below, + # stranding them (invisible to get_session_items / get_open_items). 
+ if _db and _db_scan_id: + try: + _db.finish_scan(_db_scan_id, resumed_count + idx + 1) + except Exception as _e: + logger.error("[db] finish_scan (aborted) failed: %s", _e) return idx += 1 kind, meta, _ = _work_q.popleft() # releases this item from the deque immediately diff --git a/tests/test_db.py b/tests/test_db.py index 7b7fbe5..1b62862 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -265,3 +265,51 @@ class TestExportImport: tgt.import_db(str(export_path), mode="replace") results = tgt.lookup_data_subject("290472-1234") assert len(results) >= 1 + + +# ───────────────────────────────────────────────────────────────────────────── +# Orphan-scan recovery (crash / kill / mid-scan restart) +# ───────────────────────────────────────────────────────────────────────────── + +class TestOrphanScanRecovery: + + def _start_unfinished_scan(self, db, item_id): + """Begin a scan and save an item but never call finish_scan.""" + sid = db.begin_scan({"sources": ["email"], "user_ids": []}) + db.save_item(sid, _make_card(item_id=item_id)) + return sid + + def test_unfinished_scan_items_hidden_until_recovery(self, tmp_db): + self._start_unfinished_scan(tmp_db, "orphan-1") + # Not finalised → invisible to the open-items view + assert tmp_db.get_open_items() == [] + + def test_recovery_finalises_and_reveals_items(self, tmp_db): + self._start_unfinished_scan(tmp_db, "orphan-1") + self._start_unfinished_scan(tmp_db, "orphan-2") + + recovered = tmp_db.finalize_orphan_scans() + assert recovered == 2 + + ids = {row["id"] for row in tmp_db.get_open_items()} + assert ids == {"orphan-1", "orphan-2"} + + def test_recovery_leaves_finished_scans_untouched(self, tmp_db): + sid = tmp_db.begin_scan({"sources": ["email"], "user_ids": []}) + tmp_db.save_item(sid, _make_card(item_id="done-1")) + tmp_db.finish_scan(sid, total_scanned=1) + before = tmp_db._connect().execute( + "SELECT finished_at FROM scans WHERE id=?", (sid,) + ).fetchone()[0] + + assert tmp_db.finalize_orphan_scans() == 0 
# nothing to recover + + after = tmp_db._connect().execute( + "SELECT finished_at FROM scans WHERE id=?", (sid,) + ).fetchone()[0] + assert after == before # finished_at not rewritten + + def test_recovery_is_idempotent(self, tmp_db): + self._start_unfinished_scan(tmp_db, "orphan-1") + assert tmp_db.finalize_orphan_scans() == 1 + assert tmp_db.finalize_orphan_scans() == 0