From c820d6f6dbbed8c96e39e5e755dcc5c31ce1234e Mon Sep 17 00:00:00 2001 From: StyxX65 <150797939+StyxX65@users.noreply.github.com> Date: Thu, 28 May 2026 10:20:22 +0200 Subject: [PATCH] =?UTF-8?q?Two=20bugs=20in=20the=20abort=20mechanism:=20?= =?UTF-8?q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?UTF-8?q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?UTF-8?q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?UTF-8?q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?UTF-8?q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?UTF-8?q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?UTF-8?q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?UTF-8?q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?UTF-8?q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?UTF-8?q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?UTF-8?q?=20=20=20=20=20=20=20=20=201.=20POST=20/api/scan/stop=20only=20?= =?UTF-8?q?set=20state.=5Fscan=5Fabort=20(M365/file=20abort=20event)=20=20?= =?UTF-8?q?=20=20=20=20=20but=20never=20touched=20state.=5Fgoogle=5Fscan?= =?UTF-8?q?=5Fabort.=20Now=20sets=20both.=20=20=20=20=202.=20=5Fcheck=5Fab?= =?UTF-8?q?ort()=20inside=20=5Frun=5Fgoogle=5Fscan=20imported=20gdpr=5Fsca?= =?UTF-8?q?nner.=5Fscan=5Fabort=20=20=20=20=20=20=20(=3D=20state.=5Fscan?= =?UTF-8?q?=5Fabort,=20the=20M365=20event)=20instead=20of=20using=20the=20?= =?UTF-8?q?module-level=20=20=20=20=20=20=20=5Fscan=5Fabort=20alias=20(=3D?= =?UTF-8?q?=20state.=5Fgoogle=5Fscan=5Fabort).=20This=20meant=20the=20dedi?= =?UTF-8?q?cated=20=20=20=20=20=20=20/api/google/scan/cancel=20endpoint=20?= =?UTF-8?q?=E2=80=94=20which=20correctly=20sets=20=5Fgoogle=5Fscan=5Fabort?= =?UTF-8?q?=20=20=20=20=20=20=20=E2=80=94=20was=20silently=20ignored=20by?= =?UTF-8?q?=20the=20scan=20loop.=20Fixed=20to=20use=20the=20module-level?= 
=?UTF-8?q?=20=20=20=20=20=20=20alias=20consistently.=20Also=20aligned=20t?= =?UTF-8?q?he=20end-of-scan=20checkpoint-clear=20check.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 2 ++ lang/da.json | 2 ++ lang/de.json | 2 ++ lang/en.json | 2 ++ scan_engine.py | 7 ++++--- static/js/profiles.js | 7 +++++++ static/js/scan.js | 1 + templates/index.html | 8 ++++++++ 8 files changed, 28 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a99242d..f1360a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ Version numbers follow [Semantic Versioning](https://semver.org/spec/v2.0.0.html ### Added +- **CPR-only mode** — a new `cpr_only` scan option (sidebar toggle `#optCprOnly`, profile editor `#peOptCprOnly`) makes all three scan engines skip items that have no qualifying CPR numbers. Files whose only hits are email addresses, phone numbers, detected faces, or EXIF/GPS metadata are not flagged. Such items are still flagged and shown on cards when `cpr_only=false` (the default). The gate is applied at four points across the three engines: file scan skip condition, M365 email flagging, M365 file flagging, and Google Gmail/Drive flagging. + - **OCR language override** — a new `ocr_lang` scan option (sidebar select `#optOcrLang`, profile editor `#peOptOcrLang`) lets operators choose the Tesseract language pack(s) used when scanning scanned PDFs and images. Presets: `dan+eng` (default), `dan`, `eng`, `dan+eng+deu`, `dan+eng+swe`, `dan+eng+fra`. The setting flows from the UI through the profile, into all three scan engines (M365 `_scan_bytes_timeout`, M365 attachments `_scan_bytes`, M365 files `_scan_bytes`, Google `_scan_bytes` for both Gmail and Drive). The `lang` parameter is threaded through `cpr_detector._scan_bytes` → `document_scanner.scan_pdf` / `scan_image` and the spawned PDF-OCR subprocess worker. The OCR cache key already included `lang`, so per-language results are cached independently. 
- **Built-in file redaction for local files** — a scissor button (`✂`) appears on cards for local DOCX, XLSX, CSV, and TXT files. Clicking it rewrites the file in-place with all detected CPR numbers replaced by `██████-████` (DOCX/XLSX) or `█`-blocks (CSV/TXT), then removes the card from the grid and logs a `"redacted"` disposition. The redaction is atomic: a temp file in the same directory is written first and then moved over the original, so a crash never leaves a half-written file. Implemented in `routes/export.py` (`POST /api/redact_item`) using the existing `document_scanner` redact functions; front-end in `results.js` (`redactItem`) with the button hidden for non-local or unsupported-extension items and for resolved/viewer-mode cards. diff --git a/lang/da.json b/lang/da.json index 0b95afe..6faf48d 100644 --- a/lang/da.json +++ b/lang/da.json @@ -577,6 +577,8 @@ "m365_badge_emails": "e-mail", "m365_badge_phones": "tlf.", "m365_opt_min_cpr_hint": "Filer med færre distinkte CPR-numre end denne tærskel rapporteres ikke. Sæt til 2 for at undgå falske positive, når elever har egne CPR-numre i filer.", + "m365_opt_cpr_only": "Kun CPR-tilstand", + "m365_opt_cpr_only_hint": "Flagger kun filer med CPR-numre. Filer med kun e-mailadresser, telefonnumre, ansigter eller EXIF-metadata ignoreres.", "m365_opt_ocr_lang": "OCR-sprog", "m365_opt_ocr_lang_hint": "Tesseract-sprogpakke(r) der bruges ved scanning af scannede PDF'er og billeder. Sprogpakker skal være installeret på serveren (f.eks. tesseract-ocr-dan). Flere pakker: dan+eng.", "m365_filter_photo_only": "📷 Billeder / biometrisk", diff --git a/lang/de.json b/lang/de.json index 81b786c..6056214 100644 --- a/lang/de.json +++ b/lang/de.json @@ -577,6 +577,8 @@ "m365_badge_emails": "E-Mail", "m365_badge_phones": "Tel.", "m365_opt_min_cpr_hint": "Dateien mit weniger eindeutigen CPR-Nummern als dieser Schwellenwert werden nicht gemeldet. 
Auf 2 setzen, um Falsch-Positive zu vermeiden, wenn Schüler eigene CPR-Nummern in Dateien haben.", + "m365_opt_cpr_only": "Nur-CPR-Modus", + "m365_opt_cpr_only_hint": "Markiert nur Dateien mit CPR-Nummern. Dateien mit nur E-Mail-Adressen, Telefonnummern, Gesichtern oder EXIF-Metadaten werden ignoriert.", "m365_opt_ocr_lang": "OCR-Sprache", "m365_opt_ocr_lang_hint": "Tesseract-Sprachpaket(e) für das Scannen von gescannten PDFs und Bildern. Pakete müssen auf dem Server installiert sein (z.B. tesseract-ocr-dan). Mehrere Pakete: dan+eng.", "m365_filter_photo_only": "📷 Fotos / biometrisch", diff --git a/lang/en.json b/lang/en.json index de19caa..9416b20 100644 --- a/lang/en.json +++ b/lang/en.json @@ -577,6 +577,8 @@ "m365_badge_emails": "email", "m365_badge_phones": "phone", "m365_opt_min_cpr_hint": "Files with fewer distinct CPR numbers than this threshold are not reported. Set to 2 to avoid false positives when students have their own CPR in documents.", + "m365_opt_cpr_only": "CPR-only mode", + "m365_opt_cpr_only_hint": "Only flag files that contain CPR numbers. Files with only email addresses, phone numbers, detected faces, or EXIF metadata are skipped.", "m365_opt_ocr_lang": "OCR language", "m365_opt_ocr_lang_hint": "Tesseract language pack(s) used when scanning scanned PDFs and images. Language packs must be installed on the server (e.g. tesseract-ocr-dan). 
Multiple packs: dan+eng.", "m365_filter_photo_only": "📷 Photos / biometric", diff --git a/scan_engine.py b/scan_engine.py index 725906b..af5cf76 100644 --- a/scan_engine.py +++ b/scan_engine.py @@ -316,7 +316,7 @@ def run_file_scan(source: dict): not skip_gps_images or bool(_exif.get("pii_fields") or _exif.get("author")) ) - if not (_cpr_qualifies and cprs) and not _distinct_emails and not _distinct_phones and _face_count == 0 and not _exif_has_pii: + if not (_cpr_qualifies and cprs) and (cpr_only or (not _distinct_emails and not _distinct_phones and _face_count == 0 and not _exif_has_pii)): continue # Build card metadata @@ -477,6 +477,7 @@ def run_scan(options: dict): skip_gps_images= bool(scan_opts.get("skip_gps_images", False)) min_cpr_count = max(1, int(scan_opts.get("min_cpr_count", 1))) ocr_lang = str(scan_opts.get("ocr_lang", "dan+eng")) or "dan+eng" + cpr_only = bool(scan_opts.get("cpr_only", False)) scan_emails = bool(scan_opts.get("scan_emails", False)) scan_phones = bool(scan_opts.get("scan_phones", False)) @@ -1145,7 +1146,7 @@ def run_scan(options: dict): _distinct_emails = list(dict.fromkeys(e["formatted"] for e in all_emails)) _distinct_phones = list(dict.fromkeys(p["formatted"] for p in all_phones)) - if all_cprs or _distinct_emails or _distinct_phones: + if all_cprs or (not cpr_only and (_distinct_emails or _distinct_phones)): meta["_thumb"] = _placeholder_svg(".eml", subject) meta["_thumb_is_jpeg"] = False meta["_attachments"] = att_results @@ -1211,7 +1212,7 @@ def run_scan(options: dict): ) # Flag item if CPRs/emails/phones found, faces detected, or EXIF PII found - if (_cpr_qualifies and cprs) or _distinct_emails or _distinct_phones or _face_count > 0 or _exif_has_pii: + if (_cpr_qualifies and cprs) or (not cpr_only and (_distinct_emails or _distinct_phones or _face_count > 0 or _exif_has_pii)): # Make thumbnail if ext in {".jpg", ".jpeg", ".png"} and PIL_OK: thumb = _make_thumb(content, name) diff --git a/static/js/profiles.js 
b/static/js/profiles.js index 91c18a9..18f9cff 100644 --- a/static/js/profiles.js +++ b/static/js/profiles.js @@ -142,6 +142,11 @@ function _applyProfile(profile) { if (el) el.value = opts.ocr_lang; } + if (opts.cpr_only !== undefined) { + const el = document.getElementById('optCprOnly'); + if (el) el.checked = opts.cpr_only; + } + if (opts.scan_emails !== undefined) { const el = document.getElementById('optScanEmails'); if (el) el.checked = opts.scan_emails; @@ -432,6 +437,7 @@ function _openEditorForProfile(profile) {