diff --git a/CHANGELOG.md b/CHANGELOG.md index 871ca95..a99242d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ Version numbers follow [Semantic Versioning](https://semver.org/spec/v2.0.0.html ### Added +- **OCR language override** — a new `ocr_lang` scan option (sidebar select `#optOcrLang`, profile editor `#peOptOcrLang`) lets operators choose the Tesseract language pack(s) used when scanning scanned PDFs and images. Presets: `dan+eng` (default), `dan`, `eng`, `dan+eng+deu`, `dan+eng+swe`, `dan+eng+fra`. The setting flows from the UI through the profile, into all three scan engines (file scan: `_scan_bytes_timeout` in `run_file_scan`; M365: `_scan_bytes` for both attachments and files in `run_scan`; Google: `_scan_bytes` for both Gmail and Drive). The `lang` parameter is threaded through `cpr_detector._scan_bytes` → `document_scanner.scan_pdf` / `scan_image` and the spawned PDF-OCR subprocess worker. The OCR cache key already included `lang`, so per-language results are cached independently. + - **Built-in file redaction for local files** — a scissor button (`✂`) appears on cards for local DOCX, XLSX, CSV, and TXT files. Clicking it rewrites the file in-place with all detected CPR numbers replaced by `██████-████` (DOCX/XLSX) or `█`-blocks (CSV/TXT), then removes the card from the grid and logs a `"redacted"` disposition. The redaction is atomic: a temp file in the same directory is written first and then moved over the original, so a crash never leaves a half-written file. Implemented in `routes/export.py` (`POST /api/redact_item`) using the existing `document_scanner` redact functions; front-end in `results.js` (`redactItem`) with the button hidden for non-local or unsupported-extension items and for resolved/viewer-mode cards. - **`DELETE /api/delete_item` route registration fix** — the `delete_item` handler in `routes/export.py` was missing its `@bp.route` decorator, so the endpoint was never registered in Flask's URL map. The route now works correctly. 
diff --git a/cpr_detector.py b/cpr_detector.py index 1b60645..e210240 100644 --- a/cpr_detector.py +++ b/cpr_detector.py @@ -586,7 +586,7 @@ def _find_emails_phones(text: str) -> dict: } -def _scan_bytes(content: bytes, filename: str, poppler_path=None) -> dict: +def _scan_bytes(content: bytes, filename: str, poppler_path=None, lang: str = "dan+eng") -> dict: """Scan raw bytes for CPRs, emails, and phone numbers. Returns result dict.""" if not SCANNER_OK: return {"cprs": [], "dates": [], "emails": [], "phones": [], "error": "scanner not available"} @@ -608,7 +608,7 @@ def _scan_bytes(content: bytes, filename: str, poppler_path=None) -> dict: return {"cprs": [], "dates": [], "emails": [], "phones": []} except Exception: pass # if pdfplumber fails, fall through to full scan_pdf - result = ds.scan_pdf(tmp_path, poppler_path=poppler_path) + result = ds.scan_pdf(tmp_path, poppler_path=poppler_path, lang=lang) elif ext in {".docx", ".doc"}: result = ds.scan_docx(tmp_path) elif ext in {".xlsx", ".xlsm"}: @@ -620,7 +620,7 @@ def _scan_bytes(content: bytes, filename: str, poppler_path=None) -> dict: cprs, dates = ds.extract_matches(text, 1, "text") result = {"cprs": cprs, "dates": dates} elif ext in {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp"}: - result = ds.scan_image(tmp_path) + result = ds.scan_image(tmp_path, lang=lang) else: try: text = content.decode("utf-8", errors="replace") @@ -640,17 +640,17 @@ def _scan_bytes(content: bytes, filename: str, poppler_path=None) -> dict: result["phones"] = ep["phones"] return result -def _worker_scan_pdf(pdf_path_str: str, result_q) -> None: +def _worker_scan_pdf(pdf_path_str: str, result_q, lang: str = "dan+eng") -> None: """Worker executed in a spawned subprocess — must be a module-level function.""" try: import document_scanner as _ds from pathlib import Path as _Path - result_q.put(_ds.scan_pdf(_Path(pdf_path_str))) + result_q.put(_ds.scan_pdf(_Path(pdf_path_str), lang=lang)) except Exception as e: 
result_q.put({"cprs": [], "dates": [], "error": str(e)}) -def _scan_bytes_timeout(content: bytes, filename: str, timeout: int = 60) -> dict: +def _scan_bytes_timeout(content: bytes, filename: str, timeout: int = 60, lang: str = "dan+eng") -> dict: """Like _scan_bytes but runs PDF scanning in a spawned subprocess with a hard timeout. For non-PDF files delegates straight to _scan_bytes. For PDFs it writes the @@ -660,7 +660,7 @@ def _scan_bytes_timeout(content: bytes, filename: str, timeout: int = 60) -> dic """ ext = Path(filename).suffix.lower() if ext != ".pdf": - return _scan_bytes(content, filename) + return _scan_bytes(content, filename, lang=lang) import multiprocessing ctx = multiprocessing.get_context("spawn") @@ -673,7 +673,7 @@ def _scan_bytes_timeout(content: bytes, filename: str, timeout: int = 60) -> dic try: with _pdf_subprocess_sem: q = ctx.Queue() - p = ctx.Process(target=_worker_scan_pdf, args=(tmp_path_str, q)) + p = ctx.Process(target=_worker_scan_pdf, args=(tmp_path_str, q, lang)) p.start() p.join(timeout) if p.is_alive(): diff --git a/lang/da.json b/lang/da.json index a09e30b..0b95afe 100644 --- a/lang/da.json +++ b/lang/da.json @@ -577,6 +577,8 @@ "m365_badge_emails": "e-mail", "m365_badge_phones": "tlf.", "m365_opt_min_cpr_hint": "Filer med færre distinkte CPR-numre end denne tærskel rapporteres ikke. Sæt til 2 for at undgå falske positive, når elever har egne CPR-numre i filer.", + "m365_opt_ocr_lang": "OCR-sprog", + "m365_opt_ocr_lang_hint": "Tesseract-sprogpakke(r) der bruges ved scanning af scannede PDF'er og billeder. Sprogpakker skal være installeret på serveren (f.eks. tesseract-ocr-dan). 
Flere pakker: dan+eng.", "m365_filter_photo_only": "📷 Billeder / biometrisk", "m365_filter_all_roles": "Alle roller", "m365_filter_staff": "Ansatte", diff --git a/lang/de.json b/lang/de.json index 3e15813..81b786c 100644 --- a/lang/de.json +++ b/lang/de.json @@ -577,6 +577,8 @@ "m365_badge_emails": "E-Mail", "m365_badge_phones": "Tel.", "m365_opt_min_cpr_hint": "Dateien mit weniger eindeutigen CPR-Nummern als dieser Schwellenwert werden nicht gemeldet. Auf 2 setzen, um Falsch-Positive zu vermeiden, wenn Schüler eigene CPR-Nummern in Dateien haben.", + "m365_opt_ocr_lang": "OCR-Sprache", + "m365_opt_ocr_lang_hint": "Tesseract-Sprachpaket(e) für das Scannen von gescannten PDFs und Bildern. Pakete müssen auf dem Server installiert sein (z.B. tesseract-ocr-dan). Mehrere Pakete: dan+eng.", "m365_filter_photo_only": "📷 Fotos / biometrisch", "m365_filter_all_roles": "Alle Rollen", "m365_filter_staff": "Personal", diff --git a/lang/en.json b/lang/en.json index c12ae78..de19caa 100644 --- a/lang/en.json +++ b/lang/en.json @@ -577,6 +577,8 @@ "m365_badge_emails": "email", "m365_badge_phones": "phone", "m365_opt_min_cpr_hint": "Files with fewer distinct CPR numbers than this threshold are not reported. Set to 2 to avoid false positives when students have their own CPR in documents.", + "m365_opt_ocr_lang": "OCR language", + "m365_opt_ocr_lang_hint": "Tesseract language pack(s) used when scanning scanned PDFs and images. Language packs must be installed on the server (e.g. tesseract-ocr-dan). 
Multiple packs: dan+eng.", "m365_filter_photo_only": "📷 Photos / biometric", "m365_filter_all_roles": "All roles", "m365_filter_staff": "Staff", diff --git a/routes/google_scan.py b/routes/google_scan.py index 3c375e6..f9b051e 100644 --- a/routes/google_scan.py +++ b/routes/google_scan.py @@ -143,6 +143,7 @@ def _run_google_scan(options: dict): delta_enabled = bool(scan_opts.get("delta", False)) scan_emails = bool(scan_opts.get("scan_emails", False)) scan_phones = bool(scan_opts.get("scan_phones", False)) + ocr_lang = str(scan_opts.get("ocr_lang", "dan+eng")) or "dan+eng" from checkpoint import (_load_delta_tokens, _save_delta_tokens, _save_checkpoint, _load_checkpoint, _clear_checkpoint) @@ -314,7 +315,7 @@ def _run_google_scan(options: dict): meta["_body_excerpt"] = " ".join(_plain.split())[:500] except Exception: meta["_body_excerpt"] = "" - result = _scan_bytes(data, meta.get("name", "msg.txt")) + result = _scan_bytes(data, meta.get("name", "msg.txt"), lang=ocr_lang) except Exception as e: broadcast("scan_error", {"file": meta.get("name", ""), "error": str(e)}) _g_scanned_ids.add(_item_id) @@ -387,7 +388,7 @@ def _run_google_scan(options: dict): try: meta["_account"] = _display_name meta["_source_type"] = "gdrive" - result = _scan_bytes(data, meta.get("name", "file")) + result = _scan_bytes(data, meta.get("name", "file"), lang=ocr_lang) except Exception as e: broadcast("scan_error", {"file": meta.get("name", ""), "error": str(e)}) _g_scanned_ids.add(_item_id) diff --git a/scan_engine.py b/scan_engine.py index 8e1953a..725906b 100644 --- a/scan_engine.py +++ b/scan_engine.py @@ -110,8 +110,8 @@ AUDIO_EXTS: set = set() SUPPORTED_EXTS: set = set() # cpr_detector helpers — injected by gdpr_scanner.py -def _scan_bytes(content, filename, poppler_path=None): return {"cprs": [], "dates": []} # type: ignore[misc] -def _scan_bytes_timeout(content, filename, timeout=60): return {"cprs": [], "dates": []} # type: ignore[misc] +def _scan_bytes(content, filename, 
poppler_path=None, lang="dan+eng"): return {"cprs": [], "dates": []} # type: ignore[misc] +def _scan_bytes_timeout(content, filename, timeout=60, lang="dan+eng"): return {"cprs": [], "dates": []} # type: ignore[misc] def _detect_photo_faces(content, filename): return 0 # type: ignore[misc] def _extract_exif(content, filename): return {} # type: ignore[misc] def _extract_video_metadata(content, filename): return {} # type: ignore[misc] @@ -286,7 +286,7 @@ def run_file_scan(source: dict): result: dict = {"cprs": [], "dates": []} if ext not in PHOTO_EXTS and ext not in VIDEO_EXTS and ext not in AUDIO_EXTS: try: - result = _scan_bytes_timeout(content, rel_path) + result = _scan_bytes_timeout(content, rel_path, lang=ocr_lang) except Exception as e: broadcast("scan_error", {"file": rel_path, "error": str(e)}) continue @@ -476,6 +476,7 @@ def run_scan(options: dict): scan_photos = bool(scan_opts.get("scan_photos", False)) # biometric photo scan (#9) skip_gps_images= bool(scan_opts.get("skip_gps_images", False)) min_cpr_count = max(1, int(scan_opts.get("min_cpr_count", 1))) + ocr_lang = str(scan_opts.get("ocr_lang", "dan+eng")) or "dan+eng" scan_emails = bool(scan_opts.get("scan_emails", False)) scan_phones = bool(scan_opts.get("scan_phones", False)) @@ -1131,7 +1132,7 @@ def run_scan(options: dict): try: att_bytes = (conn.download_attachment_for(uid, msg_id, att["id"]) if uid != "me" else conn.download_attachment(msg_id, att["id"])) - att_result = _scan_bytes(att_bytes, att_name) + att_result = _scan_bytes(att_bytes, att_name, lang=ocr_lang) att_cprs = att_result.get("cprs", []) all_cprs.extend(att_cprs) if scan_emails: @@ -1183,7 +1184,7 @@ def run_scan(options: dict): # CPR/email/phone scan — skip for video and audio (metadata-only; no text layer) _media_only = ext in VIDEO_EXTS or ext in AUDIO_EXTS - result = {"cprs": [], "dates": [], "emails": [], "phones": []} if _media_only else _scan_bytes(content, name) + result = {"cprs": [], "dates": [], "emails": [], "phones": 
[]} if _media_only else _scan_bytes(content, name, lang=ocr_lang) cprs = result.get("cprs", []) emails = result.get("emails", []) if scan_emails else [] phones = result.get("phones", []) if scan_phones else [] diff --git a/static/js/profiles.js b/static/js/profiles.js index 9db357c..91c18a9 100644 --- a/static/js/profiles.js +++ b/static/js/profiles.js @@ -137,6 +137,11 @@ function _applyProfile(profile) { if (el) el.value = opts.min_cpr_count; } + if (opts.ocr_lang !== undefined) { + const el = document.getElementById('optOcrLang'); + if (el) el.value = opts.ocr_lang; + } + if (opts.scan_emails !== undefined) { const el = document.getElementById('optScanEmails'); if (el) el.checked = opts.scan_emails; @@ -427,6 +432,7 @@ function _openEditorForProfile(profile) {