Add OCR language override setting

Operators can now choose Tesseract language pack(s) per profile via a
sidebar select (#optOcrLang) and profile editor (#peOptOcrLang). Presets:
dan+eng (default), dan, eng, dan+eng+deu, dan+eng+swe, dan+eng+fra. The
ocr_lang option flows from the UI through all three scan engines (M365
files/attachments, Google Drive, Gmail) down to document_scanner.scan_pdf
and scan_image — including the spawned PDF-OCR subprocess worker.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
StyxX65 2026-05-28 09:59:40 +02:00
parent 23b9555dcf
commit 2c5f5d3283
11 changed files with 49 additions and 16 deletions

View File

@ -11,6 +11,8 @@ Version numbers follow [Semantic Versioning](https://semver.org/spec/v2.0.0.html
### Added ### Added
- **OCR language override** — a new `ocr_lang` scan option (sidebar select `#optOcrLang`, profile editor `#peOptOcrLang`) lets operators choose the Tesseract language pack(s) used when scanning scanned PDFs and images. Presets: `dan+eng` (default), `dan`, `eng`, `dan+eng+deu`, `dan+eng+swe`, `dan+eng+fra`. The setting flows from the UI through the profile, into all three scan engines (M365 `_scan_bytes_timeout`, M365 attachments `_scan_bytes`, M365 files `_scan_bytes`, Google `_scan_bytes` for both Gmail and Drive). The `lang` parameter is threaded through `cpr_detector._scan_bytes` → `document_scanner.scan_pdf` / `scan_image` and the spawned PDF-OCR subprocess worker. The OCR cache key already included `lang`, so per-language results are cached independently.
- **Built-in file redaction for local files** — a scissor button (`✂`) appears on cards for local DOCX, XLSX, CSV, and TXT files. Clicking it rewrites the file in-place with all detected CPR numbers replaced by `██████-████` (DOCX/XLSX) or `█`-blocks (CSV/TXT), then removes the card from the grid and logs a `"redacted"` disposition. The redaction is atomic: a temp file in the same directory is written first and then moved over the original, so a crash never leaves a half-written file. Implemented in `routes/export.py` (`POST /api/redact_item`) using the existing `document_scanner` redact functions; front-end in `results.js` (`redactItem`) with the button hidden for non-local or unsupported-extension items and for resolved/viewer-mode cards. - **Built-in file redaction for local files** — a scissor button (`✂`) appears on cards for local DOCX, XLSX, CSV, and TXT files. Clicking it rewrites the file in-place with all detected CPR numbers replaced by `██████-████` (DOCX/XLSX) or `█`-blocks (CSV/TXT), then removes the card from the grid and logs a `"redacted"` disposition. The redaction is atomic: a temp file in the same directory is written first and then moved over the original, so a crash never leaves a half-written file. Implemented in `routes/export.py` (`POST /api/redact_item`) using the existing `document_scanner` redact functions; front-end in `results.js` (`redactItem`) with the button hidden for non-local or unsupported-extension items and for resolved/viewer-mode cards.
- **`DELETE /api/delete_item` route registration fix** — the `delete_item` handler in `routes/export.py` was missing its `@bp.route` decorator, so the endpoint was never registered in Flask's URL map. The route now works correctly. - **`DELETE /api/delete_item` route registration fix** — the `delete_item` handler in `routes/export.py` was missing its `@bp.route` decorator, so the endpoint was never registered in Flask's URL map. The route now works correctly.

View File

@ -586,7 +586,7 @@ def _find_emails_phones(text: str) -> dict:
} }
def _scan_bytes(content: bytes, filename: str, poppler_path=None) -> dict: def _scan_bytes(content: bytes, filename: str, poppler_path=None, lang: str = "dan+eng") -> dict:
"""Scan raw bytes for CPRs, emails, and phone numbers. Returns result dict.""" """Scan raw bytes for CPRs, emails, and phone numbers. Returns result dict."""
if not SCANNER_OK: if not SCANNER_OK:
return {"cprs": [], "dates": [], "emails": [], "phones": [], "error": "scanner not available"} return {"cprs": [], "dates": [], "emails": [], "phones": [], "error": "scanner not available"}
@ -608,7 +608,7 @@ def _scan_bytes(content: bytes, filename: str, poppler_path=None) -> dict:
return {"cprs": [], "dates": [], "emails": [], "phones": []} return {"cprs": [], "dates": [], "emails": [], "phones": []}
except Exception: except Exception:
pass # if pdfplumber fails, fall through to full scan_pdf pass # if pdfplumber fails, fall through to full scan_pdf
result = ds.scan_pdf(tmp_path, poppler_path=poppler_path) result = ds.scan_pdf(tmp_path, poppler_path=poppler_path, lang=lang)
elif ext in {".docx", ".doc"}: elif ext in {".docx", ".doc"}:
result = ds.scan_docx(tmp_path) result = ds.scan_docx(tmp_path)
elif ext in {".xlsx", ".xlsm"}: elif ext in {".xlsx", ".xlsm"}:
@ -620,7 +620,7 @@ def _scan_bytes(content: bytes, filename: str, poppler_path=None) -> dict:
cprs, dates = ds.extract_matches(text, 1, "text") cprs, dates = ds.extract_matches(text, 1, "text")
result = {"cprs": cprs, "dates": dates} result = {"cprs": cprs, "dates": dates}
elif ext in {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp"}: elif ext in {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp"}:
result = ds.scan_image(tmp_path) result = ds.scan_image(tmp_path, lang=lang)
else: else:
try: try:
text = content.decode("utf-8", errors="replace") text = content.decode("utf-8", errors="replace")
@ -640,17 +640,17 @@ def _scan_bytes(content: bytes, filename: str, poppler_path=None) -> dict:
result["phones"] = ep["phones"] result["phones"] = ep["phones"]
return result return result
def _worker_scan_pdf(pdf_path_str: str, result_q) -> None: def _worker_scan_pdf(pdf_path_str: str, result_q, lang: str = "dan+eng") -> None:
"""Worker executed in a spawned subprocess — must be a module-level function.""" """Worker executed in a spawned subprocess — must be a module-level function."""
try: try:
import document_scanner as _ds import document_scanner as _ds
from pathlib import Path as _Path from pathlib import Path as _Path
result_q.put(_ds.scan_pdf(_Path(pdf_path_str))) result_q.put(_ds.scan_pdf(_Path(pdf_path_str), lang=lang))
except Exception as e: except Exception as e:
result_q.put({"cprs": [], "dates": [], "error": str(e)}) result_q.put({"cprs": [], "dates": [], "error": str(e)})
def _scan_bytes_timeout(content: bytes, filename: str, timeout: int = 60) -> dict: def _scan_bytes_timeout(content: bytes, filename: str, timeout: int = 60, lang: str = "dan+eng") -> dict:
"""Like _scan_bytes but runs PDF scanning in a spawned subprocess with a hard timeout. """Like _scan_bytes but runs PDF scanning in a spawned subprocess with a hard timeout.
For non-PDF files delegates straight to _scan_bytes. For PDFs it writes the For non-PDF files delegates straight to _scan_bytes. For PDFs it writes the
@ -660,7 +660,7 @@ def _scan_bytes_timeout(content: bytes, filename: str, timeout: int = 60) -> dic
""" """
ext = Path(filename).suffix.lower() ext = Path(filename).suffix.lower()
if ext != ".pdf": if ext != ".pdf":
return _scan_bytes(content, filename) return _scan_bytes(content, filename, lang=lang)
import multiprocessing import multiprocessing
ctx = multiprocessing.get_context("spawn") ctx = multiprocessing.get_context("spawn")
@ -673,7 +673,7 @@ def _scan_bytes_timeout(content: bytes, filename: str, timeout: int = 60) -> dic
try: try:
with _pdf_subprocess_sem: with _pdf_subprocess_sem:
q = ctx.Queue() q = ctx.Queue()
p = ctx.Process(target=_worker_scan_pdf, args=(tmp_path_str, q)) p = ctx.Process(target=_worker_scan_pdf, args=(tmp_path_str, q, lang))
p.start() p.start()
p.join(timeout) p.join(timeout)
if p.is_alive(): if p.is_alive():

View File

@ -577,6 +577,8 @@
"m365_badge_emails": "e-mail", "m365_badge_emails": "e-mail",
"m365_badge_phones": "tlf.", "m365_badge_phones": "tlf.",
"m365_opt_min_cpr_hint": "Filer med færre distinkte CPR-numre end denne tærskel rapporteres ikke. Sæt til 2 for at undgå falske positive, når elever har egne CPR-numre i filer.", "m365_opt_min_cpr_hint": "Filer med færre distinkte CPR-numre end denne tærskel rapporteres ikke. Sæt til 2 for at undgå falske positive, når elever har egne CPR-numre i filer.",
"m365_opt_ocr_lang": "OCR-sprog",
"m365_opt_ocr_lang_hint": "Tesseract-sprogpakke(r) der bruges ved scanning af scannede PDF'er og billeder. Sprogpakker skal være installeret på serveren (f.eks. tesseract-ocr-dan). Flere pakker: dan+eng.",
"m365_filter_photo_only": "📷 Billeder / biometrisk", "m365_filter_photo_only": "📷 Billeder / biometrisk",
"m365_filter_all_roles": "Alle roller", "m365_filter_all_roles": "Alle roller",
"m365_filter_staff": "Ansatte", "m365_filter_staff": "Ansatte",

View File

@ -577,6 +577,8 @@
"m365_badge_emails": "E-Mail", "m365_badge_emails": "E-Mail",
"m365_badge_phones": "Tel.", "m365_badge_phones": "Tel.",
"m365_opt_min_cpr_hint": "Dateien mit weniger eindeutigen CPR-Nummern als dieser Schwellenwert werden nicht gemeldet. Auf 2 setzen, um Falsch-Positive zu vermeiden, wenn Schüler eigene CPR-Nummern in Dateien haben.", "m365_opt_min_cpr_hint": "Dateien mit weniger eindeutigen CPR-Nummern als dieser Schwellenwert werden nicht gemeldet. Auf 2 setzen, um Falsch-Positive zu vermeiden, wenn Schüler eigene CPR-Nummern in Dateien haben.",
"m365_opt_ocr_lang": "OCR-Sprache",
"m365_opt_ocr_lang_hint": "Tesseract-Sprachpaket(e) für das Scannen von gescannten PDFs und Bildern. Pakete müssen auf dem Server installiert sein (z.B. tesseract-ocr-dan). Mehrere Pakete: dan+eng.",
"m365_filter_photo_only": "📷 Fotos / biometrisch", "m365_filter_photo_only": "📷 Fotos / biometrisch",
"m365_filter_all_roles": "Alle Rollen", "m365_filter_all_roles": "Alle Rollen",
"m365_filter_staff": "Personal", "m365_filter_staff": "Personal",

View File

@ -577,6 +577,8 @@
"m365_badge_emails": "email", "m365_badge_emails": "email",
"m365_badge_phones": "phone", "m365_badge_phones": "phone",
"m365_opt_min_cpr_hint": "Files with fewer distinct CPR numbers than this threshold are not reported. Set to 2 to avoid false positives when students have their own CPR in documents.", "m365_opt_min_cpr_hint": "Files with fewer distinct CPR numbers than this threshold are not reported. Set to 2 to avoid false positives when students have their own CPR in documents.",
"m365_opt_ocr_lang": "OCR language",
"m365_opt_ocr_lang_hint": "Tesseract language pack(s) used when scanning scanned PDFs and images. Language packs must be installed on the server (e.g. tesseract-ocr-dan). Multiple packs: dan+eng.",
"m365_filter_photo_only": "📷 Photos / biometric", "m365_filter_photo_only": "📷 Photos / biometric",
"m365_filter_all_roles": "All roles", "m365_filter_all_roles": "All roles",
"m365_filter_staff": "Staff", "m365_filter_staff": "Staff",

View File

@ -143,6 +143,7 @@ def _run_google_scan(options: dict):
delta_enabled = bool(scan_opts.get("delta", False)) delta_enabled = bool(scan_opts.get("delta", False))
scan_emails = bool(scan_opts.get("scan_emails", False)) scan_emails = bool(scan_opts.get("scan_emails", False))
scan_phones = bool(scan_opts.get("scan_phones", False)) scan_phones = bool(scan_opts.get("scan_phones", False))
ocr_lang = str(scan_opts.get("ocr_lang", "dan+eng")) or "dan+eng"
from checkpoint import (_load_delta_tokens, _save_delta_tokens, from checkpoint import (_load_delta_tokens, _save_delta_tokens,
_save_checkpoint, _load_checkpoint, _clear_checkpoint) _save_checkpoint, _load_checkpoint, _clear_checkpoint)
@ -314,7 +315,7 @@ def _run_google_scan(options: dict):
meta["_body_excerpt"] = " ".join(_plain.split())[:500] meta["_body_excerpt"] = " ".join(_plain.split())[:500]
except Exception: except Exception:
meta["_body_excerpt"] = "" meta["_body_excerpt"] = ""
result = _scan_bytes(data, meta.get("name", "msg.txt")) result = _scan_bytes(data, meta.get("name", "msg.txt"), lang=ocr_lang)
except Exception as e: except Exception as e:
broadcast("scan_error", {"file": meta.get("name", ""), "error": str(e)}) broadcast("scan_error", {"file": meta.get("name", ""), "error": str(e)})
_g_scanned_ids.add(_item_id) _g_scanned_ids.add(_item_id)
@ -387,7 +388,7 @@ def _run_google_scan(options: dict):
try: try:
meta["_account"] = _display_name meta["_account"] = _display_name
meta["_source_type"] = "gdrive" meta["_source_type"] = "gdrive"
result = _scan_bytes(data, meta.get("name", "file")) result = _scan_bytes(data, meta.get("name", "file"), lang=ocr_lang)
except Exception as e: except Exception as e:
broadcast("scan_error", {"file": meta.get("name", ""), "error": str(e)}) broadcast("scan_error", {"file": meta.get("name", ""), "error": str(e)})
_g_scanned_ids.add(_item_id) _g_scanned_ids.add(_item_id)

View File

@ -110,8 +110,8 @@ AUDIO_EXTS: set = set()
SUPPORTED_EXTS: set = set() SUPPORTED_EXTS: set = set()
# cpr_detector helpers — injected by gdpr_scanner.py # cpr_detector helpers — injected by gdpr_scanner.py
def _scan_bytes(content, filename, poppler_path=None): return {"cprs": [], "dates": []} # type: ignore[misc] def _scan_bytes(content, filename, poppler_path=None, lang="dan+eng"): return {"cprs": [], "dates": []} # type: ignore[misc]
def _scan_bytes_timeout(content, filename, timeout=60): return {"cprs": [], "dates": []} # type: ignore[misc] def _scan_bytes_timeout(content, filename, timeout=60, lang="dan+eng"): return {"cprs": [], "dates": []} # type: ignore[misc]
def _detect_photo_faces(content, filename): return 0 # type: ignore[misc] def _detect_photo_faces(content, filename): return 0 # type: ignore[misc]
def _extract_exif(content, filename): return {} # type: ignore[misc] def _extract_exif(content, filename): return {} # type: ignore[misc]
def _extract_video_metadata(content, filename): return {} # type: ignore[misc] def _extract_video_metadata(content, filename): return {} # type: ignore[misc]
@ -286,7 +286,7 @@ def run_file_scan(source: dict):
result: dict = {"cprs": [], "dates": []} result: dict = {"cprs": [], "dates": []}
if ext not in PHOTO_EXTS and ext not in VIDEO_EXTS and ext not in AUDIO_EXTS: if ext not in PHOTO_EXTS and ext not in VIDEO_EXTS and ext not in AUDIO_EXTS:
try: try:
result = _scan_bytes_timeout(content, rel_path) result = _scan_bytes_timeout(content, rel_path, lang=ocr_lang)
except Exception as e: except Exception as e:
broadcast("scan_error", {"file": rel_path, "error": str(e)}) broadcast("scan_error", {"file": rel_path, "error": str(e)})
continue continue
@ -476,6 +476,7 @@ def run_scan(options: dict):
scan_photos = bool(scan_opts.get("scan_photos", False)) # biometric photo scan (#9) scan_photos = bool(scan_opts.get("scan_photos", False)) # biometric photo scan (#9)
skip_gps_images= bool(scan_opts.get("skip_gps_images", False)) skip_gps_images= bool(scan_opts.get("skip_gps_images", False))
min_cpr_count = max(1, int(scan_opts.get("min_cpr_count", 1))) min_cpr_count = max(1, int(scan_opts.get("min_cpr_count", 1)))
ocr_lang = str(scan_opts.get("ocr_lang", "dan+eng")) or "dan+eng"
scan_emails = bool(scan_opts.get("scan_emails", False)) scan_emails = bool(scan_opts.get("scan_emails", False))
scan_phones = bool(scan_opts.get("scan_phones", False)) scan_phones = bool(scan_opts.get("scan_phones", False))
@ -1131,7 +1132,7 @@ def run_scan(options: dict):
try: try:
att_bytes = (conn.download_attachment_for(uid, msg_id, att["id"]) att_bytes = (conn.download_attachment_for(uid, msg_id, att["id"])
if uid != "me" else conn.download_attachment(msg_id, att["id"])) if uid != "me" else conn.download_attachment(msg_id, att["id"]))
att_result = _scan_bytes(att_bytes, att_name) att_result = _scan_bytes(att_bytes, att_name, lang=ocr_lang)
att_cprs = att_result.get("cprs", []) att_cprs = att_result.get("cprs", [])
all_cprs.extend(att_cprs) all_cprs.extend(att_cprs)
if scan_emails: if scan_emails:
@ -1183,7 +1184,7 @@ def run_scan(options: dict):
# CPR/email/phone scan — skip for video and audio (metadata-only; no text layer) # CPR/email/phone scan — skip for video and audio (metadata-only; no text layer)
_media_only = ext in VIDEO_EXTS or ext in AUDIO_EXTS _media_only = ext in VIDEO_EXTS or ext in AUDIO_EXTS
result = {"cprs": [], "dates": [], "emails": [], "phones": []} if _media_only else _scan_bytes(content, name) result = {"cprs": [], "dates": [], "emails": [], "phones": []} if _media_only else _scan_bytes(content, name, lang=ocr_lang)
cprs = result.get("cprs", []) cprs = result.get("cprs", [])
emails = result.get("emails", []) if scan_emails else [] emails = result.get("emails", []) if scan_emails else []
phones = result.get("phones", []) if scan_phones else [] phones = result.get("phones", []) if scan_phones else []

View File

@ -137,6 +137,11 @@ function _applyProfile(profile) {
if (el) el.value = opts.min_cpr_count; if (el) el.value = opts.min_cpr_count;
} }
if (opts.ocr_lang !== undefined) {
const el = document.getElementById('optOcrLang');
if (el) el.value = opts.ocr_lang;
}
if (opts.scan_emails !== undefined) { if (opts.scan_emails !== undefined) {
const el = document.getElementById('optScanEmails'); const el = document.getElementById('optScanEmails');
if (el) el.checked = opts.scan_emails; if (el) el.checked = opts.scan_emails;
@ -427,6 +432,7 @@ function _openEditorForProfile(profile) {
<div class="pmgmt-opt-row"><span>${t('m365_opt_scan_photos','Søg efter ansigter i billeder')}</span><label class="toggle"><input type="checkbox" id="peOptPhotos" ${opts.scan_photos ? 'checked' : ''}><span class="toggle-slider"></span></label></div> <div class="pmgmt-opt-row"><span>${t('m365_opt_scan_photos','Søg efter ansigter i billeder')}</span><label class="toggle"><input type="checkbox" id="peOptPhotos" ${opts.scan_photos ? 'checked' : ''}><span class="toggle-slider"></span></label></div>
<div class="pmgmt-opt-row"><span>${t('m365_opt_skip_gps','Ignorer GPS i billeder')}</span><label class="toggle"><input type="checkbox" id="peOptSkipGps" ${opts.skip_gps_images ? 'checked' : ''}><span class="toggle-slider"></span></label></div> <div class="pmgmt-opt-row"><span>${t('m365_opt_skip_gps','Ignorer GPS i billeder')}</span><label class="toggle"><input type="checkbox" id="peOptSkipGps" ${opts.skip_gps_images ? 'checked' : ''}><span class="toggle-slider"></span></label></div>
<div class="pmgmt-opt-row"><span style="color:var(--muted)">${t('m365_opt_min_cpr','Min. CPR-antal pr. fil')}</span><input type="number" id="peOptMinCpr" value="${opts.min_cpr_count || 1}" min="1" max="50" style="width:46px;padding:3px 6px;font-size:11px;text-align:right"></div> <div class="pmgmt-opt-row"><span style="color:var(--muted)">${t('m365_opt_min_cpr','Min. CPR-antal pr. fil')}</span><input type="number" id="peOptMinCpr" value="${opts.min_cpr_count || 1}" min="1" max="50" style="width:46px;padding:3px 6px;font-size:11px;text-align:right"></div>
<div class="pmgmt-opt-row"><span style="color:var(--muted)">${t('m365_opt_ocr_lang','OCR-sprog')}</span><select id="peOptOcrLang" style="font-size:11px;padding:2px 4px;background:var(--surface);border:1px solid var(--border);color:var(--text);border-radius:4px"><option value="dan+eng" ${(opts.ocr_lang||'dan+eng')==='dan+eng'?'selected':''}>dan+eng</option><option value="dan" ${opts.ocr_lang==='dan'?'selected':''}>dan</option><option value="eng" ${opts.ocr_lang==='eng'?'selected':''}>eng</option><option value="dan+eng+deu" ${opts.ocr_lang==='dan+eng+deu'?'selected':''}>dan+eng+deu</option><option value="dan+eng+swe" ${opts.ocr_lang==='dan+eng+swe'?'selected':''}>dan+eng+swe</option><option value="dan+eng+fra" ${opts.ocr_lang==='dan+eng+fra'?'selected':''}>dan+eng+fra</option></select></div>
<div class="pmgmt-opt-row"><span>${t('m365_opt_scan_emails','Søg efter e-mailadresser')}</span><label class="toggle"><input type="checkbox" id="peOptEmails" ${opts.scan_emails ? 'checked' : ''}><span class="toggle-slider"></span></label></div> <div class="pmgmt-opt-row"><span>${t('m365_opt_scan_emails','Søg efter e-mailadresser')}</span><label class="toggle"><input type="checkbox" id="peOptEmails" ${opts.scan_emails ? 'checked' : ''}><span class="toggle-slider"></span></label></div>
<div class="pmgmt-opt-row"><span>${t('m365_opt_scan_phones','Søg efter telefonnumre')}</span><label class="toggle"><input type="checkbox" id="peOptPhones" ${opts.scan_phones ? 'checked' : ''}><span class="toggle-slider"></span></label></div> <div class="pmgmt-opt-row"><span>${t('m365_opt_scan_phones','Søg efter telefonnumre')}</span><label class="toggle"><input type="checkbox" id="peOptPhones" ${opts.scan_phones ? 'checked' : ''}><span class="toggle-slider"></span></label></div>
<hr style="border:none;border-top:1px solid var(--pmgmt-divider);margin:2px 0"> <hr style="border:none;border-top:1px solid var(--pmgmt-divider);margin:2px 0">
@ -645,6 +651,7 @@ async function _pmgmtSaveFullEdit() {
scan_photos: document.getElementById('peOptPhotos')?.checked ?? false, scan_photos: document.getElementById('peOptPhotos')?.checked ?? false,
skip_gps_images: document.getElementById('peOptSkipGps')?.checked ?? false, skip_gps_images: document.getElementById('peOptSkipGps')?.checked ?? false,
min_cpr_count: parseInt(document.getElementById('peOptMinCpr')?.value) || 1, min_cpr_count: parseInt(document.getElementById('peOptMinCpr')?.value) || 1,
ocr_lang: document.getElementById('peOptOcrLang')?.value || 'dan+eng',
scan_emails: document.getElementById('peOptEmails')?.checked ?? false, scan_emails: document.getElementById('peOptEmails')?.checked ?? false,
scan_phones: document.getElementById('peOptPhones')?.checked ?? false, scan_phones: document.getElementById('peOptPhones')?.checked ?? false,
}, },

View File

@ -127,6 +127,7 @@ function buildScanPayload() {
scan_photos: document.getElementById('optScanPhotos') ? document.getElementById('optScanPhotos').checked : false, scan_photos: document.getElementById('optScanPhotos') ? document.getElementById('optScanPhotos').checked : false,
skip_gps_images: document.getElementById('optSkipGps') ? document.getElementById('optSkipGps').checked : false, skip_gps_images: document.getElementById('optSkipGps') ? document.getElementById('optSkipGps').checked : false,
min_cpr_count: document.getElementById('optMinCpr') ? (parseInt(document.getElementById('optMinCpr').value) || 1) : 1, min_cpr_count: document.getElementById('optMinCpr') ? (parseInt(document.getElementById('optMinCpr').value) || 1) : 1,
ocr_lang: document.getElementById('optOcrLang')?.value || 'dan+eng',
scan_emails: document.getElementById('optScanEmails') ? document.getElementById('optScanEmails').checked : false, scan_emails: document.getElementById('optScanEmails') ? document.getElementById('optScanEmails').checked : false,
scan_phones: document.getElementById('optScanPhones') ? document.getElementById('optScanPhones').checked : false, scan_phones: document.getElementById('optScanPhones') ? document.getElementById('optScanPhones').checked : false,
retention_enabled: document.getElementById('optRetention') ? document.getElementById('optRetention').checked : false, retention_enabled: document.getElementById('optRetention') ? document.getElementById('optRetention').checked : false,

View File

@ -137,6 +137,21 @@ document.addEventListener('DOMContentLoaded', applyI18n);
style="width:46px;padding:3px 6px;font-size:11px;text-align:right"> style="width:46px;padding:3px 6px;font-size:11px;text-align:right">
</div> </div>
<!-- OCR language -->
<div class="toggle-row">
<span class="toggle-label" style="flex:1">
<span data-i18n="m365_opt_ocr_lang">OCR language</span><span class="hint-wrap"><span class="hint-icon" onclick="toggleHint(this)">?</span><span class="hint-bubble" data-i18n="m365_opt_ocr_lang_hint">Tesseract language pack(s) used when scanning scanned PDFs and images. Must match installed language packs.</span></span>
</span>
<select id="optOcrLang" style="font-size:11px;padding:2px 4px;background:var(--surface);border:1px solid var(--border);color:var(--text);border-radius:4px">
<option value="dan+eng">dan+eng</option>
<option value="dan">dan</option>
<option value="eng">eng</option>
<option value="dan+eng+deu">dan+eng+deu</option>
<option value="dan+eng+swe">dan+eng+swe</option>
<option value="dan+eng+fra">dan+eng+fra</option>
</select>
</div>
<!-- Scan for email addresses --> <!-- Scan for email addresses -->
<div class="toggle-row"> <div class="toggle-row">
<span class="toggle-label" style="flex:1"> <span class="toggle-label" style="flex:1">

View File

@ -193,7 +193,7 @@ class TestRunGoogleScan:
monkeypatch.setattr(gdpr_scanner, "broadcast", monkeypatch.setattr(gdpr_scanner, "broadcast",
lambda evt, data=None: events.append((evt, data or {}))) lambda evt, data=None: events.append((evt, data or {})))
monkeypatch.setattr(gdpr_scanner, "_scan_bytes", monkeypatch.setattr(gdpr_scanner, "_scan_bytes",
lambda data, name: scan_bytes_result or { lambda data, name, **kw: scan_bytes_result or {
"cprs": [], "pii_counts": None, "emails": [], "phones": [] "cprs": [], "pii_counts": None, "emails": [], "phones": []
}) })
monkeypatch.setattr(checkpoint, "_load_checkpoint", lambda *a, **kw: None) monkeypatch.setattr(checkpoint, "_load_checkpoint", lambda *a, **kw: None)