From fa6601ffdd5ef6fce8dbd10a953b4bc5040ce13d Mon Sep 17 00:00:00 2001 From: StyxX65 <150797939+StyxX65@users.noreply.github.com> Date: Mon, 1 Jun 2026 15:15:43 +0200 Subject: [PATCH] Bugfixes --- CLAUDE.md | 2 +- docs/manuals/MANUAL-DA.md | 2 ++ docs/manuals/MANUAL-EN.md | 2 ++ m365_connector.py | 9 ++++++--- scan_engine.py | 2 ++ scan_scheduler.py | 11 +++++++++++ static/js/scan.js | 2 ++ 7 files changed, 26 insertions(+), 4 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index f5a5c6f..8ce5cf2 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -134,7 +134,7 @@ Large M365 tenants can generate enormous memory pressure. Key rules to preserve: - **Role-filtered exports** — `_build_excel_bytes(role='')` and `_build_article30_docx(role='')` accept `role='student'` or `role='staff'`. A local `_items` list is built at the top of each function and used everywhere instead of `state.flagged_items` directly — GPS sheet, External transfers sheet, and Art.30 staff/student tables all see only the filtered subset. Route handlers read `request.args.get('role', '')` and forward it. Filenames get `_elever` / `_ansatte` suffix. The `#filterRole` dropdown in the filter bar drives both the client-side grid filter and the export URL param — do not separate them. - **`POST /api/redact_item`** — rewrites a file in-place with CPR numbers replaced by `██████-████` / `█` blocks, removes the card from the grid, and logs a `"redacted"` disposition. Supported source types and extensions: - **`local`** — DOCX, XLSX, CSV, TXT, PDF. File is written to a temp path in the same directory then `shutil.move`d (avoids cross-device rename). - - **`onedrive` / `sharepoint` / `teams`** — DOCX, XLSX, PDF. Downloaded via Graph, redacted locally, re-uploaded via `put_drive_item_content()` (PUT with `Content-Type: application/octet-stream`). + - **`onedrive` / `sharepoint` / `teams`** — DOCX, XLSX, PDF. Downloaded via Graph, redacted locally, re-uploaded via `put_drive_item_content()` (PUT with `Content-Type: application/octet-stream`). Requires `Files.ReadWrite.All` — `SCOPES` in `m365_connector.py` now requests this instead of `Files.Read.All` (superset; scanning still works). Delegated-auth users must re-authenticate after upgrading; app-only tenants need the admin to grant `Files.ReadWrite.All` application permission and re-consent. - **`gdrive`** — DOCX, XLSX, PDF. MIME type checked first — Google-native Docs/Sheets (exported as DOCX during scan) are refused with a clear message. Downloaded via `download_drive_file_by_id()`, redacted, uploaded back via `update_drive_file()` (`files().update()`). Requires `drive` scope (not `drive.readonly`) on the service-account delegation. - **`sftp`** — DOCX, XLSX, CSV, TXT, PDF. Source config matched from `_load_file_sources()` by `sftp_host` + `sftp_user` parsed from `item_meta["account_name"]` (the `sftp://user@host/root` URI). Requires the item to still be in `state.flagged_items` — `account_name` is not persisted to the DB. Read/write via `SFTPScanner.read_file()` / `write_file()`. - **`smb`** — DOCX, XLSX, CSV, TXT, PDF. Host + share parsed from `full_path` (`//host/share/…`); source config matched from `_load_file_sources()`. Written back via `file_scanner.write_smb_file()` with `CreateDisposition.FILE_SUPERSEDE`. diff --git a/docs/manuals/MANUAL-DA.md b/docs/manuals/MANUAL-DA.md index 36a2a44..5520ec5 100644 --- a/docs/manuals/MANUAL-DA.md +++ b/docs/manuals/MANUAL-DA.md @@ -310,6 +310,8 @@ Knappen er **ikke** tilgængelig for e-mail-elementer (Exchange/Gmail) eller i v > **PDF-sikkerhedsnote:** PDF-redigering sker fysisk — CPR-nummerteksten slettes fra PDF-datastrømmen og er ikke blot dækket over med en sort boks. En læser kan ikke gendanne den oprindelige tekst ved at markere under redigeringen eller ved programmatisk inspektion af filen. Billedbaserede (scannede) PDF-filer understøttes også: scanneren lokaliserer CPR-nummeret på sidebilledet via OCR og overskriver det pågældende område fysisk. +> **OneDrive / SharePoint / Teams-note:** Redigering skriver den ændrede fil tilbage via Microsoft Graph API og kræver tilladelsen `Files.ReadWrite.All`. Scanneren anmoder nu automatisk om denne tilladelse ved login. Hvis du har godkendt før denne opdatering, skal du logge ud og logge ind igen (Indstillinger → Microsoft 365 → Log ud), så scanneren henter et nyt token med skriveadgang. Ved app-only-opsætninger (serviceprincipal) skal en Global Administrator tildele applikationstilladelsen `Files.ReadWrite.All` i Azure → App-registreringer → API-tilladelser → Giv administratorsamtykke. + > **Google Drev-note:** Redigering i Google Drev kræver `drive`-scopet på servicekontoens domain-wide delegation (ikke blot `drive.readonly`). Hvis redigeringen fejler med en rettighedsfejl, bedes du kontakte din Google Workspace-administrator for at tilføje scopet `https://www.googleapis.com/auth/drive` til servicekontoens delegation i Admin Console. > **SFTP-note:** SFTP-redigering er kun tilgængelig for elementer fundet i den aktuelle scansession. Gennemfør en ny scanning, hvis du gennemser historiske resultater. diff --git a/docs/manuals/MANUAL-EN.md b/docs/manuals/MANUAL-EN.md index 24293d6..709ce3c 100644 --- a/docs/manuals/MANUAL-EN.md +++ b/docs/manuals/MANUAL-EN.md @@ -310,6 +310,8 @@ The button is **not** available for email items (Exchange/Gmail) or viewer mode. > **PDF security note:** PDF redaction uses physical removal — the CPR number text is erased from the PDF data stream, not just painted over with a black box. A reader cannot recover the original text by selecting under the redaction or inspecting the file programmatically. Image-based (scanned) PDFs are also supported: the scanner locates the CPR number on the page image via OCR and physically overwrites that region. +> **OneDrive / SharePoint / Teams note:** Redaction writes the modified file back via the Microsoft Graph API and requires the `Files.ReadWrite.All` permission. The scanner now requests this permission automatically during sign-in. If you authenticated before this update, sign out and sign back in (Settings → Microsoft 365 → Sign out) so the scanner obtains a new token with write access. For app-only (service principal) setups, a Global Admin must grant the `Files.ReadWrite.All` application permission in Azure → App registrations → API permissions → Grant admin consent. + > **Google Drive note:** Drive redaction requires the `drive` scope on the service account's domain-wide delegation grant (not just `drive.readonly`). If redaction fails with a permission error, ask your Google Workspace admin to add the `https://www.googleapis.com/auth/drive` scope to the service account delegation in the Admin Console. > **SFTP note:** SFTP redaction is only available for items found in the current scan session. If you are browsing historical results, re-run the scan first. diff --git a/m365_connector.py b/m365_connector.py index 94b150d..df5d982 100644 --- a/m365_connector.py +++ b/m365_connector.py @@ -39,9 +39,11 @@ except ImportError: GRAPH_BASE = "https://graph.microsoft.com/v1.0" # Delegated scopes — used when signing in as a specific user (device code flow) +# Files.ReadWrite.All is a superset of Files.Read.All; required for in-place +# OneDrive/SharePoint/Teams redaction (PUT /drives/{id}/items/{id}/content). SCOPES = [ "Mail.Read", - "Files.Read.All", + "Files.ReadWrite.All", "Sites.Read.All", "Team.ReadBasic.All", "ChannelMessage.Read.All", @@ -82,8 +84,9 @@ class M365PermissionError(M365Error): f"to access this resource.\n" f" Path: {path}\n" f" Fix: the signed-in user must be a Global/Exchange Admin, OR an admin must " - f"grant Application permissions (Mail.Read, Files.Read.All, Sites.Read.All) " - f"in Azure → App registrations → API permissions → Grant admin consent." + f"grant Application permissions (Mail.Read, Files.ReadWrite.All, Sites.Read.All) " + f"in Azure → App registrations → API permissions → Grant admin consent.\n" + f" Note: Files.ReadWrite.All (not Files.Read.All) is required for file redaction." ) diff --git a/scan_engine.py b/scan_engine.py index af5cf76..bdc0e48 100644 --- a/scan_engine.py +++ b/scan_engine.py @@ -184,6 +184,8 @@ def run_file_scan(source: dict): min_cpr_count = max(1, int(source.get("min_cpr_count", 1))) scan_emails = bool(source.get("scan_emails", False)) scan_phones = bool(source.get("scan_phones", False)) + cpr_only = bool(source.get("cpr_only", False)) + ocr_lang = str(source.get("ocr_lang", "dan+eng")) or "dan+eng" max_mb = int(source.get("max_file_mb", 50)) if source_kind == "sftp": diff --git a/scan_scheduler.py b/scan_scheduler.py index 7dcb4e6..df1ec03 100644 --- a/scan_scheduler.py +++ b/scan_scheduler.py @@ -340,6 +340,16 @@ class ScanScheduler: # Fire file scan for each file source in the profile # file_sources may be IDs (strings) or full dicts — resolve either _all_file_sources = {s["id"]: s for s in (_m._load_file_sources() or []) if isinstance(s, dict)} + # Merge per-scan options from the profile so the file scan honours + # cpr_only/ocr_lang/scan_photos/etc. (the browser does this in + # startScan(); the scheduler must mirror it). + _profile_opts = options.get("options", {}) or {} + _FS_OPT_KEYS = ( + "scan_photos", "skip_gps_images", "min_cpr_count", + "scan_emails", "scan_phones", "cpr_only", "ocr_lang", + "max_file_mb", + ) + _fs_extra = {k: _profile_opts[k] for k in _FS_OPT_KEYS if k in _profile_opts} for fs in options.get("file_sources", []): # Resolve string IDs to full source dicts if isinstance(fs, str): @@ -347,6 +357,7 @@ class ScanScheduler: if not isinstance(fs, dict) or not fs.get("path"): logger.warning("[scheduler] skipping invalid file source: %r", fs) continue + fs = {**fs, **_fs_extra} try: _m.run_file_scan(fs) except Exception as _fse: diff --git a/static/js/scan.js b/static/js/scan.js index 7d6f9da..9f8a7d1 100644 --- a/static/js/scan.js +++ b/static/js/scan.js @@ -623,6 +623,8 @@ function startScan(resume) { min_cpr_count: options.min_cpr_count || 1, scan_emails: options.scan_emails || false, scan_phones: options.scan_phones || false, + cpr_only: options.cpr_only || false, + ocr_lang: options.ocr_lang || 'dan+eng', })) }).then(r => { if (r.status === 409) { _onScanConflict('file'); }