From 034ced943e28aee7bc81482538b6d84255be9fec Mon Sep 17 00:00:00 2001
From: StyxX65 <150797939+StyxX65@users.noreply.github.com>
Date: Thu, 28 May 2026 17:47:02 +0200
Subject: [PATCH] =?UTF-8?q?Extended=20document=20redaction=20to=20Google?=
=?UTF-8?q?=20Drive,=20SFTP,=20SMB,=20and=20local=20PDFs=20=20=20=20=20=20?=
=?UTF-8?q?Extends=20the=20=E2=9C=82=20in-place=20redaction=20feature=20be?=
=?UTF-8?q?yond=20local=20DOCX/XLSX/CSV/TXT=20files=20=20=20to=20cover=20a?=
=?UTF-8?q?ll=20remaining=20file=20source=20types=20and=20adds=20PDF=20sup?=
=?UTF-8?q?port=20for=20local=20files.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
CHANGELOG.md | 8 +
CLAUDE.md | 3 +-
TODO.md | 19 ++
docs/manuals/MANUAL-DA.md | 4 +-
docs/manuals/MANUAL-EN.md | 4 +-
file_scanner.py | 62 +++++
google_connector.py | 67 +++++
m365_connector.py | 44 ++++
routes/export.py | 521 +++++++++++++++++++++++++++++++++-----
sftp_connector.py | 47 ++++
static/js/results.js | 13 +-
11 files changed, 723 insertions(+), 69 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 210bb40..f11cc66 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,14 @@ Version numbers follow [Semantic Versioning](https://semver.org/spec/v2.0.0.html
### Added
+- **PDF redaction for local files** — the ✂ redact button now works on local PDF files in addition to DOCX, XLSX, CSV, and TXT. Text-based PDFs are redacted using PyMuPDF's physical redaction (`page.apply_redactions()`), which removes the underlying text data from the PDF stream — not just paints over it. Scanned/image-based PDFs go through the OCR bbox path: CPR positions are found via Tesseract then physically painted and sanitised. Falls back to a reportlab overlay if PyMuPDF is not installed; raises a clear error if both libraries are absent.
+
+- **Google Drive file redaction** — the ✂ redact button now works on native DOCX, XLSX, and PDF files stored in Google Drive (both Google Workspace service-account and personal OAuth connectors). The file is downloaded via the Drive API, redacted locally using the same PyMuPDF / python-docx / openpyxl pipeline as local files, then uploaded back as a new revision via `files().update()`. Google Docs/Sheets exported as DOCX are detected by MIME type and refused with a clear message (re-upload after exporting manually). Requires the `drive` scope (not `drive.readonly`) on the service-account domain-wide delegation grant; a 403 surfaces the exact Google error so admins can add the scope. Methods added: `get_drive_file_mime`, `download_drive_file_by_id`, `update_drive_file` on both `GoogleWorkspaceConnector` and `PersonalGoogleConnector`.
+
+- **SFTP file redaction** — the ✂ button now works on SFTP files (DOCX, XLSX, CSV, TXT, PDF). The file is downloaded via paramiko, redacted locally, then written back with `sftp.open(path, "wb")`. Source config is matched from `_load_file_sources()` by host + username; credentials are resolved from the keychain via `_resolve_sftp_credentials`. Requires the item to be in the current session's `state.flagged_items` (SFTP host info is not stored in the DB). New method: `SFTPScanner.write_file(remote_path, content)`.
+
+- **SMB file redaction** — the ✂ button now works on SMB/CIFS network share files (DOCX, XLSX, CSV, TXT, PDF). Source config is looked up by matching the host parsed from `full_path` (`//host/share/…`). The file is downloaded, redacted locally, and written back using smbprotocol with `CreateDisposition.FILE_SUPERSEDE`, which replaces the existing file when it is opened; the content is then written in 1 MiB chunks, so the write-back itself is not atomic. New function: `file_scanner.write_smb_file(smb_path_uri, content, username, password, domain)`.
+
- **AI-enhanced NER via Claude** — Named Entity Recognition (names, addresses, organisations) can now be powered by Claude Haiku instead of spaCy. Enable in **Settings → AI / NER**: paste an Anthropic API key, toggle on, click Test to confirm. When enabled, `document_scanner.py` calls the Claude API (`claude-haiku-4-5-20251001`) instead of spaCy for all three scan engines; results are cached in-memory per document (bounded at 2 000 entries) so repeated scans of the same file never re-charge the API. Falls back to spaCy automatically if the key is missing or the `anthropic` package is not installed. API key stored in `config.json` under `claude_api_key`; toggle stored under `claude_ner`. Routes: `GET/POST /api/settings/claude`, `POST /api/settings/claude/test`.
### Fixed
diff --git a/CLAUDE.md b/CLAUDE.md
index 4f18649..6162762 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -130,7 +130,8 @@ Large M365 tenants can generate enormous memory pressure. Key rules to preserve:
- **Excel Summary sheet vs. per-source tabs** — the Summary sheet shows all scanned sources (even with 0 items). Per-source tabs are only created for sources with items; an empty tab has no value.
- **ART.30 breakdown table** — iterates `scanned_sources` (not `by_source`) so Gmail, Google Drive, etc. appear with `0 | 0 | 0 | —` when the scan found nothing.
- **Role-filtered exports** — `_build_excel_bytes(role='')` and `_build_article30_docx(role='')` accept `role='student'` or `role='staff'`. A local `_items` list is built at the top of each function and used everywhere instead of `state.flagged_items` directly — GPS sheet, External transfers sheet, and Art.30 staff/student tables all see only the filtered subset. Route handlers read `request.args.get('role', '')` and forward it. Filenames get `_elever` / `_ansatte` suffix. The `#filterRole` dropdown in the filter bar drives both the client-side grid filter and the export URL param — do not separate them.
-- **`POST /api/redact_item`** — rewrites a local file in-place with CPR numbers replaced by `██████-████` / `█` blocks, then removes the card from the grid and logs a `"redacted"` disposition. Supported extensions: `.docx`, `.xlsx`, `.csv`, `.txt` (`_REDACT_EXTS`). The file is written to a temp path in the **same directory** as the original before `shutil.move` — this avoids cross-device rename failures on mounted volumes. Uses existing `document_scanner` functions (`redact_docx`, `redact_xlsx`, `redact_csv`, `find_pii_spans_in_text`). Only works for `source_type == "local"` — SMB/cloud files are not supported (button is hidden on those cards). The button (`✂`, class `card-redact-btn`) appears in `appendCard` when `_redactable(f)` is true; hidden in viewer mode and for resolved items.
+- **`POST /api/redact_item`** — rewrites a file in-place with CPR numbers replaced by `██████-████` / `█` blocks, then removes the card from the grid and logs a `"redacted"` disposition. Supported extensions: `.docx`, `.xlsx`, `.csv`, `.txt`, `.pdf` (`_REDACT_EXTS`); M365 and Google Drive files are limited to `.docx`, `.xlsx`, `.pdf`. Supported source types: `local`, `smb`, `sftp`, `gdrive`, `onedrive`, `sharepoint`, `teams` (`_ALL_REDACTABLE_TYPES`); email items are not supported. For local files the output is written to a temp path in the **same directory** as the original before `shutil.move` — this avoids cross-device rename failures on mounted volumes; remote files are downloaded, redacted in a temp file, and written back through the source's connector. Uses existing `document_scanner` functions (`redact_docx`, `redact_xlsx`, `redact_csv`, `find_pii_spans_in_text`, `scan_pdf`, `redact_pdf_secure`). The button (`✂`, class `card-redact-btn`) appears in `appendCard` when `_redactable(f)` is true; hidden in viewer mode and for resolved items.
+- **PDF redaction** — `redact_pdf_secure` uses PyMuPDF `page.apply_redactions()` which physically removes text data from the PDF stream (not just an overlay). Falls back to `redact_pdf` (reportlab overlay) if PyMuPDF is absent. Text-based pages use `find_cpr_char_bboxes`; scanned pages render via OCR at 200 DPI and use `find_cpr_image_bboxes`. Raises `RuntimeError` if both backends are unavailable. Do not add `.pdf` to `_redactExts` in `results.js` without also handling it in `export.py` — the button and the route must stay in sync.
## Scan history browser — static/js/history.js + gdpr_db.py + routes/database.py
diff --git a/TODO.md b/TODO.md
index d5b7faa..78487cd 100644
--- a/TODO.md
+++ b/TODO.md
@@ -181,6 +181,25 @@ Extended the M365 checkpoint/resume mechanism to all three scan engines. Each en
---
+### Extended document anonymisation (redaction beyond local DOCX/XLSX/CSV/TXT)
+
+Originally the ✂ redact button only worked for local files with extensions `.docx`, `.xlsx`, `.csv`, `.txt`. The cases below extend it; items marked ✅ are implemented:
+
+**1. PDF redaction for local files** ✅ — `redact_pdf_secure` (PyMuPDF physical redaction) wired to `_REDACT_EXTS` and the ✂ button. Falls back to reportlab overlay if PyMuPDF is absent.
+
+**2. OneDrive / SharePoint / Teams file redaction** ✅ — `put_drive_item_content()` added to `m365_connector.py`; `redact_item()` in `routes/export.py` extended with a cloud branch: download via Graph, redact to a local temp file, re-upload via PUT. Supports DOCX, XLSX, PDF. ✂ button shown on cloud cards with supported extensions.
+
+**3. Google Drive file redaction** ✅ — `get_drive_file_mime`, `download_drive_file_by_id`, `update_drive_file` added to both `GoogleWorkspaceConnector` and `PersonalGoogleConnector`. `redact_item()` extended with a `gdrive` branch: check MIME type (rejects Google Docs/Sheets), download bytes, redact locally, upload back via `files().update()`. Requires `drive` scope (not `drive.readonly`) on the service-account delegation. ✂ button shown on Drive cards with DOCX/XLSX/PDF extension.
+
+**4. SMB / SFTP file redaction** ✅ — `write_file(remote_path, content)` added to `SFTPScanner`; `write_smb_file(path, content, user, password, domain)` added to `file_scanner.py`. `redact_item()` extended with `sftp` and `smb` branches: download via native protocol, redact locally, write back. Source config matched from `_load_file_sources()`. SFTP requires the item to still be in `state.flagged_items` (in-session only). ✂ button shown on SMB/SFTP cards with DOCX/XLSX/CSV/TXT/PDF extension.
+
+**5. Email body redaction (Exchange / Gmail)** — overwrite the message body via Graph `PATCH /messages/{id}` or Gmail API. High effort and high risk: HTML formatting must be preserved, inline images handled, and a mistake permanently corrupts the email. **Recommendation: skip** — deleting the email is a safer and simpler GDPR response for emails containing CPR numbers.
+
+**Priority order:** PDF (1) first since it reuses existing code. Cloud files (2–4) on demand.
+**Size:** Small (PDF) · Medium (cloud/SMB/SFTP) · **Priority:** Medium
+
+---
+
### #32 — Windowed mode for Profiles, Sources, and Settings ✗ Won't do
The workflow is sequential (configure → scan → review), not parallel — there is no realistic scenario where a modal and the results grid need to be open simultaneously. The Sources panel is already visible in the sidebar. Option A (the least-work path) still loads the full 3800-line JS stack twice. Closed.
diff --git a/docs/manuals/MANUAL-DA.md b/docs/manuals/MANUAL-DA.md
index b047a74..f7cc665 100644
--- a/docs/manuals/MANUAL-DA.md
+++ b/docs/manuals/MANUAL-DA.md
@@ -294,7 +294,9 @@ Klik på **Gem** efter valget. En lille **✓ Gemt**-bekræftelse vises.
### Redigér en lokal fil
-For lokale DOCX-, XLSX-, CSV- og TXT-filer vises en **✂**-knap på kortet. Klikker du på den, overskrives filen på stedet, og alle CPR-numre erstattes med `██████-████`-blokke. Kortet fjernes fra gitteret, og handlingen registreres som en `"redacted"`-disposition. Brug denne mulighed, når du ønsker at anonymisere en fil frem for at slette den helt. Knappen er ikke tilgængelig for e-mails, cloud-filer eller SFTP-filer.
+For lokale DOCX-, XLSX-, CSV-, TXT- og PDF-filer vises en **✂**-knap på kortet. Knappen vises også for SMB- og SFTP-filer af samme typer samt for DOCX-, XLSX- og PDF-filer i OneDrive, SharePoint, Teams og Google Drive. Klikker du på den, overskrives filen på stedet, og alle CPR-numre erstattes med `██████-████`-blokke. Kortet fjernes fra gitteret, og handlingen registreres som en `"redacted"`-disposition. Brug denne mulighed, når du ønsker at anonymisere en fil frem for at slette den helt. Knappen er ikke tilgængelig for e-mails.
+
+> **PDF-sikkerhedsnote:** PDF-redigering sker fysisk — CPR-nummerteksten slettes fra PDF-datastrømmen og er ikke blot dækket over med en sort boks. En læser kan ikke gendanne den oprindelige tekst ved at markere under redigeringen eller ved programmatisk inspektion af filen. Billedbaserede (scannede) PDF-filer understøttes også: scanneren lokaliserer CPR-nummeret på sidebilledet via OCR og overskriver det pågældende område fysisk. Hvis PyMuPDF ikke er installeret, bruges i stedet en reportlab-overlejring, som kun dækker teksten, og garantien ovenfor gælder ikke.
### Massemarkering af flere elementer på én gang
diff --git a/docs/manuals/MANUAL-EN.md b/docs/manuals/MANUAL-EN.md
index f05d1c4..2f1cf6e 100644
--- a/docs/manuals/MANUAL-EN.md
+++ b/docs/manuals/MANUAL-EN.md
@@ -294,7 +294,9 @@ After choosing, click **Save**. A small **✓ Saved** confirmation appears.
### Redacting a local file
-For local DOCX, XLSX, CSV, and TXT files a **✂** button appears in the card. Clicking it rewrites the file in-place, replacing all CPR numbers with `██████-████` blocks. The card is removed from the grid and the action is logged as a `"redacted"` disposition. This is useful when you want to sanitise a file rather than delete it entirely. The button is not available for email items, cloud files, or SFTP files.
+For local DOCX, XLSX, CSV, TXT, and PDF files a **✂** button appears in the card. The button also appears for SMB and SFTP files of the same types, and for DOCX, XLSX, and PDF files in OneDrive, SharePoint, Teams, and Google Drive. Clicking it rewrites the file in-place, replacing all CPR numbers with `██████-████` blocks. The card is removed from the grid and the action is logged as a `"redacted"` disposition. This is useful when you want to sanitise a file rather than delete it entirely. The button is not available for email items.
+
+> **PDF security note:** PDF redaction uses physical removal — the CPR number text is erased from the PDF data stream, not just painted over with a black box. A reader cannot recover the original text by selecting under the redaction or inspecting the file programmatically. Image-based (scanned) PDFs are also supported: the scanner locates the CPR number on the page image via OCR and physically overwrites that region. If PyMuPDF is not installed, a reportlab overlay is used instead, which only covers the text, and the guarantee above does not apply.
### Bulk tagging multiple items at once
diff --git a/file_scanner.py b/file_scanner.py
index 497021a..df87b94 100644
--- a/file_scanner.py
+++ b/file_scanner.py
@@ -551,6 +551,68 @@ def _smb_read_file(tree, smb_path: str) -> bytes:
fh.close(get_attributes=False)
+def write_smb_file(smb_path_uri: str, content: bytes,
+ username: str, password: str, domain: str = "") -> None:
+ """Overwrite an SMB file at smb_path_uri (e.g. '//host/share/folder/file.docx').
+
+ Raises RuntimeError if smbprotocol is not installed.
+ Raises ValueError if the path cannot be parsed.
+ All SMB errors propagate as-is.
+ """
+ if not SMB_OK:
+ raise RuntimeError("smbprotocol not installed — run: pip install smbprotocol")
+
+ norm = smb_path_uri.replace("\\", "/").lstrip("/")
+ parts = norm.split("/", 2)
+ if len(parts) < 2:
+ raise ValueError(f"Cannot parse SMB path '{smb_path_uri}' — expected //host/share[/path]")
+ host = parts[0]
+ share = parts[1]
+ file_rel = parts[2].replace("/", "\\") if len(parts) > 2 else ""
+
+ if not host or not share or not file_rel:
+ raise ValueError(f"Cannot parse SMB path '{smb_path_uri}'")
+
+ import uuid as _uuid
+ conn = Connection(_uuid.uuid4(), host, 445)
+ conn.connect(timeout=30)
+ try:
+ session = Session(conn, username=username, password=password,
+ require_encryption=False)
+ if domain:
+ session.username = f"{domain}\\{username}"
+ session.connect()
+ try:
+ tree = TreeConnect(session, f"\\\\{host}\\{share}")
+ tree.connect()
+ try:
+ fh = Open(tree, file_rel)
+ fh.create(
+ ImpersonationLevel.Impersonation,
+ FilePipePrinterAccessMask.FILE_WRITE_DATA |
+ FilePipePrinterAccessMask.FILE_WRITE_ATTRIBUTES,
+ FileAttributes.FILE_ATTRIBUTE_NORMAL,
+ ShareAccess.FILE_SHARE_NONE,
+ CreateDisposition.FILE_SUPERSEDE,
+ CreateOptions.FILE_NON_DIRECTORY_FILE,
+ )
+ try:
+ chunk_size = 1024 * 1024
+ offset = 0
+ while offset < len(content):
+ chunk = content[offset:offset + chunk_size]
+ fh.write(chunk, offset)
+ offset += len(chunk)
+ finally:
+ fh.close(get_attributes=False)
+ finally:
+ tree.disconnect()
+ finally:
+ session.disconnect()
+ finally:
+ conn.disconnect()
+
+
def _smb_ts(windows_ts: int) -> str:
"""Convert Windows FILETIME (100ns intervals since 1601-01-01) to YYYY-MM-DD."""
if not windows_ts:
diff --git a/google_connector.py b/google_connector.py
index 5bd8228..5451ac6 100644
--- a/google_connector.py
+++ b/google_connector.py
@@ -70,6 +70,9 @@ GMAIL_SCOPES = [
DRIVE_SCOPES = [
"https://www.googleapis.com/auth/drive.readonly",
]
+DRIVE_WRITE_SCOPES = [
+ "https://www.googleapis.com/auth/drive",
+]
ADMIN_SCOPES = [
"https://www.googleapis.com/auth/admin.directory.user.readonly",
]
@@ -284,6 +287,26 @@ class GoogleConnector:
raise GoogleError(f"Drive auth failed for {user_email}: {e}") from e
return _drive_changes_collect(service, user_email, page_token, max_files, max_file_mb)
+ # ── Drive write-back (redaction) ──────────────────────────────────────────
+
+ def get_drive_file_mime(self, user_email: str, file_id: str) -> str:
+ """Return the mimeType of a Drive file."""
+ creds = self._creds_for(user_email, DRIVE_WRITE_SCOPES)
+ service = build("drive", "v3", credentials=creds, cache_discovery=False)
+ return _get_drive_file_mime(service, file_id)
+
+ def download_drive_file_by_id(self, user_email: str, file_id: str) -> bytes:
+ """Download raw bytes of a non-Google-native Drive file by ID."""
+ creds = self._creds_for(user_email, DRIVE_WRITE_SCOPES)
+ service = build("drive", "v3", credentials=creds, cache_discovery=False)
+ return _download_drive_file_by_id(service, file_id)
+
+ def update_drive_file(self, user_email: str, file_id: str, content: bytes, mime_type: str) -> None:
+ """Replace Drive file content in-place. Requires drive (not drive.readonly) scope."""
+ creds = self._creds_for(user_email, DRIVE_WRITE_SCOPES)
+ service = build("drive", "v3", credentials=creds, cache_discovery=False)
+ _update_drive_file_content(service, file_id, content, mime_type)
+
# ── Persistence helpers ───────────────────────────────────────────────────────
@@ -507,6 +530,30 @@ def _download_drive_file(
return None
+def _get_drive_file_mime(service, file_id: str) -> str:
+ """Return the mimeType of a Drive file."""
+ info = service.files().get(fileId=file_id, fields="mimeType").execute()
+ return info.get("mimeType", "")
+
+
+def _download_drive_file_by_id(service, file_id: str) -> bytes:
+ """Download raw bytes of a non-Google-native Drive file by ID."""
+ req = service.files().get_media(fileId=file_id)
+ buf = io.BytesIO()
+ dl = MediaIoBaseDownload(buf, req, chunksize=4 * 1024 * 1024)
+ done = False
+ while not done:
+ _, done = dl.next_chunk()
+ return buf.getvalue()
+
+
+def _update_drive_file_content(service, file_id: str, content: bytes, mime_type: str) -> None:
+ """Replace a Drive file's content in-place."""
+ from googleapiclient.http import MediaInMemoryUpload
+ media = MediaInMemoryUpload(content, mimetype=mime_type, resumable=False)
+ service.files().update(fileId=file_id, media_body=media).execute()
+
+
def _drive_iter(
service,
user_email: str,
@@ -743,6 +790,26 @@ class PersonalGoogleConnector:
raise GoogleError(f"Drive auth failed: {e}") from e
return _drive_changes_collect(service, user_email, page_token, max_files, max_file_mb)
+ # ── Drive write-back (redaction) ──────────────────────────────────────────
+
+ def get_drive_file_mime(self, user_email: str, file_id: str) -> str:
+ """Return the mimeType of a Drive file."""
+ self._refresh_if_needed()
+ service = build("drive", "v3", credentials=self._creds, cache_discovery=False)
+ return _get_drive_file_mime(service, file_id)
+
+ def download_drive_file_by_id(self, user_email: str, file_id: str) -> bytes:
+ """Download raw bytes of a non-Google-native Drive file by ID."""
+ self._refresh_if_needed()
+ service = build("drive", "v3", credentials=self._creds, cache_discovery=False)
+ return _download_drive_file_by_id(service, file_id)
+
+ def update_drive_file(self, user_email: str, file_id: str, content: bytes, mime_type: str) -> None:
+ """Replace Drive file content in-place. Requires drive (not drive.readonly) scope."""
+ self._refresh_if_needed()
+ service = build("drive", "v3", credentials=self._creds, cache_discovery=False)
+ _update_drive_file_content(service, file_id, content, mime_type)
+
@staticmethod
def get_device_code_flow(client_id: str, client_secret: str) -> dict:
"""
diff --git a/m365_connector.py b/m365_connector.py
index 7941351..94b150d 100644
--- a/m365_connector.py
+++ b/m365_connector.py
@@ -885,6 +885,50 @@ class M365Connector:
url = f"{GRAPH_BASE}/drives/{drive_id}/items/{item_id}/content"
return self._get_bytes(url)
+ def put_drive_item_content(self, drive_id: str, item_id: str, content: bytes,
+ user_id: str = "") -> None:
+ """Replace file content via Graph. Tries drives/{drive_id} first; falls back
+ to users/{user_id}/drive when drive_id is absent, then /me/drive."""
+ if drive_id:
+ url = f"{GRAPH_BASE}/drives/{drive_id}/items/{item_id}/content"
+ elif user_id and user_id != "me":
+ url = f"{GRAPH_BASE}/users/{user_id}/drive/items/{item_id}/content"
+ else:
+ url = f"{GRAPH_BASE}/me/drive/items/{item_id}/content"
+
+ for attempt in range(self._MAX_RETRIES):
+ try:
+ r = _requests.put(url, headers={**self._headers(),
+ "Content-Type": "application/octet-stream"},
+ data=content, timeout=self._TIMEOUT_BYTES)
+ except self._RETRYABLE_ERRORS:
+ if attempt == self._MAX_RETRIES - 1:
+ raise
+ self._backoff_sleep(attempt)
+ continue
+
+ if r.status_code == 429:
+ self._backoff_sleep(attempt, float(r.headers.get("Retry-After", 5)))
+ continue
+ if r.status_code in (503, 504):
+ if attempt < self._MAX_RETRIES - 1:
+ self._backoff_sleep(attempt)
+ continue
+ if r.status_code == 401 and attempt == 0:
+ self._token = None
+ if self.try_silent_auth():
+ self.put_drive_item_content(drive_id, item_id, content, user_id)
+ return
+ if r.status_code == 403:
+ try:
+ msg = r.json().get("error", {}).get("message", "")
+ except Exception:
+ msg = r.text[:200]
+ raise M365PermissionError(url, msg)
+ r.raise_for_status()
+ return
+ raise _requests.exceptions.RetryError(f"Gave up after {self._MAX_RETRIES} attempts: {url}")
+
# ── Teams ─────────────────────────────────────────────────────────────────
def list_all_teams(self) -> list:
diff --git a/routes/export.py b/routes/export.py
index 1dbcbf4..446f151 100644
--- a/routes/export.py
+++ b/routes/export.py
@@ -1205,12 +1205,23 @@ def delete_item():
return jsonify({"ok": False, "error": str(e)})
-_REDACT_EXTS = {".docx", ".xlsx", ".csv", ".txt"}
+_REDACT_EXTS = {".docx", ".xlsx", ".csv", ".txt", ".pdf"}
+
+
+_M365_CLOUD_TYPES = {"onedrive", "sharepoint", "teams"}
+
+_GDRIVE_MIME_MAP = {
+ ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+ ".pdf": "application/pdf",
+}
+
+_ALL_REDACTABLE_TYPES = {"local", "smb", "sftp", "gdrive"} | _M365_CLOUD_TYPES
@bp.route("/api/redact_item", methods=["POST"])
def redact_item():
- """Redact CPR numbers in-place in a local file. Returns {ok, redacted}."""
+ """Redact CPR numbers in-place in a local, SMB, SFTP, M365, or Google Drive file."""
from pathlib import Path as _Path
import tempfile as _tempfile
import shutil as _shutil
@@ -1233,77 +1244,461 @@ def redact_item():
item_meta = {}
source_type = item_meta.get("source_type", "")
- if source_type not in ("local",):
- return jsonify({"ok": False, "error": "Redaction is only supported for local files"}), 400
+ is_m365_cloud = source_type in _M365_CLOUD_TYPES
+ if source_type not in _ALL_REDACTABLE_TYPES:
+ return jsonify({"ok": False, "error": "Redaction is only supported for local, SMB, SFTP, M365, and Google Drive files"}), 400
- full_path = item_meta.get("full_path", "")
- if not full_path:
- return jsonify({"ok": False, "error": "File path not available — rescan to enable redaction"}), 400
+ # --- local path branch ---
+ if source_type == "local":
+ full_path = item_meta.get("full_path", "")
+ if not full_path:
+ return jsonify({"ok": False, "error": "File path not available — rescan to enable redaction"}), 400
- path = _Path(full_path).expanduser()
- if not path.exists():
- return jsonify({"ok": False, "error": f"File not found: {full_path}"}), 404
+ path = _Path(full_path).expanduser()
+ if not path.exists():
+ return jsonify({"ok": False, "error": f"File not found: {full_path}"}), 404
- ext = path.suffix.lower()
- if ext not in _REDACT_EXTS:
- return jsonify({"ok": False, "error": f"Redaction not supported for {ext or 'this'} files. Supported: DOCX, XLSX, CSV, TXT"}), 400
+ ext = path.suffix.lower()
+ if ext not in _REDACT_EXTS:
+ return jsonify({"ok": False, "error": f"Redaction not supported for {ext or 'this'} files. Supported: DOCX, XLSX, CSV, TXT, PDF"}), 400
- tmp_path = None
- try:
- from document_scanner import (
- scan_docx, redact_docx,
- scan_xlsx, redact_xlsx,
- redact_csv,
- find_pii_spans_in_text,
- )
-
- with _tempfile.NamedTemporaryFile(suffix=ext, delete=False, dir=path.parent) as tmp:
- tmp_path = _Path(tmp.name)
-
- if ext == ".docx":
- results = scan_docx(path)
- redacted = redact_docx(path, tmp_path, results, use_ner=False)
- elif ext == ".xlsx":
- results = scan_xlsx(path)
- redacted = redact_xlsx(path, tmp_path, results, use_ner=False)
- elif ext == ".csv":
- redacted = redact_csv(path, tmp_path, use_ner=False)
- else: # .txt
- text = path.read_text(encoding="utf-8", errors="replace")
- spans = [(s, e, l) for s, e, l in find_pii_spans_in_text(text, use_ner=False) if l == "CPR"]
- chars = list(text)
- for s, e, _ in sorted(spans, reverse=True):
- chars[s:e] = ["█"] * (e - s)
- tmp_path.write_text("".join(chars), encoding="utf-8")
- redacted = len(spans)
-
- _shutil.move(str(tmp_path), str(path))
tmp_path = None
+ try:
+ from document_scanner import (
+ scan_docx, redact_docx,
+ scan_xlsx, redact_xlsx,
+ redact_csv,
+ scan_pdf, redact_pdf_secure,
+ find_pii_spans_in_text,
+ )
- state.flagged_items[:] = [x for x in state.flagged_items if x.get("id") != item_id]
- _db = _get_db() if DB_OK else None
- if _db:
+ with _tempfile.NamedTemporaryFile(suffix=ext, delete=False, dir=path.parent) as tmp:
+ tmp_path = _Path(tmp.name)
+
+ if ext == ".docx":
+ results = scan_docx(path)
+ redacted = redact_docx(path, tmp_path, results, use_ner=False)
+ elif ext == ".xlsx":
+ results = scan_xlsx(path)
+ redacted = redact_xlsx(path, tmp_path, results, use_ner=False)
+ elif ext == ".csv":
+ redacted = redact_csv(path, tmp_path, use_ner=False)
+ elif ext == ".pdf":
+ results = scan_pdf(path)
+ redacted = redact_pdf_secure(path, tmp_path, results,
+ force_ocr=False, lang="dan+eng",
+ dpi=200, poppler_path=None,
+ use_ner=False)
+ if redacted is False:
+ raise RuntimeError("PDF redaction failed — PyMuPDF and reportlab both unavailable. Install with: pip install pymupdf")
+ else: # .txt
+ text = path.read_text(encoding="utf-8", errors="replace")
+ spans = [(s, e, l) for s, e, l in find_pii_spans_in_text(text, use_ner=False) if l == "CPR"]
+ chars = list(text)
+ for s, e, _ in sorted(spans, reverse=True):
+ chars[s:e] = ["█"] * (e - s)
+ tmp_path.write_text("".join(chars), encoding="utf-8")
+ redacted = len(spans)
+
+ _shutil.move(str(tmp_path), str(path))
+ tmp_path = None
+
+ except Exception as exc:
+ if tmp_path and tmp_path.exists():
+ try:
+ tmp_path.unlink()
+ except Exception:
+ pass
+ logger.exception("[redact] local file error")
+ return jsonify({"ok": False, "error": str(exc)}), 500
+
+ # --- M365 cloud branch (OneDrive / SharePoint / Teams) ---
+ elif is_m365_cloud:
+ conn = state.connector
+ if conn is None:
+ return jsonify({"ok": False, "error": "M365 not connected — cannot redact cloud files"}), 400
+
+ name = item_meta.get("name", "")
+ ext = _Path(name).suffix.lower() if name else ""
+ if ext not in _REDACT_EXTS - {".csv", ".txt"}:
+ return jsonify({"ok": False, "error": f"Redaction not supported for {ext or 'this'} cloud files. Supported: DOCX, XLSX, PDF"}), 400
+
+ drive_id = item_meta.get("drive_id") or item_meta.get("_drive_id", "")
+ account_id = item_meta.get("account_id") or item_meta.get("_account_id", "")
+
+ tmp_path = None
+ try:
+ # Download
+ if drive_id:
+ raw = conn.download_sharepoint_item(drive_id, item_id)
+ elif account_id and account_id != "me":
+ raw = conn.download_drive_item_for(account_id, item_id)
+ else:
+ raw = conn.download_drive_item(item_id)
+
+ from document_scanner import (
+ scan_docx, redact_docx,
+ scan_xlsx, redact_xlsx,
+ scan_pdf, redact_pdf_secure,
+ )
+
+ with _tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
+ tmp.write(raw)
+ tmp_path = _Path(tmp.name)
+ del raw
+
+ with _tempfile.NamedTemporaryFile(suffix=ext, delete=False) as out:
+ out_path = _Path(out.name)
+
+ if ext == ".docx":
+ results = scan_docx(tmp_path)
+ redacted = redact_docx(tmp_path, out_path, results, use_ner=False)
+ elif ext == ".xlsx":
+ results = scan_xlsx(tmp_path)
+ redacted = redact_xlsx(tmp_path, out_path, results, use_ner=False)
+ else: # .pdf
+ results = scan_pdf(tmp_path)
+ redacted = redact_pdf_secure(tmp_path, out_path, results,
+ force_ocr=False, lang="dan+eng",
+ dpi=200, poppler_path=None,
+ use_ner=False)
+ if redacted is False:
+ raise RuntimeError("PDF redaction failed — PyMuPDF and reportlab both unavailable. Install with: pip install pymupdf")
+
+ # Upload redacted bytes back
+ redacted_bytes = out_path.read_bytes()
+ conn.put_drive_item_content(drive_id, item_id, redacted_bytes, user_id=account_id)
+ del redacted_bytes
+
+ except Exception as exc:
+ logger.exception("[redact] cloud file error")
+ return jsonify({"ok": False, "error": str(exc)}), 500
+ finally:
+ for p in ("tmp_path", "out_path"):
+ _p = locals().get(p)
+ if _p and _p.exists():
+ try:
+ _p.unlink()
+ except Exception:
+ pass
+
+ # --- Google Drive branch ---
+ elif source_type == "gdrive":
+ gconn = state.google_connector
+ if gconn is None:
+ return jsonify({"ok": False, "error": "Google not connected — cannot redact Drive files"}), 400
+
+ name = item_meta.get("name", "")
+ ext = _Path(name).suffix.lower() if name else ""
+ if ext not in _GDRIVE_MIME_MAP:
+ return jsonify({"ok": False, "error": f"Redaction not supported for {ext or 'this'} Drive files. Supported: DOCX, XLSX, PDF"}), 400
+
+ # item_id is "gdrive:{file_id}"
+ gfile_id = item_id[len("gdrive:"):] if item_id.startswith("gdrive:") else item_id
+ user_email = item_meta.get("account_id") or item_meta.get("_account_id", "")
+
+ tmp_path = out_path = None
+ try:
+ from document_scanner import (
+ scan_docx, redact_docx,
+ scan_xlsx, redact_xlsx,
+ scan_pdf, redact_pdf_secure,
+ )
+ from google_connector import GoogleError as _GoogleError
+
+ # Refuse Google-native formats (Docs/Sheets exported as DOCX)
try:
- _db.log_deletion(item_meta, reason="redacted")
- _db.delete_item_record(item_id)
- except Exception:
- pass
+ mime = gconn.get_drive_file_mime(user_email, gfile_id)
+ except Exception as exc:
+ return jsonify({"ok": False, "error": f"Could not read Drive file info: {exc}"}), 500
+ if mime.startswith("application/vnd.google-apps."):
+ return jsonify({"ok": False, "error": (
+ "Cannot redact a Google Docs/Sheets/Slides file in-place. "
+ "Export it as DOCX/XLSX/PDF first, then redact the exported copy."
+ )}), 400
- _audit("item_redact",
- f"id={item_id!r} name={item_meta.get('name','')!r} spans={redacted}",
- ip=request.remote_addr or "")
- logger.info("[redact] %s — %d CPR span(s) redacted", path.name, redacted)
- return jsonify({"ok": True, "redacted": redacted})
+ raw = gconn.download_drive_file_by_id(user_email, gfile_id)
- except Exception as e:
- logger.error("[redact] failed: %s", e)
- return jsonify({"ok": False, "error": str(e)})
- finally:
- if tmp_path and tmp_path.exists():
+ with _tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
+ tmp.write(raw)
+ tmp_path = _Path(tmp.name)
+ del raw
+
+ with _tempfile.NamedTemporaryFile(suffix=ext, delete=False) as out:
+ out_path = _Path(out.name)
+
+ if ext == ".docx":
+ results = scan_docx(tmp_path)
+ redacted = redact_docx(tmp_path, out_path, results, use_ner=False)
+ elif ext == ".xlsx":
+ results = scan_xlsx(tmp_path)
+ redacted = redact_xlsx(tmp_path, out_path, results, use_ner=False)
+ else: # .pdf
+ results = scan_pdf(tmp_path)
+ redacted = redact_pdf_secure(tmp_path, out_path, results,
+ force_ocr=False, lang="dan+eng",
+ dpi=200, poppler_path=None,
+ use_ner=False)
+ if redacted is False:
+ raise RuntimeError("PDF redaction failed — PyMuPDF and reportlab both unavailable. Install with: pip install pymupdf")
+
+ redacted_bytes = out_path.read_bytes()
+ gconn.update_drive_file(user_email, gfile_id, redacted_bytes, _GDRIVE_MIME_MAP[ext])
+ del redacted_bytes
+
+ except Exception as exc:
+ logger.exception("[redact] gdrive file error")
+ return jsonify({"ok": False, "error": str(exc)}), 500
+ finally:
+ for _p in (tmp_path, out_path):
+ if _p and _p.exists():
+ try:
+ _p.unlink()
+ except Exception:
+ pass
+
+ # --- SFTP branch ---
+ elif source_type == "sftp":
+ full_path = item_meta.get("full_path", "")
+ source_uri = item_meta.get("account_name", "") # sftp://user@host/root_path
+ if not full_path:
+ return jsonify({"ok": False, "error": "File path not available — rescan to enable SFTP redaction"}), 400
+ if not source_uri:
+ return jsonify({"ok": False, "error": "SFTP source info not in memory — rescan and redact in the same session"}), 400
+
+ ext = _Path(full_path).suffix.lower()
+ if ext not in _REDACT_EXTS:
+ return jsonify({"ok": False, "error": f"Redaction not supported for {ext or 'this'} files. Supported: DOCX, XLSX, CSV, TXT, PDF"}), 400
+
+ # Parse sftp://user@host/root to find matching source config
+ try:
+ from urllib.parse import urlparse as _urlparse
+ _u = _urlparse(source_uri)
+ _sftp_host = _u.hostname or ""
+ _sftp_user = _u.username or ""
+ except Exception:
+ _sftp_host = _sftp_user = ""
+
+ from app_config import _load_file_sources, _resolve_sftp_credentials
+ _sftp_source = next(
+ (s for s in _load_file_sources()
+ if s.get("source_type") == "sftp"
+ and s.get("sftp_host", "") == _sftp_host
+ and s.get("sftp_user", "") == _sftp_user),
+ None,
+ )
+ if _sftp_source is None:
+ return jsonify({"ok": False, "error": f"SFTP source config not found for {_sftp_host} — rescan to enable redaction"}), 400
+
+ _sftp_source = _resolve_sftp_credentials(_sftp_source)
+
+ tmp_path = out_path = None
+ try:
+ from sftp_connector import SFTPScanner as _SFTPScanner
+ from document_scanner import (
+ scan_docx, redact_docx,
+ scan_xlsx, redact_xlsx,
+ redact_csv,
+ scan_pdf, redact_pdf_secure,
+ find_pii_spans_in_text,
+ )
+
+ _sftp = _SFTPScanner(
+ host=_sftp_source.get("sftp_host", ""),
+ root_path=_sftp_source.get("path", "/"),
+ username=_sftp_source.get("sftp_user", ""),
+ port=int(_sftp_source.get("sftp_port", 22)),
+ auth_type=_sftp_source.get("sftp_auth", "password"),
+ password=_sftp_source.get("sftp_password") or None,
+ key_path=_sftp_source.get("sftp_key_path") or None,
+ passphrase=_sftp_source.get("sftp_passphrase") or None,
+ )
+
+ raw = _sftp.read_file(full_path)
+
+ with _tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
+ tmp.write(raw)
+ tmp_path = _Path(tmp.name)
+ del raw
+
+ with _tempfile.NamedTemporaryFile(suffix=ext, delete=False) as out:
+ out_path = _Path(out.name)
+
+ if ext == ".docx":
+ results = scan_docx(tmp_path)
+ redacted = redact_docx(tmp_path, out_path, results, use_ner=False)
+ elif ext == ".xlsx":
+ results = scan_xlsx(tmp_path)
+ redacted = redact_xlsx(tmp_path, out_path, results, use_ner=False)
+ elif ext == ".csv":
+ redacted = redact_csv(tmp_path, out_path, use_ner=False)
+ elif ext == ".pdf":
+ results = scan_pdf(tmp_path)
+ redacted = redact_pdf_secure(tmp_path, out_path, results,
+ force_ocr=False, lang="dan+eng",
+ dpi=200, poppler_path=None,
+ use_ner=False)
+ if redacted is False:
+ raise RuntimeError("PDF redaction failed — install PyMuPDF: pip install pymupdf")
+ else: # .txt
+ text = tmp_path.read_text(encoding="utf-8", errors="replace")
+ spans = [(s, e, l) for s, e, l in find_pii_spans_in_text(text, use_ner=False) if l == "CPR"]
+ chars = list(text)
+ for s, e, _ in sorted(spans, reverse=True):
+ chars[s:e] = ["█"] * (e - s)
+ out_path.write_text("".join(chars), encoding="utf-8")
+ redacted = len(spans)
+
+ _sftp.write_file(full_path, out_path.read_bytes())
+
+ except Exception as exc:
+ logger.exception("[redact] sftp file error")
+ return jsonify({"ok": False, "error": str(exc)}), 500
+ finally:
+ for _p in (tmp_path, out_path):
+ if _p and _p.exists():
+ try:
+ _p.unlink()
+ except Exception:
+ pass
+
+ # --- SMB branch ---
+ elif source_type == "smb":
+ full_path = item_meta.get("full_path", "")
+ if not full_path:
+ return jsonify({"ok": False, "error": "File path not available — rescan to enable SMB redaction"}), 400
+
+ ext = _Path(full_path.replace("\\", "/").split("/")[-1]).suffix.lower()
+ if ext not in _REDACT_EXTS:
+ return jsonify({"ok": False, "error": f"Redaction not supported for {ext or 'this'} files. Supported: DOCX, XLSX, CSV, TXT, PDF"}), 400
+
+ # Parse //host/share/... to find matching source config
+ _norm = full_path.replace("\\", "/").lstrip("/")
+ _parts = _norm.split("/", 2)
+ _smb_host_fp = _parts[0] if len(_parts) > 0 else ""
+
+ from app_config import _load_file_sources
+ from file_scanner import get_smb_password as _get_smb_pw
+ _smb_source = next(
+ (s for s in _load_file_sources()
+ if s.get("source_type", "smb") in ("smb", "")
+ and (s.get("smb_host", "") == _smb_host_fp
+ or s.get("path", "").replace("\\", "/").lstrip("/").split("/")[0] == _smb_host_fp)),
+ None,
+ )
+ if _smb_source is None:
+ return jsonify({"ok": False, "error": f"SMB source config not found for {_smb_host_fp}"}), 400
+
+ _smb_user = _smb_source.get("smb_user", "")
+ _smb_domain = _smb_source.get("smb_domain", "")
+ _smb_kc = _smb_source.get("keychain_key") or None
+ _smb_pw = _smb_source.get("smb_password") or _get_smb_pw(_smb_host_fp, _smb_user, _smb_kc) or ""
+
+ tmp_path = out_path = None
+ try:
+ from file_scanner import write_smb_file as _write_smb
+ from document_scanner import (
+ scan_docx, redact_docx,
+ scan_xlsx, redact_xlsx,
+ redact_csv,
+ scan_pdf, redact_pdf_secure,
+ find_pii_spans_in_text,
+ )
+
+ # Download current content
+ from file_scanner import _smb_read_file as _smb_read, SMB_OK as _SMB_OK
+ if not _SMB_OK:
+ raise RuntimeError("smbprotocol not installed — run: pip install smbprotocol")
+
+ import uuid as _uuid
+ from smbprotocol.connection import Connection as _SmbConn
+ from smbprotocol.session import Session as _SmbSession
+ from smbprotocol.tree import TreeConnect as _SmbTree
+ _norm2 = full_path.replace("\\", "/").lstrip("/")
+ _fp = _norm2.split("/", 2)
+ _fhost = _fp[0]; _fshare = _fp[1] if len(_fp) > 1 else ""
+ _frel = (_fp[2].replace("/", "\\")) if len(_fp) > 2 else ""
+
+ _smb_conn = _SmbConn(_uuid.uuid4(), _fhost, 445)
+ _smb_conn.connect(timeout=30)
try:
- tmp_path.unlink()
- except Exception:
- pass
+ _smb_sess = _SmbSession(_smb_conn,
+ username=f"{_smb_domain}\\{_smb_user}" if _smb_domain else _smb_user,
+ password=_smb_pw, require_encryption=False)
+ _smb_sess.connect()
+ try:
+ _smb_tree = _SmbTree(_smb_sess, f"\\\\{_fhost}\\{_fshare}")
+ _smb_tree.connect()
+ try:
+ raw = _smb_read(_smb_tree, _frel)
+ finally:
+ _smb_tree.disconnect()
+ finally:
+ _smb_sess.disconnect()
+ finally:
+ _smb_conn.disconnect()
+
+ with _tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
+ tmp.write(raw)
+ tmp_path = _Path(tmp.name)
+ del raw
+
+ with _tempfile.NamedTemporaryFile(suffix=ext, delete=False) as out:
+ out_path = _Path(out.name)
+
+ if ext == ".docx":
+ results = scan_docx(tmp_path)
+ redacted = redact_docx(tmp_path, out_path, results, use_ner=False)
+ elif ext == ".xlsx":
+ results = scan_xlsx(tmp_path)
+ redacted = redact_xlsx(tmp_path, out_path, results, use_ner=False)
+ elif ext == ".csv":
+ redacted = redact_csv(tmp_path, out_path, use_ner=False)
+ elif ext == ".pdf":
+ results = scan_pdf(tmp_path)
+ redacted = redact_pdf_secure(tmp_path, out_path, results,
+ force_ocr=False, lang="dan+eng",
+ dpi=200, poppler_path=None,
+ use_ner=False)
+ if redacted is False:
+ raise RuntimeError("PDF redaction failed — install PyMuPDF: pip install pymupdf")
+ else: # .txt
+ text = tmp_path.read_text(encoding="utf-8", errors="replace")
+ spans = [(s, e, l) for s, e, l in find_pii_spans_in_text(text, use_ner=False) if l == "CPR"]
+ chars = list(text)
+ for s, e, _ in sorted(spans, reverse=True):
+ chars[s:e] = ["█"] * (e - s)
+ out_path.write_text("".join(chars), encoding="utf-8")
+ redacted = len(spans)
+
+ _write_smb(full_path, out_path.read_bytes(), _smb_user, _smb_pw, _smb_domain)
+
+ except Exception as exc:
+ logger.exception("[redact] smb file error")
+ return jsonify({"ok": False, "error": str(exc)}), 500
+ finally:
+ for _p in (tmp_path, out_path):
+ if _p and _p.exists():
+ try:
+ _p.unlink()
+ except Exception:
+ pass
+
+ # --- shared: remove from grid + DB ---
+ state.flagged_items[:] = [x for x in state.flagged_items if x.get("id") != item_id]
+ _db = _get_db() if DB_OK else None
+ if _db:
+ try:
+ _db.log_deletion(item_meta, reason="redacted")
+ _db.delete_item_record(item_id)
+ except Exception:
+ pass
+
+ _audit("item_redact",
+ f"id={item_id!r} name={item_meta.get('name','')!r} spans={redacted}",
+ ip=request.remote_addr or "")
+ logger.info("[redact] %s — %d CPR span(s) redacted", item_meta.get('name', item_id), redacted)
+ return jsonify({"ok": True, "redacted": redacted})
@bp.route("/api/delete_bulk", methods=["POST"])
diff --git a/sftp_connector.py b/sftp_connector.py
index 1b1a6d7..e46464e 100644
--- a/sftp_connector.py
+++ b/sftp_connector.py
@@ -154,6 +154,53 @@ class SFTPScanner:
finally:
ssh.close()
+    def _ssh_connect(self):
+        """Return a connected paramiko SSHClient. Caller must call .close().
+
+        Authentication uses the private key at ``self.key_path`` when
+        ``self.auth_type == "key"`` and a key path is set; otherwise it uses
+        password authentication with key lookup and the SSH agent disabled.
+
+        Raises:
+            RuntimeError: if paramiko is not installed (``SFTP_OK`` is falsy).
+            Exception: anything raised while loading the key or connecting is
+                propagated unchanged; the client is closed first so a failed
+                attempt does not leave a transport/socket behind.
+        """
+        if not SFTP_OK:
+            raise RuntimeError("paramiko not installed — run: pip install paramiko")
+        ssh = paramiko.SSHClient()
+        # NOTE(review): AutoAddPolicy accepts unknown host keys without
+        # verification, so a first connection is not protected against
+        # man-in-the-middle — confirm this is intended for this deployment.
+        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+        kw: dict = {
+            "hostname": self.host,
+            "port": self.port,
+            "username": self.username,
+            "timeout": 30,
+        }
+        if self.auth_type == "key" and self.key_path:
+            kw["pkey"] = _load_pkey(self.key_path, self._passphrase)
+        else:
+            kw["password"] = self._password or ""
+            # Password mode: do not silently fall back to local keys or agent.
+            kw["look_for_keys"] = False
+            kw["allow_agent"] = False
+        try:
+            ssh.connect(**kw)
+        except BaseException:
+            # The caller only gets a handle (and a chance to close it) on
+            # success, so release the half-open client here before re-raising.
+            ssh.close()
+            raise
+        return ssh
+
+    def read_file(self, remote_path: str) -> bytes:
+        """Fetch a single remote file and hand back its raw bytes.
+
+        Opens a fresh SSH connection and SFTP channel for this one call and
+        closes both before returning, whether or not the read succeeded.
+        Errors from connecting, opening or reading propagate to the caller.
+        """
+        client = self._ssh_connect()
+        try:
+            channel = client.open_sftp()
+            try:
+                with channel.open(remote_path, "rb") as remote:
+                    payload = remote.read()
+            finally:
+                channel.close()
+        finally:
+            client.close()
+        return payload
+
+    def write_file(self, remote_path: str, content: bytes) -> None:
+        """Store *content* at *remote_path* on the SFTP server.
+
+        The remote file is opened in ``"wb"`` mode, so an existing file is
+        overwritten in place (the write is not atomic). A dedicated SSH
+        connection and SFTP channel are opened for this call and both are
+        closed afterwards, even when the write fails.
+        """
+        client = self._ssh_connect()
+        try:
+            channel = client.open_sftp()
+            try:
+                remote = channel.open(remote_path, "wb")
+                with remote:
+                    remote.write(content)
+            finally:
+                channel.close()
+        finally:
+            client.close()
+
# ── Private walker ────────────────────────────────────────────────────────
def _walk(
diff --git a/static/js/results.js b/static/js/results.js
index c0f0cca..a6b52aa 100644
--- a/static/js/results.js
+++ b/static/js/results.js
@@ -36,9 +36,16 @@ function appendCard(f) {
card.appendChild(cb);
const delBtn = (window.VIEWER_MODE || f._resolved) ? '' : ``;
- const _redactExts = new Set(['.docx', '.xlsx', '.txt', '.csv']);
- const _redactable = !window.VIEWER_MODE && !f._resolved && f.source_type === 'local' && f.cpr_count > 0
- && _redactExts.has((f.name || '').substring((f.name || '').lastIndexOf('.')).toLowerCase());
+ const _redactExts = new Set(['.docx', '.xlsx', '.txt', '.csv', '.pdf']);
+ const _cloudRedactExts = new Set(['.docx', '.xlsx', '.pdf']);
+ const _m365Types = new Set(['onedrive', 'sharepoint', 'teams']);
+ const _fileExt = (f.name || '').substring((f.name || '').lastIndexOf('.')).toLowerCase();
+ const _redactable = !window.VIEWER_MODE && !f._resolved && f.cpr_count > 0 && (
+ f.source_type === 'local' ? _redactExts.has(_fileExt) :
+ _m365Types.has(f.source_type) ? _cloudRedactExts.has(_fileExt) :
+ f.source_type === 'gdrive' ? _cloudRedactExts.has(_fileExt) :
+ (f.source_type === 'smb' || f.source_type === 'sftp') ? _redactExts.has(_fileExt) : false
+ );
const redactBtn = _redactable ? `` : '';
if (S.isListView) {