From 360eb1caedfd2c8352877a747828dbe3e5a3967a Mon Sep 17 00:00:00 2001 From: StyxX65 <150797939+StyxX65@users.noreply.github.com> Date: Tue, 21 Apr 2026 21:42:54 +0200 Subject: [PATCH] Bugfixes in media detection --- CHANGELOG.md | 2 ++ CLAUDE.md | 2 ++ README.md | 4 ++-- file_scanner.py | 21 ++++++--------------- 4 files changed, 12 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 858bf48..93e2046 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,8 @@ Version numbers follow [Semantic Versioning](https://semver.org/spec/v2.0.0.html ### Fixed +- **Audio and video files not appearing in local/SMB file scan** — `file_scanner.py` maintained its own hardcoded `DEFAULT_EXTENSIONS` set that was never updated when video and audio extensions were added to `cpr_detector.SUPPORTED_EXTS`. Fixed by importing `SUPPORTED_EXTS` from `cpr_detector` directly; `DEFAULT_EXTENSIONS` is now an alias for it. `cpr_detector.SUPPORTED_EXTS` is the single source of truth for all scan sources (M365, Google Drive, local, SMB). + - **Profile copy rename not reflected in left column until modal reopen** — saving a renamed profile via the full editor (`_pmgmtSaveFullEdit`) called `loadProfiles()` to refresh `S._profiles` but never called `_renderProfileMgmt()`, so the left-column list was not repainted. The new name only appeared after closing and reopening the modal. Fixed by calling `_renderProfileMgmt()` immediately after `loadProfiles()` and re-applying the `.active` highlight to the correct row. 10 new route integration tests added for all profile API endpoints; total test count: 182. --- diff --git a/CLAUDE.md b/CLAUDE.md index 166f2dc..ad87881 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -20,6 +20,8 @@ python -m pytest tests/ -q **Shared content processing** — all three scan engines (M365, Google, file) funnel downloaded bytes through a single function: `cpr_detector._scan_bytes(content, filename)`. It dispatches to the correct parser by file extension. `scan_engine.py` uses the `_scan_bytes_timeout` wrapper for PDFs (subprocess + hard timeout). `routes/google_scan.py` uses `_scan_bytes` directly. Do not duplicate file-type handling in per-source code. +**`cpr_detector.SUPPORTED_EXTS` is the single source of truth** for which file extensions are scanned across all sources. `file_scanner.py` imports it as `DEFAULT_EXTENSIONS` so local/SMB scans stay in sync automatically. `scan_engine.py` uses it to gate M365/SharePoint/Teams file downloads. Do not maintain a separate extension list anywhere else. + **`_scan_bytes` injection pattern** — `scan_engine.py` defines a no-op stub for `_scan_bytes` / `_scan_bytes_timeout` at module level (avoids circular import). `gdpr_scanner.py` overwrites them with the real `cpr_detector` implementations at startup. `routes/google_scan.py` resolves them lazily via `gdpr_scanner.__getattr__`. This is intentional — do not try to import them directly in those modules. **Blueprints** in `routes/` — see `routes/CLAUDE.md` for state/SSE rules. diff --git a/README.md b/README.md index b86abf8..eebe70e 100644 --- a/README.md +++ b/README.md @@ -658,12 +658,12 @@ See [SUGGESTIONS.md](SUGGESTIONS.md) for the full feature roadmap with implement | `app_config.py` | All persistence — profiles, settings, SMTP config, lang loading, Fernet encryption | | `sse.py` | SSE broadcast queue and `_current_scan_id` | | `checkpoint.py` | Mid-scan checkpoint save/load, `_checkpoint_key()` | -| `cpr_detector.py` | CPR pattern matching and validation | +| `cpr_detector.py` | CPR pattern matching and validation. Defines `SUPPORTED_EXTS` — the single source of truth for which file extensions are scanned across all sources (M365, Google Drive, local/SMB). Also contains `VIDEO_EXTS` and `AUDIO_EXTS` subsets and the metadata extractors `_extract_video_metadata` / `_extract_audio_metadata`. | | `document_scanner.py` | Core scanning, redaction, OCR, NER, and PII detection engine | | `gdpr_db.py` | SQLite persistence layer — scan results, CPR index, PII hits, dispositions, scan history | | `m365_connector.py` | Microsoft Graph API client — auth, token refresh, email/OneDrive/SharePoint/Teams fetchers, delete methods | | `google_connector.py` | Google Workspace API client — Gmail, Drive, Admin SDK | -| `file_scanner.py` | Unified local + SMB/CIFS file iterator — `FileScanner.iter_files()` yields `(path, bytes, metadata)`. SMB reads use a 1-slot sliding-window `ThreadPoolExecutor` (`PREFETCH_WINDOW=1`) with a 60-second per-file timeout. | +| `file_scanner.py` | Unified local + SMB/CIFS file iterator — `FileScanner.iter_files()` yields `(path, bytes, metadata)`. SMB reads use a 1-slot sliding-window `ThreadPoolExecutor` (`PREFETCH_WINDOW=1`) with a 60-second per-file timeout. `DEFAULT_EXTENSIONS` is imported from `cpr_detector.SUPPORTED_EXTS` (not a local hardcoded set) so the scannable extension list stays in sync automatically. | | `scan_scheduler.py` | In-process APScheduler wrapper — multi-job scheduled scan engine | | `templates/index.html` | Single-page HTML shell — Jinja2 template. Two variables: `app_version`, `lang_json`. | | `static/style.css` | All application CSS — custom properties, layout, components, light/dark themes | diff --git a/file_scanner.py b/file_scanner.py index e8fdfea..497021a 100644 --- a/file_scanner.py +++ b/file_scanner.py @@ -24,6 +24,8 @@ import hashlib from pathlib import Path, PurePosixPath from typing import Iterator +from cpr_detector import SUPPORTED_EXTS as DEFAULT_EXTENSIONS + # ── Optional dependency flags ───────────────────────────────────────────────── try: @@ -58,19 +60,8 @@ except ImportError: KEYCHAIN_SERVICE = "gdpr-scanner-nas" -# File extensions passed through to _scan_bytes(). Matches SUPPORTED_EXTS in -# gdpr_scanner.py; kept here too so FileScanner can filter without importing it. -DEFAULT_EXTENSIONS = { - ".pdf", ".docx", ".doc", ".xlsx", ".xlsm", ".csv", - ".txt", ".eml", ".msg", - ".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp", - ".heic", ".heif", -} - -# Extensions for local/SMB file scans — PDFs now included; OCR runs in a spawned -# subprocess with a 60-second hard timeout via _scan_bytes_timeout so hanging -# Tesseract/Poppler processes can never block the scan thread indefinitely. -FILE_SCAN_EXTENSIONS = DEFAULT_EXTENSIONS +# DEFAULT_EXTENSIONS is imported from cpr_detector.SUPPORTED_EXTS — single source of truth. +# Adding a new file type to cpr_detector.py automatically extends local/SMB scans too. # Maximum file size to load into memory (bytes). Files larger than this are # skipped with a warning — same guard used by the M365 attachment scanner. @@ -147,7 +138,7 @@ def store_smb_password(smb_host: str, smb_user: str, class FileScanner: """Unified local + SMB/CIFS file iterator.""" - FILE_SCAN_EXTENSIONS = FILE_SCAN_EXTENSIONS # excludes .pdf + FILE_SCAN_EXTENSIONS = DEFAULT_EXTENSIONS """Unified iterator over local paths and SMB/CIFS network shares. Usage:: @@ -209,7 +200,7 @@ class FileScanner: Args: extensions: Set of lowercase extensions to include, e.g. {".pdf", ".docx"}. - Defaults to DEFAULT_EXTENSIONS. + Defaults to DEFAULT_EXTENSIONS (cpr_detector.SUPPORTED_EXTS). progress_cb: Optional callable(rel_path) called before each file is read, so the caller can update a progress indicator.