From 360eb1caedfd2c8352877a747828dbe3e5a3967a Mon Sep 17 00:00:00 2001
From: StyxX65 <150797939+StyxX65@users.noreply.github.com>
Date: Tue, 21 Apr 2026 21:42:54 +0200
Subject: [PATCH] Bugfixes in media detection

---
 CHANGELOG.md    |  2 ++
 CLAUDE.md       |  2 ++
 README.md       |  4 ++--
 file_scanner.py | 21 ++++++---------------
 4 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 858bf48..93e2046 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,6 +19,8 @@ Version numbers follow [Semantic Versioning](https://semver.org/spec/v2.0.0.html
 
 ### Fixed
 
+- **Audio and video files not appearing in local/SMB file scans** — `file_scanner.py` maintained its own hardcoded `DEFAULT_EXTENSIONS` set that was never updated when video and audio extensions were added to `cpr_detector.SUPPORTED_EXTS`. Fixed by importing `SUPPORTED_EXTS` from `cpr_detector` directly; `DEFAULT_EXTENSIONS` is now an alias for it. The redundant module-level `FILE_SCAN_EXTENSIONS` name was removed; use `DEFAULT_EXTENSIONS` or the `FileScanner.FILE_SCAN_EXTENSIONS` class attribute instead. `cpr_detector.SUPPORTED_EXTS` is the single source of truth for all scan sources (M365, Google Drive, local, SMB).
+
 - **Profile copy rename not reflected in left column until modal reopen** — saving a renamed profile via the full editor (`_pmgmtSaveFullEdit`) called `loadProfiles()` to refresh `S._profiles` but never called `_renderProfileMgmt()`, so the left-column list was not repainted. The new name only appeared after closing and reopening the modal. Fixed by calling `_renderProfileMgmt()` immediately after `loadProfiles()` and re-applying the `.active` highlight to the correct row. 10 new route integration tests added for all profile API endpoints; total test count: 182.
 
 ---
diff --git a/CLAUDE.md b/CLAUDE.md
index 166f2dc..ad87881 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -20,6 +20,8 @@ python -m pytest tests/ -q
 
 **Shared content processing** — all three scan engines (M365, Google, file) funnel downloaded bytes through a single function: `cpr_detector._scan_bytes(content, filename)`. It dispatches to the correct parser by file extension. `scan_engine.py` uses the `_scan_bytes_timeout` wrapper for PDFs (subprocess + hard timeout). `routes/google_scan.py` uses `_scan_bytes` directly. Do not duplicate file-type handling in per-source code.
 
+**`cpr_detector.SUPPORTED_EXTS` is the single source of truth** for which file extensions are scanned across all sources. `file_scanner.py` imports it as `DEFAULT_EXTENSIONS` so local/SMB scans stay in sync automatically. `scan_engine.py` uses it to gate M365/SharePoint/Teams file downloads. Do not maintain a separate extension list anywhere else.
+
 **`_scan_bytes` injection pattern** — `scan_engine.py` defines a no-op stub for `_scan_bytes` / `_scan_bytes_timeout` at module level (avoids circular import). `gdpr_scanner.py` overwrites them with the real `cpr_detector` implementations at startup. `routes/google_scan.py` resolves them lazily via `gdpr_scanner.__getattr__`. This is intentional — do not try to import them directly in those modules.
 
 **Blueprints** in `routes/` — see `routes/CLAUDE.md` for state/SSE rules.
diff --git a/README.md b/README.md
index b86abf8..eebe70e 100644
--- a/README.md
+++ b/README.md
@@ -658,12 +658,12 @@ See [SUGGESTIONS.md](SUGGESTIONS.md) for the full feature roadmap with implement
 | `app_config.py` | All persistence — profiles, settings, SMTP config, lang loading, Fernet encryption |
 | `sse.py` | SSE broadcast queue and `_current_scan_id` |
 | `checkpoint.py` | Mid-scan checkpoint save/load, `_checkpoint_key()` |
-| `cpr_detector.py` | CPR pattern matching and validation |
+| `cpr_detector.py` | CPR pattern matching and validation. Defines `SUPPORTED_EXTS` — the single source of truth for which file extensions are scanned across all sources (M365, Google Drive, local/SMB). Also contains `VIDEO_EXTS` and `AUDIO_EXTS` subsets and the metadata extractors `_extract_video_metadata` / `_extract_audio_metadata`. |
 | `document_scanner.py` | Core scanning, redaction, OCR, NER, and PII detection engine |
 | `gdpr_db.py` | SQLite persistence layer — scan results, CPR index, PII hits, dispositions, scan history |
 | `m365_connector.py` | Microsoft Graph API client — auth, token refresh, email/OneDrive/SharePoint/Teams fetchers, delete methods |
 | `google_connector.py` | Google Workspace API client — Gmail, Drive, Admin SDK |
-| `file_scanner.py` | Unified local + SMB/CIFS file iterator — `FileScanner.iter_files()` yields `(path, bytes, metadata)`. SMB reads use a 1-slot sliding-window `ThreadPoolExecutor` (`PREFETCH_WINDOW=1`) with a 60-second per-file timeout. |
+| `file_scanner.py` | Unified local + SMB/CIFS file iterator — `FileScanner.iter_files()` yields `(path, bytes, metadata)`. SMB reads use a 1-slot sliding-window `ThreadPoolExecutor` (`PREFETCH_WINDOW=1`) with a 60-second per-file timeout. `DEFAULT_EXTENSIONS` is an alias of `cpr_detector.SUPPORTED_EXTS` (imported from `cpr_detector`, not a local hardcoded set), so the scannable extension list stays in sync automatically. |
 | `scan_scheduler.py` | In-process APScheduler wrapper — multi-job scheduled scan engine |
 | `templates/index.html` | Single-page HTML shell — Jinja2 template. Two variables: `app_version`, `lang_json`. |
 | `static/style.css` | All application CSS — custom properties, layout, components, light/dark themes |
diff --git a/file_scanner.py b/file_scanner.py
index e8fdfea..497021a 100644
--- a/file_scanner.py
+++ b/file_scanner.py
@@ -24,6 +24,8 @@ import hashlib
 from pathlib import Path, PurePosixPath
 from typing import Iterator
 
+from cpr_detector import SUPPORTED_EXTS as DEFAULT_EXTENSIONS
+
 # ── Optional dependency flags ─────────────────────────────────────────────────
 
 try:
@@ -58,19 +60,8 @@ except ImportError:
 
 KEYCHAIN_SERVICE = "gdpr-scanner-nas"
 
-# File extensions passed through to _scan_bytes().  Matches SUPPORTED_EXTS in
-# gdpr_scanner.py; kept here too so FileScanner can filter without importing it.
-DEFAULT_EXTENSIONS = {
-    ".pdf", ".docx", ".doc", ".xlsx", ".xlsm", ".csv",
-    ".txt", ".eml", ".msg",
-    ".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp",
-    ".heic", ".heif",
-}
-
-# Extensions for local/SMB file scans — PDFs now included; OCR runs in a spawned
-# subprocess with a 60-second hard timeout via _scan_bytes_timeout so hanging
-# Tesseract/Poppler processes can never block the scan thread indefinitely.
-FILE_SCAN_EXTENSIONS = DEFAULT_EXTENSIONS
+# DEFAULT_EXTENSIONS is an alias of cpr_detector.SUPPORTED_EXTS (imported above), the single
+# source of truth: adding a file type in cpr_detector.py extends local/SMB scans automatically.
 
 # Maximum file size to load into memory (bytes).  Files larger than this are
 # skipped with a warning — same guard used by the M365 attachment scanner.
@@ -147,7 +138,7 @@ def store_smb_password(smb_host: str, smb_user: str,
 class FileScanner:
     """Unified local + SMB/CIFS file iterator."""
 
-    FILE_SCAN_EXTENSIONS = FILE_SCAN_EXTENSIONS  # excludes .pdf
+    FILE_SCAN_EXTENSIONS = DEFAULT_EXTENSIONS
     """Unified iterator over local paths and SMB/CIFS network shares.
 
     Usage::
@@ -209,7 +200,7 @@ class FileScanner:
 
         Args:
             extensions:  Set of lowercase extensions to include, e.g. {".pdf", ".docx"}.
-                         Defaults to DEFAULT_EXTENSIONS.
+                         Defaults to DEFAULT_EXTENSIONS (cpr_detector.SUPPORTED_EXTS).
             progress_cb: Optional callable(rel_path) called before each file is read,
                          so the caller can update a progress indicator.