From 9c7df76fbdc86bc0fc8348773eeefa8080709f78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20H=C3=B8jmark?= Date: Sat, 11 Apr 2026 04:38:11 +0200 Subject: [PATCH] Initial commit --- .github/workflows/build.yml | 168 ++ .gitignore | 91 ++ ACRONYMS.md | 58 + CHANGELOG.md | 2458 +++++++++++++++++++++++++++++ CLAUDE.md | 84 + CONTRIBUTING.md | 130 ++ DEPENDENCIES.md | 140 ++ EFFORT_ESTIMATE.md | 67 + LICENSE | 49 + MAINTAINER.md | 205 +++ README.md | 629 ++++++++ SECURITY.md | 73 + SUGGESTIONS.md | 1537 ++++++++++++++++++ TODO.md | 46 + VERSION | 1 + app_config.py | 794 ++++++++++ build_gdpr.py | 1095 +++++++++++++ build_gdpr.sh | 5 + checkpoint.py | 84 + cpr_detector.py | 446 ++++++ docs/manuals/MANUAL-DA.md | 543 +++++++ docs/manuals/MANUAL-EN.md | 543 +++++++ docs/setup/GOOGLE_SETUP.md | 144 ++ docs/setup/M365_SETUP.md | 160 ++ document_scanner.py | 2659 ++++++++++++++++++++++++++++++++ file_scanner.py | 600 +++++++ gdpr_db.py | 954 ++++++++++++ gdpr_scanner.py | 2212 ++++++++++++++++++++++++++ google_connector.py | 726 +++++++++ icon_gdpr.icns | Bin 0 -> 269162 bytes icon_gdpr.ico | Bin 0 -> 739 bytes icon_gdpr.png | Bin 0 -> 18572 bytes install_macos.sh | 423 +++++ install_windows.ps1 | 568 +++++++ keywords/da.json | 532 +++++++ lang/CLAUDE.md | 7 + lang/da.json | 773 ++++++++++ lang/de.json | 773 ++++++++++ lang/en.json | 773 ++++++++++ m365_connector.py | 1141 ++++++++++++++ m365_launcher.py | 446 ++++++ pytest.ini | 6 + requirements.txt | 48 + routes/CLAUDE.md | 21 + routes/__init__.py | 8 + routes/app_routes.py | 386 +++++ routes/auth.py | 179 +++ routes/database.py | 591 +++++++ routes/email.py | 303 ++++ routes/export.py | 1222 +++++++++++++++ routes/google_auth.py | 246 +++ routes/google_scan.py | 328 ++++ routes/profiles.py | 47 + routes/scan.py | 137 ++ routes/scheduler.py | 156 ++ routes/sources.py | 100 ++ routes/state.py | 41 + routes/users.py | 217 +++ routes/viewer.py | 152 ++ run_tests.sh | 14 + scan_engine.py | 1161 ++++++++++++++ 
scan_scheduler.py | 489 ++++++ skus/education.json | 50 + skus/google_ou_roles.json | 26 + sse.py | 54 + start_gdpr.sh | 5 + static/js/CLAUDE.md | 28 + static/js/auth.js | 198 +++ static/js/connector.js | 684 ++++++++ static/js/log.js | 341 ++++ static/js/profiles.js | 709 +++++++++ static/js/results.js | 886 +++++++++++ static/js/scan.js | 730 +++++++++ static/js/scheduler.js | 439 ++++++ static/js/sources.js | 269 ++++ static/js/state.js | 31 + static/js/ui.js | 120 ++ static/js/users.js | 475 ++++++ static/js/viewer.js | 225 +++ static/style.css | 616 ++++++++ templates/CLAUDE.md | 29 + templates/index.html | 1276 +++++++++++++++ templates/viewer_denied.html | 28 + templates/viewer_pin.html | 82 + tests/__init__.py | 0 tests/conftest.py | 113 ++ tests/test_app_config.py | 254 +++ tests/test_checkpoint.py | 147 ++ tests/test_db.py | 267 ++++ tests/test_document_scanner.py | 224 +++ tests/test_routes.py | 277 ++++ 91 files changed, 36572 insertions(+) create mode 100644 .github/workflows/build.yml create mode 100644 .gitignore create mode 100644 ACRONYMS.md create mode 100644 CHANGELOG.md create mode 100644 CLAUDE.md create mode 100644 CONTRIBUTING.md create mode 100644 DEPENDENCIES.md create mode 100644 EFFORT_ESTIMATE.md create mode 100644 LICENSE create mode 100644 MAINTAINER.md create mode 100644 README.md create mode 100644 SECURITY.md create mode 100644 SUGGESTIONS.md create mode 100644 TODO.md create mode 100644 VERSION create mode 100644 app_config.py create mode 100755 build_gdpr.py create mode 100755 build_gdpr.sh create mode 100644 checkpoint.py create mode 100644 cpr_detector.py create mode 100644 docs/manuals/MANUAL-DA.md create mode 100644 docs/manuals/MANUAL-EN.md create mode 100644 docs/setup/GOOGLE_SETUP.md create mode 100644 docs/setup/M365_SETUP.md create mode 100644 document_scanner.py create mode 100644 file_scanner.py create mode 100644 gdpr_db.py create mode 100644 gdpr_scanner.py create mode 100644 google_connector.py create mode 100644 
icon_gdpr.icns create mode 100644 icon_gdpr.ico create mode 100644 icon_gdpr.png create mode 100755 install_macos.sh create mode 100644 install_windows.ps1 create mode 100644 keywords/da.json create mode 100644 lang/CLAUDE.md create mode 100644 lang/da.json create mode 100644 lang/de.json create mode 100644 lang/en.json create mode 100644 m365_connector.py create mode 100644 m365_launcher.py create mode 100644 pytest.ini create mode 100644 requirements.txt create mode 100644 routes/CLAUDE.md create mode 100644 routes/__init__.py create mode 100644 routes/app_routes.py create mode 100644 routes/auth.py create mode 100644 routes/database.py create mode 100644 routes/email.py create mode 100644 routes/export.py create mode 100644 routes/google_auth.py create mode 100644 routes/google_scan.py create mode 100644 routes/profiles.py create mode 100644 routes/scan.py create mode 100644 routes/scheduler.py create mode 100644 routes/sources.py create mode 100644 routes/state.py create mode 100644 routes/users.py create mode 100644 routes/viewer.py create mode 100755 run_tests.sh create mode 100644 scan_engine.py create mode 100644 scan_scheduler.py create mode 100644 skus/education.json create mode 100644 skus/google_ou_roles.json create mode 100644 sse.py create mode 100755 start_gdpr.sh create mode 100644 static/js/CLAUDE.md create mode 100644 static/js/auth.js create mode 100644 static/js/connector.js create mode 100644 static/js/log.js create mode 100644 static/js/profiles.js create mode 100644 static/js/results.js create mode 100644 static/js/scan.js create mode 100644 static/js/scheduler.js create mode 100644 static/js/sources.js create mode 100644 static/js/state.js create mode 100644 static/js/ui.js create mode 100644 static/js/users.js create mode 100644 static/js/viewer.js create mode 100644 static/style.css create mode 100644 templates/CLAUDE.md create mode 100644 templates/index.html create mode 100644 templates/viewer_denied.html create mode 100644 
templates/viewer_pin.html create mode 100644 tests/__init__.py create mode 100644 tests/conftest.py create mode 100644 tests/test_app_config.py create mode 100644 tests/test_checkpoint.py create mode 100644 tests/test_db.py create mode 100644 tests/test_document_scanner.py create mode 100644 tests/test_routes.py diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000..8cfce70 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,168 @@ +name: Build — Windows & Linux + +# Trigger on every push to main, on version tags, or manually +on: + push: + branches: [main] + tags: ['v*'] + workflow_dispatch: + +# Only run one build at a time per branch to avoid race conditions +concurrency: + group: build-${{ github.ref }} + cancel-in-progress: true + +jobs: + + # ── Document Scanner ────────────────────────────────────────────────────── + build-document-scanner: + strategy: + fail-fast: false + matrix: + include: + - os: windows-latest + name: windows + artifact_glob: "dist/*.exe" + - os: ubuntu-22.04 + name: linux + artifact_glob: "dist/Document Scanner" + + runs-on: ${{ matrix.os }} + name: Document Scanner / ${{ matrix.name }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python 3.12 + uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: pip + + # Linux: install system libraries required by OpenCV, pdf2image, Tesseract + - name: Install Linux system dependencies + if: runner.os == 'Linux' + run: | + sudo apt-get update -qq + sudo apt-get install -y --no-install-recommends \ + tesseract-ocr tesseract-ocr-dan tesseract-ocr-deu \ + poppler-utils \ + libgtk-3-dev libwebkit2gtk-4.0-dev \ + libglib2.0-dev libcairo2-dev pkg-config \ + python3-dev + + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + # Download the Danish spaCy model used for NER/anonymisation + - name: Download spaCy model + run: python -m 
spacy download da_core_news_sm + + - name: Build Document Scanner + run: python build.py + + # Zip the Linux binary (no installer on Linux) + - name: Package Linux binary + if: runner.os == 'Linux' + run: | + cd dist + zip -r "Document_Scanner_linux_x86_64.zip" "Document Scanner" + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: DocumentScanner-${{ matrix.name }} + retention-days: 30 + path: | + dist/*.exe + dist/Document_Scanner_linux_x86_64.zip + + # ── GDPRScanner ────────────────────────────────────────────────────────── + build-m365-scanner: + strategy: + fail-fast: false + matrix: + include: + - os: windows-latest + name: windows + artifact_glob: "dist/*.exe" + - os: ubuntu-22.04 + name: linux + artifact_glob: "dist/GDPRScanner" + + runs-on: ${{ matrix.os }} + name: GDPRScanner / ${{ matrix.name }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python 3.12 + uses: actions/setup-python@v5 + with: + python-version: "3.12" + cache: pip + + - name: Install Linux system dependencies + if: runner.os == 'Linux' + run: | + sudo apt-get update -qq + sudo apt-get install -y --no-install-recommends \ + libgtk-3-dev libwebkit2gtk-4.0-dev \ + libglib2.0-dev libcairo2-dev pkg-config \ + python3-dev + + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + # GDPRScanner only needs a subset — skip OCR/CV heavy deps + pip install flask msal requests openpyxl pillow \ + python-docx \ + pywebview pystray \ + pyinstaller pyinstaller-hooks-contrib + + - name: Build GDPRScanner + run: python build_gdpr.py + + - name: Package Linux binary + if: runner.os == 'Linux' + run: | + cd dist + zip -r "GDPRScanner_linux_x86_64.zip" "GDPRScanner" + + - name: Upload artifact + uses: actions/upload-artifact@v4 + with: + name: M365Scanner-${{ matrix.name }} + retention-days: 30 + path: | + dist/*.exe + dist/GDPRScanner_linux_x86_64.zip + + # ── Release (only on version tags v*) 
──────────────────────────────────── + release: + name: Create GitHub Release + needs: [build-document-scanner, build-m365-scanner] + if: startsWith(github.ref, 'refs/tags/v') + runs-on: ubuntu-latest + permissions: + contents: write + + steps: + - name: Download all artifacts + uses: actions/download-artifact@v4 + with: + path: artifacts + merge-multiple: true + + - name: Create release + uses: softprops/action-gh-release@v2 + with: + name: ${{ github.ref_name }} + draft: false + prerelease: ${{ contains(github.ref_name, '-beta') || contains(github.ref_name, '-rc') }} + generate_release_notes: true + files: artifacts/** diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..aa89886 --- /dev/null +++ b/.gitignore @@ -0,0 +1,91 @@ +# VERSION, CHANGELOG.md, LICENSE, README.md — always commit these +# (VERSION is plain text, not JSON, so the *.json rule does not catch it) + +# ── Credentials and config (NEVER commit these) ─────────────────────────────── +*.json +!lang/*.json +!keywords/*.json +!skus/*.json +!package*.json + +# Be explicit about the most sensitive files +.m365_scanner_config.json +.m365_scanner_smtp.json +.m365_scanner_settings.json +.m365_scanner_delta.json +.m365_scanner_checkpoint.json +.m365_scanner_lang +.document_scanner_lang + +# ── Databases (contain personal data) ──────────────────────────────────────── +*.db +*.sqlite +*.sqlite3 + +# ── Audit logs (contain personal data) ─────────────────────────────────────── +*.jsonl +scanner_audit.jsonl + +# ── Python ──────────────────────────────────────────────────────────────────── +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +venv/ +.venv/ +env/ +ENV/ +*.egg-info/ +dist/ +build/ +.eggs/ +pip-wheel-metadata/ +*.egg + +# ── PyInstaller output ──────────────────────────────────────────────────────── +dist/ +build/ +*.spec +*.exe +*.app + +# ── Node (docx generation) ──────────────────────────────────────────────────── +node_modules/ +npm-debug.log* + +# ── macOS 
───────────────────────────────────────────────────────────────────── +.DS_Store +.DS_Store? +._* +.Spotlight-V3 +.Trashes +Icon? + +# ── Windows ─────────────────────────────────────────────────────────────────── +Thumbs.db +ehthumbs.db +Desktop.ini +$RECYCLE.BIN/ + +# ── Editor / IDE ────────────────────────────────────────────────────────────── +.vscode/ +.idea/ +*.swp +*.swo +*~ +.project +.settings/ + +# ── Test artifacts ──────────────────────────────────────────────────────────── +.pytest_cache/ +.coverage +htmlcov/ +.tox/ + +# ── Temporary / local ───────────────────────────────────────────────────────── +*.tmp +*.bak +*.orig +tools/ +# Tools folder is created by the installer — not part of the repo diff --git a/ACRONYMS.md b/ACRONYMS.md new file mode 100644 index 0000000..5541d45 --- /dev/null +++ b/ACRONYMS.md @@ -0,0 +1,58 @@ +# Acronyms and Abbreviations + +GDPR-related terms and abbreviations used throughout the GDPR Scanner project. + +## GDPR / Legal + +| Term | Full name | Meaning in context | +|---|---|---| +| GDPR | General Data Protection Regulation | The EU regulation (2016/679) — the primary legal framework the scanner addresses | +| CPR | Centrale Personregister | Danish national personal identification number (DDMMYY-XXXX) | +| PII | Personally Identifiable Information | Any data that can identify a person — names, addresses, phone numbers, IBANs etc. | +| NER | Named Entity Recognition | ML technique (via spaCy) used to detect names, addresses, and organisations in text | +| DPA | Data Protection Authority | Supervisory authority — in Denmark: Datatilsynet | +| DSR | Data Subject Request | A request from an individual to access, correct, or delete their data (Art. 15/17) | +| DPIA | Data Protection Impact Assessment | Risk assessment required before high-risk processing (Art. 
35) — not yet in scanner | +| RoPA | Register of Processing Activities | The Article 30 register — what the Art.30 export produces | +| IBAN | International Bank Account Number | Financial identifier detected as sensitive PII | +| SKU | Stock Keeping Unit | In context: Microsoft license product code used to classify student vs staff accounts | + +## GDPR Articles referenced in this project + +| Article | Subject | +|---|---| +| Art. 5(1)(a) | Lawfulness, fairness, transparency | +| Art. 5(1)(b) | Purpose limitation | +| Art. 5(1)(c) | Data minimisation | +| Art. 5(1)(e) | Storage limitation — basis for retention enforcement | +| Art. 5(2) | Accountability — basis for the deletion audit log | +| Art. 8 | Conditions for child consent — age threshold | +| Art. 9 | Special categories of personal data (biometric, health, criminal etc.) | +| Art. 15 | Right of access — basis for data subject lookup | +| Art. 17 | Right to erasure ("right to be forgotten") | +| Art. 30 | Records of processing activities — basis for Article 30 export | +| Art. 35 | Data Protection Impact Assessment | +| Art. 44–46 | Transfers to third countries | +| Art. 
89 | Archiving in the public interest — potential basis for retaining historical data | + +## Danish law + +| Term | Meaning | +|---|---| +| Databeskyttelsesloven | Danish Data Protection Act — supplements GDPR in Denmark | +| Databeskyttelsesloven §6 | Sets digital consent age at 15 — below this, parental consent required | +| Bogføringsloven | Danish Bookkeeping Act — requires accounting records for 5 years from end of financial year | +| Datatilsynet | Danish Data Protection Authority — the national supervisory body | + +## Microsoft 365 / Technical + +| Term | Full name | Meaning in context | +|---|---|---| +| M365 | Microsoft 365 | The cloud productivity suite (Exchange, OneDrive, SharePoint, Teams) | +| AAD / Entra | Azure Active Directory / Microsoft Entra ID | Microsoft's identity and access management service | +| MSAL | Microsoft Authentication Library | Library used for OAuth2 authentication against Azure AD | +| UPN | User Principal Name | Microsoft's unique user identifier — typically the user's email address | +| SKU | Stock Keeping Unit | Microsoft license product code (e.g. M365EDU_A3_STUDENT) | +| SPO | SharePoint Online | Microsoft's cloud document management platform | +| SSE | Server-Sent Events | HTTP streaming used to push scan results to the browser in real time | +| ORM | Object-Relational Mapping | Not used — the scanner uses raw SQL via sqlite3 | diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..ef189f4 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,2458 @@ +# Changelog + +All notable changes to GDPR Scanner are documented here. + +Format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). +Version numbers follow [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +--- + +## [Unreleased] + +### Added + +- **`EFFORT_ESTIMATE.md`** — build effort estimate document covering component-by-component hour breakdowns and complexity drivers for the project. 
+- **Settings → Security tab** — new dedicated pane in the Settings modal. Admin PIN and Viewer PIN groups moved here from the General tab, which now contains only Appearance and About. The Share modal's **Configure** button navigates directly to the Security tab. +- **Viewer mode layout** — the sidebar, log panel, and progress bar are now hidden in viewer mode so results fill the full window width. The `🔍 GDPRScanner` brand is shown in the top-left of the topbar (replacing the sidebar header) at the same size and weight as the normal sidebar title. + +### Fixed + +- **Share modal — Revoke / Copy buttons broken** — `JSON.stringify(token)` produced a double-quoted string that terminated the surrounding `onclick="…"` HTML attribute early, so neither button fired its handler. Both now pass the token as a single-quoted JS string literal, which is safe for the hex token format. +- **Viewer PIN — Clear PIN rejected with "current PIN is incorrect"** — clicking **Clear PIN** without first typing in the Current PIN field sent an empty string to the server, which correctly rejected it. A client-side guard now validates the field is non-empty before sending the request, and focuses the input with an inline error message if it is empty. +- **Share modal — all UI strings now translated** — the Share results modal and Viewer PIN settings group were fully hardcoded in English. All visible strings are now backed by i18n keys (`share_*`, `viewer_pin_*`) in `en.json`, `da.json`, and `de.json`. + +--- + +## [1.6.14] — 2026-04-10 + +### Added — read-only viewer mode (#33) + +A DPO, school principal, or compliance coordinator can now review scan results and tag dispositions without access to scan controls, credentials, or settings. + +**Token links** + +- New `🔗` **Share** button in the topbar opens a token management modal. +- **Create** generates a 64-char hex token (`secrets.token_hex(32)`) with an optional label and expiry (7 d / 30 d / 90 d / 1 yr / never). 
+- **Copy** copies the full `http://host:5100/view?token=…` URL to the clipboard. +- **Revoke** deletes the token immediately; any browser using it is locked out on next navigation. +- Tokens are stored in `~/.gdprscanner/viewer_tokens.json` with `created_at`, `expires_at`, and `last_used_at` metadata. Expired tokens are cleaned up on each list fetch. + +**PIN alternative** + +- A 4–8 digit numeric PIN can be set in **Settings → General → Viewer PIN**. +- Opening `/view` without a token shows a PIN entry form (`templates/viewer_pin.html`). +- Correct PIN sets a Flask session cookie (`session["viewer_ok"]`) valid for the browser session — no token needed after that. +- Brute-force guard: 5 failed attempts per 5 minutes per IP returns 429. +- PIN stored as salted SHA-256 inside `viewer_tokens.json` (no extra dependencies). + +**`/view` route** + +- Checks `?token=` first (validates + binds session), then existing session cookie, then PIN form (if a PIN is configured), then 403. +- Serves the same `index.html` with `window.VIEWER_MODE = true` injected. +- Invalid/expired tokens show `templates/viewer_denied.html`. + +**Viewer mode (JS)** + +- `auth.js` — bypasses M365 auth check entirely; adds `viewer-mode` class to `<body>`; shows scanner screen immediately. +- `results.js` — on `DOMContentLoaded` calls `_loadViewerResults()` which fetches `GET /api/db/flagged` (all items from the last completed scan session, joined with dispositions) and renders the grid directly — no SSE required. +- CSS (`body.viewer-mode`) hides: Sources/Options/Accounts sidebar panels; Scan/Stop buttons; profile bar; config-group buttons; resume banner; bulk-delete button; per-card delete button; data-subject delete button; Share button. +- Disposition tagging (select + Save) remains fully functional — `/api/db/disposition` has no auth guard. +- Filter bar, Excel export, Art.30 export, preview panel, and log remain accessible. 
+ +**New files:** `routes/viewer.py`, `static/js/viewer.js`, `templates/viewer_pin.html`, `templates/viewer_denied.html` + +**Files changed:** `app_config.py`, `gdpr_scanner.py`, `templates/index.html`, `static/style.css`, `static/js/auth.js`, `static/js/results.js`, `static/js/scheduler.js`, `routes/database.py` + +--- + +### Fixed — memory exhaustion during large M365 scans + +Addressed root causes of runaway memory growth (reported: up to 90 GB RSS) that could crash the host machine during scans of large Microsoft 365 tenants. + +**`scan_engine.py`** + +- **Email body HTML stripped at collection time** — Graph API returns the full `body` field (raw HTML, up to ~1 MB per message) for every email fetched. Previously, all message dicts — including the raw HTML — were accumulated in `work_items` before any scanning began. For 1 000 users × 2 000 emails this could mean >100 GB in `work_items` alone. The body is now converted to plain text immediately on collection (`_precomputed_body`), and the raw `body` and `bodyPreview` keys are deleted from the dict before it is queued. The processing loop reads `_precomputed_body` via `pop()` and `del`s it after use. +- **`work_items` converted to `deque` before processing** — items are now released from memory one by one via `popleft()` as they are processed, rather than keeping the entire list alive for the duration of the scan. `gc.collect()` is called immediately after conversion and after each checkpoint save. +- **`content` bytes freed as early as possible in the file processing branch** — raw download bytes are now `del`'d immediately after `content.decode()` (before the expensive NER/PII pass), and also in the no-hits `else` branch where they were previously kept alive until the next loop iteration. +- **`body_text` freed after use in the email branch** — `del body_text` added after `_broadcast_card` so large plain-text bodies do not linger until the next iteration. 
+- **Memory guard before file downloads** — uses `psutil.virtual_memory().available` to skip a file download and log a warning if fewer than 300 MB of RAM are available, preventing a single large file from pushing an already-pressured machine into OOM. + +**`document_scanner.py`** + +- **PDF OCR page images freed page by page** — `convert_from_path()` renders all pages at 300 DPI before scanning begins (~26 MB per A4 page; a 100-page PDF ≈ 2.6 GB). Each rendered `PIL.Image` is now nulled out (`images[page_num-1] = None`) immediately after OCR, so only one page image is live at a time instead of the entire document. + +### Changed — Sources panel is now resizable and collapsible + +The **KILDER** sidebar panel now behaves consistently with the other sidebar sections. + +- **Collapsible** — the `▾` / `▸` toggle was already wired up; collapse state is already persisted in `localStorage`. No change needed here. +- **Resizable** — a drag handle (`sources-resize-handle`) added at the bottom of the panel body. Dragging up shrinks the panel (scroll appears); dragging down is capped at the panel's natural content height — you cannot expand it beyond what is needed to show all sources. Height preference persisted in `localStorage` under `gdpr_sources_h`. +- **Auto-fit on render** — `_fitSourcesPanel()` is called at the end of every `renderSourcesPanel()` invocation. On first load and whenever sources are added or removed (e.g. connecting Google), the panel height snaps to exactly fit all visible sources. A previously saved smaller height is honoured only if it is still smaller than the new content height; dragging back to full height clears the saved preference. +- The old `max-height: calc(5 * 26px)` fixed cap is removed. + +**Files changed:** `templates/index.html`, `static/style.css`, `static/js/log.js` (`_fitSourcesPanel`, `_initSourcesResize`), `static/js/sources.js`, `static/js/results.js`. 
+ +--- + +## [1.6.13] — 2026-04-10 + +### Added — developer tooling + +- **`run_tests.sh`** — shell script to activate the venv and run the full test suite. Accepts any `pytest` arguments: `./run_tests.sh`, `./run_tests.sh -q`, `./run_tests.sh tests/test_app_config.py`. +- **Directory-scoped `CLAUDE.md` rules** — `routes/CLAUDE.md`, `static/js/CLAUDE.md`, `templates/CLAUDE.md`, `lang/CLAUDE.md` replace the previous single-file context document. Each file is loaded automatically by Claude Code only when working in the relevant directory. + +### Fixed — documentation + +- **`README.md` project files table** — removed four phantom entries (`Dockerfile`, `docker-compose.yml`, `.dockerignore`, `scanner_audit.jsonl`); corrected `static/app.js` description to "archived monolith — no longer loaded"; fixed manual paths (`MANUAL-EN.md` → `docs/manuals/MANUAL-EN.md`); added missing files: `scan_engine.py`, `sse.py`, `checkpoint.py`, `app_config.py`, `cpr_detector.py`, `google_connector.py`, `static/style.css`, `static/js/*.js`, `routes/google_auth.py`, `routes/google_scan.py`, `run_tests.sh`, `docs/setup/` guides. +- **`docs/manuals/MANUAL-EN.md`**, **`docs/manuals/MANUAL-DA.md`** — version header updated from 1.6.11 → 1.6.13; footer updated from v1.6.8 → v1.6.13. + +### Changed — blueprint migration batch 3, 4, 5 (auth, database, export — migration complete) + +All remaining direct `@app.route` registrations removed from `gdpr_scanner.py`. Flask now routes every API endpoint exclusively through its blueprint. Only `GET /` and `GET /api/scan/stream` (SSE) remain in `gdpr_scanner.py`. 
+ +**`routes/auth.py`** — rewritten with direct imports (batch 3, 6 routes): +- `MSAL_OK`, `M365Connector`, `M365Error` imported from `m365_connector` +- `_load_config`, `_save_config` imported from `app_config` +- Dead module-level globals `_pending_flow` and `_auth_poll_result` removed from `gdpr_scanner.py` +- Routes removed: `/api/auth/status`, `/api/auth/start`, `/api/auth/poll`, `/api/auth/userinfo`, `/api/auth/signout`, `/api/auth/config` + +**`routes/database.py`** — rewritten with direct imports (batch 4, 15 routes): +- `_get_db`, `DB_OK` from `gdpr_db`; `_set_admin_pin`, `_verify_admin_pin`, `_admin_pin_is_set` from `app_config`; `_clear_checkpoint`, `_DELTA_PATH` from `checkpoint`; `_extract_exif`, `_html_esc`, `_placeholder_svg` from `cpr_detector` +- `SCANNER_OK` determined by local `import document_scanner` try/except +- `db_export` improved: uses `NamedTemporaryFile` instead of `mktemp` (safer for frozen apps) +- Email preview HTML: full CSS ruleset (`*, *::before, *::after`, `img`, `table`, scrollbar) from gdpr_scanner.py version restored +- Routes removed: `/api/db/stats`, `/api/db/trend`, `/api/db/scans`, `/api/db/subject`, `/api/db/overdue`, `/api/db/disposition` (×2), `/api/db/deletion_log`, `/api/db/reset`, `/api/admin/pin` (×2), `/api/db/export`, `/api/db/import`, `/api/preview/`, `/api/thumb` + +**`routes/export.py`** — rewritten with direct imports (batch 5, 3 routes): +- `_get_db`, `DB_OK` from `gdpr_db`; `_GUID_RE`, `_resolve_display_name` from `app_config`; `M365PermissionError` from `m365_connector` +- `app.logger` replaced with `logging.getLogger(__name__)` +- Dead `delete_item()` helper removed from `gdpr_scanner.py` (was unreachable; blueprint has its own copy) +- Routes removed: `/api/export_excel`, `/api/export_article30`, `/api/delete_bulk` + +**`tests/test_routes.py`** — `db_patch` fixture updated: now patches `routes.database._get_db` / `routes.database.DB_OK` and `routes.export._get_db` / `routes.export.DB_OK` (was patching 
`gdpr_scanner._get_db`/`gdpr_scanner.DB_OK` which no longer have any effect). Two `test_without_db_returns_503` tests updated to monkeypatch `routes.database.DB_OK` instead of `gdpr_scanner.DB_OK`. + +--- + +## [1.6.12] — 2026-04-10 + +### Fixed — profile editor save drops users from non-active role groups + +In `_pmgmtSaveFullEdit` (profile management editor), the save function applied the active role filter (`_pmgmtRoleActive`) to the list of checked checkboxes before saving. Since `_pmgmtFilterAccounts` hides rows via `display:none` but does not uncheck them, users from other role groups that remained checked (but hidden) were silently discarded on save. The role filter at save time is removed — all checked checkboxes are now captured regardless of which role tab is visible. + +--- + +## [1.6.11] — 2026-04-10 + +### Changed — blueprint migration batch 1 (scan + app_routes) + +15 direct `@app.route` registrations removed from `gdpr_scanner.py`. Flask now routes all of these exclusively through their blueprint counterparts, which previously existed as dead code shadowed by the direct routes. 
+ +**`routes/scan.py`** — rewritten with direct imports (was entirely non-functional as dead code due to bare-name `NameError`s behind the shadow): +- Added `GET /api/scan/status` (new — was only in gdpr_scanner.py) +- Added `GET /api/src_toggles`, `POST /api/src_toggles` (new — was only in gdpr_scanner.py) +- `scan_checkpoint_info` — added missing `check_only` handling present in the gdpr_scanner.py version +- All state references converted from bare names to `state._scan_lock` / `state._scan_abort`; `run_scan` imported lazily from `scan_engine` inside `_run` to avoid circular imports +- `_save_settings`, `_load_settings`, `_load_src_toggles`, `_save_src_toggles` imported from `app_config` +- `_checkpoint_key`, `_load_checkpoint`, `_clear_checkpoint`, `_load_delta_tokens`, `_DELTA_PATH` imported from `checkpoint` + +**`routes/app_routes.py`** — cleaned up: +- `APP_VERSION` now computed locally from `VERSION` file (was a bare-name reference to gdpr_scanner.py global) +- `_LANG_DIR` computed at module level; fixed `sys` / `_sys` alias mismatch in `get_langs` (bug in blueprint that never manifested while shadowed) +- `_set_lang_override`, `_load_lang_forced` imported directly from `app_config` +- `get_langs` — added missing `langs.sort()` present in the gdpr_scanner.py version + +**`tests/test_routes.py`** — `mock_connector` fixture simplified: no longer needs to patch `gdpr_scanner._connector` since the direct `scan/start` route is gone; `state.connector` alone is sufficient. `run_scan` stub in `test_authenticated_returns_started` updated to target `scan_engine` directly. 
+ +**Routes removed from `gdpr_scanner.py`:** `/api/about`, `/api/langs`, `/api/set_lang`, `/api/lang`, `/api/scan/status`, `/api/scan/start`, `/api/scan/stop`, `/api/scan/checkpoint`, `/api/scan/clear_checkpoint`, `/api/settings/save`, `/api/settings/load`, `/api/src_toggles`, `/api/delta/status`, `/api/delta/clear` + +**Still in `gdpr_scanner.py`:** `GET /` (root), `GET /api/scan/stream` (SSE — cannot be in a blueprint), and the `auth`, `users`, `sources`, `database`, `export` route groups (31 routes — next batches). + +--- + +## [1.6.10] — 2026-04-10 + +### Fixed — Google Drive `exportSizeLimitExceeded` warning + +Native Google Workspace files too large for Drive's export API (Google's server-side limit, distinct from the 20 MB local cap) now produce a clean skip message instead of a stray `WARNING googleapiclient.http — Encountered 403 Forbidden with reason "exportSizeLimitExceeded"` in the log. A `logging.Filter` subclass is installed on the `googleapiclient.http` logger at import time to suppress the duplicate external warning; the `except HttpError` block in `_drive_iter` detects the reason and logs `[gdrive] skip '' — file too large for Google export API (exportSizeLimitExceeded)` with the file ID. + +### Fixed — peak memory during large file/SMB scans (OOM risk reduction) + +Three targeted buffer-lifetime fixes reduce peak RSS during large scans: + +- **`cpr_detector.py`** — `del content` after writing the PDF bytes to a temp file in `_scan_bytes_timeout`. The 20 MB buffer was previously held in the main process for the entire duration of `p.join(timeout)` (up to 60 s), overlapping with the spawned subprocess's ~150–300 MB heap. It is now freed before the subprocess starts. +- **`scan_engine.py`** — `del content` after the thumbnail block in `run_file_scan`. The raw file buffer was kept alive through card dict construction and the start of the next loop iteration; it is now freed as soon as the thumbnail (or placeholder SVG) has been generated. 
+- **`file_scanner.py`** — `PREFETCH_WINDOW` reduced from 2 to 1. Halves the maximum number of concurrently-held SMB read buffers (from 2 × 20 MB to 1 × 20 MB). + +--- + +## [1.6.9] — 2026-04-10 + +### Changed — frontend migrated to ES modules + +**Phase 2 complete:** All 10 split JS files converted from ` + + +
+ + + + + +
+ + +
+
+
Connect to Microsoft 365
+
Enter your Azure app credentials to sign in.
+ +
+
+ + +
+
+ + +
+
+ + +
+
+ Client Secret: app accesses all users' data directly (Application permissions, no sign-in required).
+ Without a client secret, you sign in as yourself and can only scan your own data unless you're a Global Admin. +
+
+ +
+
+
+
+ + + + +
+
+ + +
+
+

Connect to Microsoft 365

+
+ +
+
and enter this code
+
+
⏳ Waiting for sign-in…
+ +
+
+ + +
+
+

+
+
+
+ + +
+ +
+
+ + +
+
+

Bulk Delete

+
Permanently removes items from Microsoft 365. Emails go to Deleted Items; files go to the recycle bin.
+ +
Filter what to delete
+
+ + +
+ +
+ + +
+
+ + +
+
+ + +
+ +
+ +
+ + +
+
+
+
+ + +
+
+
+

⚙ Settings

+ +
+
+ + + + + +
+
+ + +
+
+
Appearance
+
+ + +
+
+ + +
+
+
+
About
+
🔍 GDPRScannerv{{ app_version }}
+
Python
+
MSAL
+
Requests
+
openpyxl
+
+
+ + +
+
+
Admin PIN
+
Required for destructive actions (e.g. Reset DB). Leave blank to disable.
+
+ +
+ + +
+
+ + +
+
+
+ +
+
+
+
Viewer PIN
+
A numeric PIN (4–8 digits) that lets anyone open /view in a browser for read-only access to results without a token URL.
+
+ +
+ + +
+
+
+ + +
+
+
+ + +
+ + +
+
🕐 Scheduled scans
+
Run scans automatically at a set time. Requires an active M365 connection (application mode recommended).
+ +
+ +
+ + + + + +
+
Recent runs
+
+
+ +
+ + +
+
+
Email report (SMTP)
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+ + +
+
+
+ + + +
+
+
+ + +
+
+
Database
+
+
+
+
Actions
+
+
+ + +
+ +
+
+
+ +
+ +
+
+ + +
+
+
+

Enter admin PIN

+ +
+
+
+ +
+
+ +
+
+ + + +
+
+

🔍 Data subject lookup

+
Find all flagged items containing a given CPR number. The CPR is hashed before querying and is never stored in plaintext.
+
+ + +
+
+
+ +
+
+ + +
+
+

✉ Email report

+
Configure SMTP settings to send the scan report by email.
+ +
+ +
+ + +
+
+ + +
+ + +
+ + +
+
+ + +
+ + +
+ + +
+ + +
+
+
+
+ + STARTTLS + (port 587) +
+
+ + SSL + (port 465) +
+
+
+ + +
+
+ + +
Comma or semicolon separated
+
+
+ + +
+
+
+ + +
+
+

Share results

+
Read-only links let a DPO or reviewer browse results and tag dispositions without access to scan controls or credentials.
+ + +
+
New link
+
+
+
Label (optional)
+ +
+
+
Expires in
+ +
+ +
+ +
+ + +
Active links
+
+ + +
+ Viewer PIN: + +
+ +
+ +
+
+
+ +
+
+

🔍 GDPRScanner

+
v{{ app_version }}
+
Python
+
MSAL
+
Requests
+
openpyxl
+ +
+
+ + +
+
+
+

⚙️ Source management

+ +
+ + +
+ + + +
+ +
+ + +
+ + +
+
Connection
+
+ ☁️ +
+
Not connected
+
+
+ +
+
+ + +
+
Azure credentials
+
+
+ + +
+
+ + +
+
+ + +
+
+
+ + +
+
+
+ + +
+
Sources to scan
+
+
+ 📧 +
Exchange / Outlook
+ +
+
+ 💾 +
OneDrive
+ +
+
+ 🌐 +
SharePoint
+ +
+
+ 💬 +
Teams
+ +
+
+
+
+ + + +
+ + +
+
Connection
+
+ 🔵 +
+
Not connected
+
+
+ +
+
+ + +
+
Auth mode
+
+ + +
+
+ + +
+
Service account credentials
+
+
+ +
+ + +
+
Download from Google Cloud Console → IAM & Admin → Service Accounts → Keys → Add Key → JSON
+
+
+ + +
+
Used for domain-wide delegation — must be a Workspace super-admin.
+
+
+ + +
+
+
+ + + + + + + + +
+
+ Setup required in Google Workspace:
+ 1. Create a Google Cloud project and enable Gmail API + Drive API + Admin SDK.
+ 2. Create a service account, download the JSON key, and enable domain-wide delegation.
+ 3. In Workspace Admin → Security → API Controls → Domain-wide delegation, add the service account client ID with scopes:
+ https://www.googleapis.com/auth/gmail.readonly, https://www.googleapis.com/auth/drive.readonly, https://www.googleapis.com/auth/admin.directory.user.readonly +
+
+ +
+ + +
+
+
File sources
+
+
No file sources yet.
+
+
+ + +
+
Add source
+
+
+ + +
+
+ + +
+ +
+ +
+ +
+
+
+
+ +
+ + +
+
+ + + +
+
+

📁 File Sources

+
+
No file sources yet. Add a local folder or network share below.
+
+ + +
+
Add source
+
+ + +
+
+ + +
+ +
+ +
+
+ +
+ +
+
+ + +
+
+
+
+ Profiles +
+
+
No saved profiles yet.
+
+
+ +
+
+
+
+ Edit profile + 
+
+ Click a profile to edit it
+
+ + +
+
+
+
+ +
+
+

📥 Import Database

+

Select a previously exported .zip file. Merge adds the imported dispositions and deletion log to the current database. Replace wipes the current database and fully restores it from the file.

+
+ + +
+
+ + +
+ +
+
+ + +
+
+
+ + + + + + + + + + + + + + diff --git a/templates/viewer_denied.html b/templates/viewer_denied.html new file mode 100644 index 0000000..57eccad --- /dev/null +++ b/templates/viewer_denied.html @@ -0,0 +1,28 @@ + + + + + + GDPRScanner — Access denied + + + + +
+

Access denied

+

This link is invalid or has expired.
Ask the administrator for a new link.

+
+ + diff --git a/templates/viewer_pin.html b/templates/viewer_pin.html new file mode 100644 index 0000000..2b8f416 --- /dev/null +++ b/templates/viewer_pin.html @@ -0,0 +1,82 @@ + + + + + + GDPRScanner — Enter PIN + + + + +
+

GDPRScanner

+

Enter the viewer PIN to access results.

+ + +
+
+ + + diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..3e48f03 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,113 @@ +""" +conftest.py — shared fixtures for GDPRScanner test suite. +""" +import sys +import tempfile +from pathlib import Path + +import pytest + +# Ensure the project root is on sys.path so all modules are importable +ROOT = Path(__file__).parent.parent +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) + + +# ── File fixtures ───────────────────────────────────────────────────────────── + +@pytest.fixture() +def tmp_dir(tmp_path): + return tmp_path + + +@pytest.fixture() +def docx_with_cpr(tmp_path): + """Word document containing 3 CPR numbers in different positions.""" + from docx import Document + doc = Document() + doc.add_paragraph("Elev 1: CPR 290472-1234 er registreret i systemet.") + doc.add_paragraph("Elev 2: personnummer 010185-4321.") + tbl = doc.add_table(rows=2, cols=2) + tbl.cell(0, 0).text = "Navn" + tbl.cell(0, 1).text = "CPR" + tbl.cell(1, 0).text = "Anne Hansen" + tbl.cell(1, 1).text = "CPR: 150364-5678" + p = tmp_path / "sample_with_cpr.docx" + doc.save(p) + return p + + +@pytest.fixture() +def docx_no_cpr(tmp_path): + """Word document with no CPR numbers.""" + from docx import Document + doc = Document() + doc.add_paragraph("Ingen personoplysninger her.") + doc.add_paragraph("Konto: 1234-5678 Telefon: 33 12 34 56") + p = tmp_path / "sample_no_cpr.docx" + doc.save(p) + return p + + +@pytest.fixture() +def xlsx_with_cpr(tmp_path): + """Excel workbook containing 1 CPR in a cell.""" + from openpyxl import Workbook + wb = Workbook() + ws = wb.active + ws["A1"] = "Navn" + ws["B1"] = "CPR" + ws["A2"] = "Test Person" + ws["B2"] = "CPR: 290472-1234" + p = tmp_path / "sample_with_cpr.xlsx" + wb.save(p) + return p + + +@pytest.fixture() +def xlsx_no_cpr(tmp_path): + """Excel workbook with account 
numbers that look CPR-like.""" + from openpyxl import Workbook + wb = Workbook() + ws = wb.active + ws["A1"] = "Kontonummer" + ws["B1"] = "Beløb" + ws["A2"] = "12345678" # 8-digit — too short + ws["A3"] = "29047212345" # 11-digit — too long + ws["A4"] = "Reg: 2904" + p = tmp_path / "sample_no_cpr.xlsx" + wb.save(p) + return p + + +@pytest.fixture() +def txt_with_art9(tmp_path): + """Plain text with CPR adjacent to Article 9 health keywords.""" + content = ( + "Eleven CPR 290472-1234 har diagnosen diabetes og modtager behandling.\n" + "Kontakt læge vedr. sygemelding." + ) + p = tmp_path / "sample_art9.txt" + p.write_text(content, encoding="utf-8") + return p + + +@pytest.fixture() +def binary_garbage(tmp_path): + """Binary file that must not crash the scanner.""" + p = tmp_path / "sample_binary.bin" + p.write_bytes(bytes(range(256)) * 100) + return p + + +@pytest.fixture() +def tmp_db(tmp_path): + """Fresh in-memory-path SQLite DB for each test.""" + from gdpr_db import ScanDB + db_path = tmp_path / "test.db" + db = ScanDB(str(db_path)) + yield db + try: + db_path.unlink() + except Exception: + pass diff --git a/tests/test_app_config.py b/tests/test_app_config.py new file mode 100644 index 0000000..8aa96ce --- /dev/null +++ b/tests/test_app_config.py @@ -0,0 +1,254 @@ +""" +test_app_config.py — Tests for app_config.py. + +Covers: + - LANG loading and key access + - Article 9 keyword detection (_check_special_category) + - Config load/save round-trip + - Admin PIN hash/verify + - Profile CRUD (_profile_save, _profile_get, _profile_delete) + - SMTP password encryption/decryption round-trip +""" +import sys +import json +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).parent.parent)) +import app_config + + +# ───────────────────────────────────────────────────────────────────────────── +# 1. 
i18n +# ───────────────────────────────────────────────────────────────────────────── + +class TestLang: + + def test_lang_dict_loaded(self): + assert isinstance(app_config.LANG, dict) + assert len(app_config.LANG) > 0 + + def test_lang_has_lang_code(self): + assert "_lang_code" in app_config.LANG + + def test_load_lang_returns_dict(self): + lang = app_config._load_lang() + assert isinstance(lang, dict) + + def test_load_lang_forced_en(self): + lang = app_config._load_lang_forced("en") + assert isinstance(lang, dict) + assert len(lang) > 0 + + def test_load_lang_forced_da(self): + lang = app_config._load_lang_forced("da") + assert isinstance(lang, dict) + assert len(lang) > 0 + + def test_load_lang_forced_de(self): + lang = app_config._load_lang_forced("de") + assert isinstance(lang, dict) + assert len(lang) > 0 + + def test_missing_lang_falls_back(self): + # Unknown lang code should fall back without raising + lang = app_config._load_lang_forced("xx") + assert isinstance(lang, dict) + + +# ───────────────────────────────────────────────────────────────────────────── +# 2. 
Article 9 keyword detection +# ───────────────────────────────────────────────────────────────────────────── + +class TestCheckSpecialCategory: + + def _cats(self, text): + cprs = [{"raw": "290472-1234"}] + return app_config._check_special_category(text, cprs) + + def test_health_keyword_detected(self): + cats = self._cats("CPR: 290472-1234 har diagnosen diabetes og behandling") + assert "health" in cats + + def test_trade_union_keyword_detected(self): + cats = self._cats("CPR: 290472-1234 er fagforeningsmedlem tillidsrepræsentant") + assert "trade_union" in cats + + def test_religion_keyword_detected(self): + cats = self._cats("CPR: 290472-1234 kirke konfirmation") + assert "religion" in cats + + def test_no_keyword_returns_empty(self): + cats = self._cats("CPR: 290472-1234 bor i Aarhus") + assert cats == [] + + def test_empty_text_returns_empty(self): + cats = app_config._check_special_category("", []) + assert cats == [] + + def test_keyword_without_cpr_still_detected(self): + # No CPR — keyword still triggers if no CPR list given + cats = app_config._check_special_category("diagnose sygemelding behandling", []) + assert "health" in cats + + def test_returns_sorted_list(self): + cats = self._cats("CPR 290472-1234 diabetes fagforening") + assert cats == sorted(cats) + + def test_compiled_keywords_populated(self): + assert len(app_config._compiled_keywords) > 0 + + def test_keyword_flat_has_entries(self): + assert len(app_config._keyword_flat) > 0 + + +# ───────────────────────────────────────────────────────────────────────────── +# 3. 
Config load / save +# ───────────────────────────────────────────────────────────────────────────── + +class TestConfig: + + def test_load_config_returns_dict(self, tmp_path, monkeypatch): + monkeypatch.setattr(app_config, "_CONFIG_FILE", tmp_path / "config.json") + cfg = app_config._load_config() + assert isinstance(cfg, dict) + + def test_save_and_load_round_trip(self, tmp_path, monkeypatch): + monkeypatch.setattr(app_config, "_CONFIG_FILE", tmp_path / "config.json") + app_config._save_config({"client_id": "test-id", "tenant_id": "test-tid"}) + cfg = app_config._load_config() + assert cfg["client_id"] == "test-id" + assert cfg["tenant_id"] == "test-tid" + + def test_save_config_creates_file(self, tmp_path, monkeypatch): + cfg_path = tmp_path / "config.json" + monkeypatch.setattr(app_config, "_CONFIG_FILE", cfg_path) + app_config._save_config({"x": 1}) + assert cfg_path.exists() + + def test_load_missing_file_returns_empty(self, tmp_path, monkeypatch): + monkeypatch.setattr(app_config, "_CONFIG_FILE", tmp_path / "nonexistent.json") + cfg = app_config._load_config() + assert cfg == {} + + +# ───────────────────────────────────────────────────────────────────────────── +# 4. 
Admin PIN +# ───────────────────────────────────────────────────────────────────────────── + +class TestAdminPin: + + def test_pin_not_set_initially(self, tmp_path, monkeypatch): + monkeypatch.setattr(app_config, "_CONFIG_FILE", tmp_path / "config.json") + # Fresh config — no PIN + app_config._save_config({}) + assert app_config._admin_pin_is_set() is False + + def test_set_and_verify_pin(self, tmp_path, monkeypatch): + monkeypatch.setattr(app_config, "_CONFIG_FILE", tmp_path / "config.json") + app_config._save_config({}) + app_config._set_admin_pin("1234") + assert app_config._verify_admin_pin("1234") is True + + def test_wrong_pin_fails(self, tmp_path, monkeypatch): + monkeypatch.setattr(app_config, "_CONFIG_FILE", tmp_path / "config.json") + app_config._save_config({}) + app_config._set_admin_pin("1234") + assert app_config._verify_admin_pin("9999") is False + + def test_pin_is_set_after_setting(self, tmp_path, monkeypatch): + monkeypatch.setattr(app_config, "_CONFIG_FILE", tmp_path / "config.json") + app_config._save_config({}) + app_config._set_admin_pin("5678") + assert app_config._admin_pin_is_set() is True + + +# ───────────────────────────────────────────────────────────────────────────── +# 5. 
Profiles +# ───────────────────────────────────────────────────────────────────────────── + +class TestProfiles: + + @pytest.fixture(autouse=True) + def _isolate(self, tmp_path, monkeypatch): + monkeypatch.setattr(app_config, "_SETTINGS_PATH", tmp_path / "settings.json") + + def test_profiles_load_returns_list(self): + profiles = app_config._profiles_load() + assert isinstance(profiles, list) + + def test_save_and_get_profile(self): + profile = { + "id": "test-uuid-1", + "name": "Test Profile", + "sources": ["email"], + "user_ids": "all", + "options": {}, + } + app_config._profile_save(profile) + loaded = app_config._profile_get("Test Profile") + assert loaded is not None + assert loaded["name"] == "Test Profile" + + def test_profile_get_by_id(self): + profile = {"id": "uid-42", "name": "By ID", "sources": [], "options": {}} + app_config._profile_save(profile) + loaded = app_config._profile_get("uid-42") + assert loaded is not None + + def test_profile_delete(self): + profile = {"id": "del-1", "name": "To Delete", "sources": [], "options": {}} + app_config._profile_save(profile) + deleted = app_config._profile_delete("To Delete") + assert deleted is True + assert app_config._profile_get("To Delete") is None + + def test_delete_nonexistent_returns_false(self): + assert app_config._profile_delete("Does Not Exist") is False + + def test_profiles_load_after_save(self): + app_config._profile_save({"id": "p1", "name": "P1", "sources": [], "options": {}}) + app_config._profile_save({"id": "p2", "name": "P2", "sources": [], "options": {}}) + profiles = app_config._profiles_load() + names = [p["name"] for p in profiles] + assert "P1" in names + assert "P2" in names + + +# ───────────────────────────────────────────────────────────────────────────── +# 6. 
SMTP password encryption +# ───────────────────────────────────────────────────────────────────────────── + +class TestFernet: + + @pytest.fixture(autouse=True) + def _isolate(self, tmp_path, monkeypatch): + monkeypatch.setattr(app_config, "_MACHINE_ID_PATH", tmp_path / "machine_id") + + def test_encrypt_decrypt_round_trip(self): + fernet = app_config._get_fernet() + if fernet is None: + pytest.skip("cryptography not installed") + plaintext = "my-secret-smtp-password" + encrypted = app_config._encrypt_password(plaintext) + decrypted = app_config._decrypt_password(encrypted) + assert decrypted == plaintext + + def test_encrypt_returns_string(self): + fernet = app_config._get_fernet() + if fernet is None: + pytest.skip("cryptography not installed") + result = app_config._encrypt_password("test") + assert isinstance(result, str) + + def test_encrypted_differs_from_plaintext(self): + fernet = app_config._get_fernet() + if fernet is None: + pytest.skip("cryptography not installed") + enc = app_config._encrypt_password("password123") + assert enc != "password123" + + def test_decrypt_empty_returns_empty(self): + result = app_config._decrypt_password("") + assert result == "" diff --git a/tests/test_checkpoint.py b/tests/test_checkpoint.py new file mode 100644 index 0000000..abb550d --- /dev/null +++ b/tests/test_checkpoint.py @@ -0,0 +1,147 @@ +""" +test_checkpoint.py — Tests for checkpoint.py. 
+ +Covers: + - _checkpoint_key: stable hashing of scan options + - _save_checkpoint / _load_checkpoint / _clear_checkpoint + - _load_delta_tokens / _save_delta_tokens +""" +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).parent.parent)) +import checkpoint + + +# ───────────────────────────────────────────────────────────────────────────── +# Fixtures +# ───────────────────────────────────────────────────────────────────────────── + +@pytest.fixture(autouse=True) +def _isolate(tmp_path, monkeypatch): + """Redirect all disk writes to a temp dir for each test.""" + monkeypatch.setattr(checkpoint, "_CHECKPOINT_PATH", tmp_path / "checkpoint.json") + monkeypatch.setattr(checkpoint, "_DELTA_PATH", tmp_path / "delta.json") + + +_OPTS = { + "sources": ["email", "onedrive"], + "user_ids": [{"id": "user-1"}, {"id": "user-2"}], + "options": {"older_than_days": 365}, +} + + +# ───────────────────────────────────────────────────────────────────────────── +# 1. 
_checkpoint_key +# ───────────────────────────────────────────────────────────────────────────── + +class TestCheckpointKey: + + def test_returns_string(self): + key = checkpoint._checkpoint_key(_OPTS) + assert isinstance(key, str) + + def test_key_is_hex(self): + key = checkpoint._checkpoint_key(_OPTS) + int(key, 16) # raises ValueError if not hex + + def test_same_options_same_key(self): + assert checkpoint._checkpoint_key(_OPTS) == checkpoint._checkpoint_key(_OPTS) + + def test_different_sources_different_key(self): + opts2 = {**_OPTS, "sources": ["sharepoint"]} + assert checkpoint._checkpoint_key(_OPTS) != checkpoint._checkpoint_key(opts2) + + def test_different_users_different_key(self): + opts2 = {**_OPTS, "user_ids": [{"id": "user-99"}]} + assert checkpoint._checkpoint_key(_OPTS) != checkpoint._checkpoint_key(opts2) + + def test_source_order_irrelevant(self): + opts_a = {**_OPTS, "sources": ["email", "onedrive"]} + opts_b = {**_OPTS, "sources": ["onedrive", "email"]} + assert checkpoint._checkpoint_key(opts_a) == checkpoint._checkpoint_key(opts_b) + + def test_empty_options(self): + key = checkpoint._checkpoint_key({}) + assert isinstance(key, str) and len(key) > 0 + + +# ───────────────────────────────────────────────────────────────────────────── +# 2. 
Save / load / clear +# ───────────────────────────────────────────────────────────────────────────── + +class TestSaveLoadCheckpoint: + + def test_load_returns_none_when_no_file(self): + key = checkpoint._checkpoint_key(_OPTS) + assert checkpoint._load_checkpoint(key) is None + + def test_save_then_load(self): + key = checkpoint._checkpoint_key(_OPTS) + checkpoint._save_checkpoint( + key, + scanned_ids={"id1", "id2", "id3"}, + flagged=[{"id": "c1", "name": "file.docx"}], + meta={"started_at": 1700000000}, + ) + loaded = checkpoint._load_checkpoint(key) + assert loaded is not None + + def test_scanned_ids_preserved(self): + key = checkpoint._checkpoint_key(_OPTS) + checkpoint._save_checkpoint(key, {"id1", "id2"}, [], {}) + loaded = checkpoint._load_checkpoint(key) + assert set(loaded["scanned_ids"]) == {"id1", "id2"} + + def test_flagged_items_preserved(self): + key = checkpoint._checkpoint_key(_OPTS) + cards = [{"id": "c1"}, {"id": "c2"}] + checkpoint._save_checkpoint(key, set(), cards, {}) + loaded = checkpoint._load_checkpoint(key) + assert len(loaded["flagged"]) == 2 + + def test_wrong_key_returns_none(self): + key = checkpoint._checkpoint_key(_OPTS) + checkpoint._save_checkpoint(key, {"id1"}, [], {}) + other_opts = {**_OPTS, "sources": ["sharepoint"]} + other_key = checkpoint._checkpoint_key(other_opts) + assert checkpoint._load_checkpoint(other_key) is None + + def test_clear_removes_file(self, tmp_path): + key = checkpoint._checkpoint_key(_OPTS) + checkpoint._save_checkpoint(key, {"id1"}, [], {}) + checkpoint._clear_checkpoint() + assert checkpoint._load_checkpoint(key) is None + + def test_clear_on_missing_file_does_not_raise(self): + checkpoint._clear_checkpoint() # no file exists — must not raise + + +# ───────────────────────────────────────────────────────────────────────────── +# 3. 
Delta tokens +# ───────────────────────────────────────────────────────────────────────────── + +class TestDeltaTokens: + + def test_load_returns_empty_when_no_file(self): + assert checkpoint._load_delta_tokens() == {} + + def test_save_then_load(self): + tokens = { + "email:user1": "https://graph.microsoft.com/v1.0/me/mailFolders/delta?$deltaToken=abc", + "onedrive:user1": "https://graph.microsoft.com/v1.0/me/drive/delta?token=xyz", + } + checkpoint._save_delta_tokens(tokens) + loaded = checkpoint._load_delta_tokens() + assert loaded == tokens + + def test_overwrite_preserves_new_value(self): + checkpoint._save_delta_tokens({"key": "old_url"}) + checkpoint._save_delta_tokens({"key": "new_url"}) + assert checkpoint._load_delta_tokens()["key"] == "new_url" + + def test_save_empty_dict(self): + checkpoint._save_delta_tokens({}) + assert checkpoint._load_delta_tokens() == {} diff --git a/tests/test_db.py b/tests/test_db.py new file mode 100644 index 0000000..7b7fbe5 --- /dev/null +++ b/tests/test_db.py @@ -0,0 +1,267 @@ +""" +test_db.py — Tests for gdpr_db.py (ScanDB). 
+ +Covers: + - begin_scan / finish_scan round-trip + - save_item and retrieval + - CPR index stores hash, never plaintext + - lookup_data_subject returns matching items + - set_disposition / get_disposition + - Deletion log + - Export / import cycle (merge and replace modes) +""" +import sys +import hashlib +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).parent.parent)) +from gdpr_db import ScanDB + + +# ───────────────────────────────────────────────────────────────────────────── +# Helpers +# ───────────────────────────────────────────────────────────────────────────── + +def _make_card(item_id="abc123", cpr_count=1, source_type="email", role="staff"): + return { + "id": item_id, + "name": f"{item_id}.docx", + "source": "email", + "source_type": source_type, + "cpr_count": cpr_count, + "url": "https://example.com/item", + "size_kb": 12.5, + "modified": "2024-03-01", + "thumb_b64": "", + "thumb_mime": "image/svg+xml", + "risk": None, + "account_id": "user-1", + "account_name": "Test User", + "user_role": role, + "drive_id": "", + "attachments": [], + "folder": "", + "transfer_risk": "", + "special_category": [], + "face_count": 0, + "exif": {}, + } + + +# ───────────────────────────────────────────────────────────────────────────── +# 1. 
Scan lifecycle +# ───────────────────────────────────────────────────────────────────────────── + +class TestScanLifecycle: + + def test_begin_scan_returns_int(self, tmp_db): + scan_id = tmp_db.begin_scan({"sources": ["email"], "user_ids": []}) + assert isinstance(scan_id, int) + assert scan_id > 0 + + def test_begin_scan_increments(self, tmp_db): + id1 = tmp_db.begin_scan({"sources": ["email"], "user_ids": []}) + id2 = tmp_db.begin_scan({"sources": ["onedrive"], "user_ids": []}) + assert id2 > id1 + + def test_finish_scan_does_not_raise(self, tmp_db): + scan_id = tmp_db.begin_scan({"sources": ["email"], "user_ids": []}) + tmp_db.finish_scan(scan_id, 42) # must not raise + + def test_multiple_scans_independent(self, tmp_db): + id1 = tmp_db.begin_scan({"sources": ["email"], "user_ids": []}) + tmp_db.save_item(id1, _make_card("item-a"), ["290472-1234"]) + id2 = tmp_db.begin_scan({"sources": ["onedrive"], "user_ids": []}) + tmp_db.save_item(id2, _make_card("item-b"), ["010185-4321"]) + tmp_db.finish_scan(id1, 1) + tmp_db.finish_scan(id2, 1) + + +# ───────────────────────────────────────────────────────────────────────────── +# 2. 
save_item +# ───────────────────────────────────────────────────────────────────────────── + +class TestSaveItem: + + def test_save_item_does_not_raise(self, tmp_db): + scan_id = tmp_db.begin_scan({"sources": ["email"], "user_ids": []}) + tmp_db.save_item(scan_id, _make_card(), ["290472-1234"]) + + def test_save_item_without_cprs(self, tmp_db): + scan_id = tmp_db.begin_scan({"sources": ["email"], "user_ids": []}) + tmp_db.save_item(scan_id, _make_card(cpr_count=0), []) + + def test_save_multiple_items(self, tmp_db): + scan_id = tmp_db.begin_scan({"sources": ["email"], "user_ids": []}) + for i in range(5): + tmp_db.save_item(scan_id, _make_card(f"item-{i}"), ["290472-1234"]) + + def test_save_item_with_pii_counts(self, tmp_db): + scan_id = tmp_db.begin_scan({"sources": ["email"], "user_ids": []}) + pii = {"cpr": 1, "name": 2, "email": 0} + tmp_db.save_item(scan_id, _make_card(), ["290472-1234"], pii_counts=pii) + + +# ───────────────────────────────────────────────────────────────────────────── +# 3. 
CPR index — hash only, never plaintext +# ───────────────────────────────────────────────────────────────────────────── + +class TestCprIndex: + + def test_cpr_not_stored_in_plaintext(self, tmp_db): + scan_id = tmp_db.begin_scan({"sources": ["email"], "user_ids": []}) + tmp_db.save_item(scan_id, _make_card(), ["290472-1234"]) + # Read the raw DB and confirm plaintext CPR is absent + import sqlite3 + with sqlite3.connect(tmp_db._path) as con: + rows = con.execute("SELECT cpr_hash FROM cpr_index").fetchall() + assert len(rows) == 1 + stored = rows[0][0] + assert stored != "290472-1234" + assert "290472" not in stored + + def test_cpr_hash_is_sha256(self, tmp_db): + scan_id = tmp_db.begin_scan({"sources": ["email"], "user_ids": []}) + tmp_db.save_item(scan_id, _make_card(), ["290472-1234"]) + import sqlite3 + with sqlite3.connect(tmp_db._path) as con: + rows = con.execute("SELECT cpr_hash FROM cpr_index").fetchall() + stored = rows[0][0] + expected = hashlib.sha256("290472-1234".encode()).hexdigest() + assert stored == expected + + def test_lookup_finds_item(self, tmp_db): + scan_id = tmp_db.begin_scan({"sources": ["email"], "user_ids": []}) + tmp_db.save_item(scan_id, _make_card("item-x"), ["290472-1234"]) + results = tmp_db.lookup_data_subject("290472-1234") + assert len(results) >= 1 + + def test_lookup_returns_correct_item(self, tmp_db): + scan_id = tmp_db.begin_scan({"sources": ["email"], "user_ids": []}) + tmp_db.save_item(scan_id, _make_card("target-item"), ["290472-1234"]) + results = tmp_db.lookup_data_subject("290472-1234") + ids = [r.get("id") or r.get("item_id") for r in results] + assert "target-item" in ids + + def test_lookup_different_cpr_returns_empty(self, tmp_db): + scan_id = tmp_db.begin_scan({"sources": ["email"], "user_ids": []}) + tmp_db.save_item(scan_id, _make_card(), ["290472-1234"]) + results = tmp_db.lookup_data_subject("010185-4321") + assert results == [] + + def test_lookup_multiple_items_for_same_cpr(self, tmp_db): + scan_id = 
tmp_db.begin_scan({"sources": ["email"], "user_ids": []}) + tmp_db.save_item(scan_id, _make_card("item-a"), ["290472-1234"]) + tmp_db.save_item(scan_id, _make_card("item-b"), ["290472-1234"]) + results = tmp_db.lookup_data_subject("290472-1234") + assert len(results) >= 2 + + +# ───────────────────────────────────────────────────────────────────────────── +# 4. Dispositions +# ───────────────────────────────────────────────────────────────────────────── + +class TestDispositions: + + def test_get_disposition_returns_none_for_unknown(self, tmp_db): + assert tmp_db.get_disposition("nonexistent") is None + + def test_set_and_get_disposition(self, tmp_db): + scan_id = tmp_db.begin_scan({"sources": ["email"], "user_ids": []}) + tmp_db.save_item(scan_id, _make_card("disp-item"), ["290472-1234"]) + tmp_db.set_disposition("disp-item", "retain-legal", "Bogfoeringsloven", "", "admin") + disp = tmp_db.get_disposition("disp-item") + assert disp is not None + assert disp["status"] == "retain-legal" + + def test_disposition_legal_basis_stored(self, tmp_db): + scan_id = tmp_db.begin_scan({"sources": ["email"], "user_ids": []}) + tmp_db.save_item(scan_id, _make_card("disp-2"), []) + tmp_db.set_disposition("disp-2", "delete-scheduled", "Data minimisation", "", "reviewer") + disp = tmp_db.get_disposition("disp-2") + assert disp["legal_basis"] == "Data minimisation" + + def test_disposition_overwrite(self, tmp_db): + scan_id = tmp_db.begin_scan({"sources": ["email"], "user_ids": []}) + tmp_db.save_item(scan_id, _make_card("disp-3"), []) + tmp_db.set_disposition("disp-3", "unreviewed", "", "", "") + tmp_db.set_disposition("disp-3", "deleted", "", "", "admin") + disp = tmp_db.get_disposition("disp-3") + assert disp["status"] == "deleted" + + def test_all_disposition_values_accepted(self, tmp_db): + statuses = ["unreviewed", "retain-legal", "retain-legitimate", + "retain-contract", "delete-scheduled", "deleted"] + scan_id = tmp_db.begin_scan({"sources": ["email"], "user_ids": []}) + for 
i, status in enumerate(statuses): + item_id = f"disp-status-{i}" + tmp_db.save_item(scan_id, _make_card(item_id), []) + tmp_db.set_disposition(item_id, status, "", "", "test") + disp = tmp_db.get_disposition(item_id) + assert disp["status"] == status + + +# ───────────────────────────────────────────────────────────────────────────── +# 5. Export / import +# ───────────────────────────────────────────────────────────────────────────── + +class TestExportImport: + + def _populate(self, db): + scan_id = db.begin_scan({"sources": ["email"], "user_ids": []}) + db.save_item(scan_id, _make_card("exp-1"), ["290472-1234"]) + db.save_item(scan_id, _make_card("exp-2"), ["010185-4321"]) + db.set_disposition("exp-1", "retain-legal", "Bogfoeringsloven", "", "admin") + db.finish_scan(scan_id, 2) + + def test_export_creates_zip(self, tmp_db, tmp_path): + if not hasattr(tmp_db, "export_db"): + pytest.skip("export_db not implemented") + self._populate(tmp_db) + export_path = tmp_path / "export.zip" + tmp_db.export_db(str(export_path)) + assert export_path.exists() + assert export_path.stat().st_size > 0 + + def test_export_zip_contains_expected_files(self, tmp_db, tmp_path): + if not hasattr(tmp_db, "export_db"): + pytest.skip("export_db not implemented") + self._populate(tmp_db) + export_path = tmp_path / "export.zip" + tmp_db.export_db(str(export_path)) + import zipfile + with zipfile.ZipFile(export_path) as zf: + names = zf.namelist() + for expected in ["export_meta.json", "flagged_items.json", "dispositions.json"]: + assert expected in names + + def test_import_merge_adds_dispositions(self, tmp_path): + if not hasattr(ScanDB, "export_db"): + pytest.skip("export_db not implemented") + # Source DB + src = ScanDB(str(tmp_path / "src.db")) + self._populate(src) + export_path = tmp_path / "export.zip" + src.export_db(str(export_path)) + + # Target DB (fresh) + tgt = ScanDB(str(tmp_path / "tgt.db")) + tgt.import_db(str(export_path), mode="merge") + # Disposition for exp-1 should now 
exist in tgt + disp = tgt.get_disposition("exp-1") + assert disp is not None + + def test_import_replace_restores_items(self, tmp_path): + if not hasattr(ScanDB, "export_db"): + pytest.skip("export_db not implemented") + src = ScanDB(str(tmp_path / "src2.db")) + self._populate(src) + export_path = tmp_path / "export2.zip" + src.export_db(str(export_path)) + + tgt = ScanDB(str(tmp_path / "tgt2.db")) + tgt.import_db(str(export_path), mode="replace") + results = tgt.lookup_data_subject("290472-1234") + assert len(results) >= 1 diff --git a/tests/test_document_scanner.py b/tests/test_document_scanner.py new file mode 100644 index 0000000..dcc8f97 --- /dev/null +++ b/tests/test_document_scanner.py @@ -0,0 +1,224 @@ +""" +test_document_scanner.py — Tests for CPR detection in document_scanner.py. + +Covers: + - extract_matches: context-gated CPR detection + - is_valid_cpr: date validation and modulo-11 + - scan_docx: CPR detection in Word documents (including table cells) + - scan_xlsx: CPR detection in Excel cells with context + - False-positive suppression (invoices, phone numbers, account numbers) +""" +import sys +import tempfile +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).parent.parent)) +import document_scanner as ds + + +# ───────────────────────────────────────────────────────────────────────────── +# Helpers +# ───────────────────────────────────────────────────────────────────────────── + +def _cprs(text: str) -> list: + """Return list of CPR dicts found in text via extract_matches.""" + found, _ = ds.extract_matches(text, 1, "test") + return found + + +def _has_cpr(text: str) -> bool: + return bool(_cprs(text)) + + +# ───────────────────────────────────────────────────────────────────────────── +# 1. 
Date validation — is_valid_cpr +# ───────────────────────────────────────────────────────────────────────────── + +class TestIsValidCpr: + def test_valid_date_returns_true(self): + valid, _ = ds.is_valid_cpr("29", "04", "72", "1234") + assert valid is True + + def test_invalid_month_returns_false(self): + valid, _ = ds.is_valid_cpr("01", "13", "70", "1234") + assert valid is False + + def test_invalid_day_zero_returns_false(self): + valid, _ = ds.is_valid_cpr("00", "01", "70", "1234") + assert valid is False + + def test_invalid_day_32_returns_false(self): + valid, _ = ds.is_valid_cpr("32", "01", "70", "1234") + assert valid is False + + def test_february_31_invalid(self): + valid, _ = ds.is_valid_cpr("31", "02", "90", "1234") + assert valid is False + + def test_returns_tuple_of_two(self): + result = ds.is_valid_cpr("01", "01", "70", "1234") + assert isinstance(result, tuple) + assert len(result) == 2 + + def test_mod11_field_is_bool(self): + _, mod11 = ds.is_valid_cpr("01", "01", "70", "1234") + assert isinstance(mod11, bool) + + +# ───────────────────────────────────────────────────────────────────────────── +# 2. 
extract_matches — context-gated detection +# ───────────────────────────────────────────────────────────────────────────── + +class TestExtractMatches: + + # ── Should detect ───────────────────────────────────────────────────────── + + def test_detects_cpr_with_label(self): + assert _has_cpr("CPR: 290472-1234") + + def test_detects_cpr_uppercase_label(self): + assert _has_cpr("CPR-nummer: 290472-1234") + + def test_detects_personnummer_keyword(self): + assert _has_cpr("personnummer 010185-4321") + + def test_detects_no_separator(self): + assert _has_cpr("cpr nummer 2904721234") + + def test_detects_space_separator(self): + assert _has_cpr("CPR 290472 1234") + + def test_result_contains_formatted_field(self): + cprs = _cprs("CPR: 290472-1234") + assert cprs[0]["formatted"] == "290472-1234" + + def test_result_contains_raw_field(self): + cprs = _cprs("CPR: 290472-1234") + assert "raw" in cprs[0] + + def test_multiple_cprs_returned(self): + text = "CPR: 290472-1234 og personnummer 010185-4321" + cprs = _cprs(text) + assert len(cprs) == 2 + + # ── Should NOT detect ───────────────────────────────────────────────────── + + def test_rejects_naked_number_without_context(self): + # No context keyword and no mod-11 — should be suppressed + assert not _has_cpr("2904721234") + + def test_rejects_phone_number_8_digits(self): + assert not _has_cpr("ring 12345678 for info") + + def test_rejects_invoice_context(self): + assert not _has_cpr("faktura nr 290472-1234") + + def test_rejects_part_number_context(self): + assert not _has_cpr("del nr. 290472-1234") + + def test_rejects_invalid_date(self): + # Month 13 — date invalid, should not appear + assert not _has_cpr("CPR: 011370-1234") + + def test_empty_string(self): + assert not _has_cpr("") + + def test_plain_prose_no_numbers(self): + assert not _has_cpr("Ingen personoplysninger i denne tekst.") + + +# ───────────────────────────────────────────────────────────────────────────── +# 3. 
scan_docx +# ───────────────────────────────────────────────────────────────────────────── + +class TestScanDocx: + + def test_detects_cpr_in_paragraph(self, docx_with_cpr): + result = ds.scan_docx(docx_with_cpr) + assert len(result["cprs"]) >= 1 + + def test_detects_multiple_cprs(self, docx_with_cpr): + result = ds.scan_docx(docx_with_cpr) + assert len(result["cprs"]) >= 2 + + def test_detects_cpr_in_table_cell(self, docx_with_cpr): + result = ds.scan_docx(docx_with_cpr) + # Fixture: 2 CPRs in paragraphs + 1 in a table cell (with context) + assert len(result["cprs"]) >= 3 + + def test_no_false_positive_on_clean_doc(self, docx_no_cpr): + result = ds.scan_docx(docx_no_cpr) + assert result["cprs"] == [] + + def test_returns_cprs_key(self, docx_with_cpr): + result = ds.scan_docx(docx_with_cpr) + assert "cprs" in result + + def test_no_error_on_clean_doc(self, docx_no_cpr): + result = ds.scan_docx(docx_no_cpr) + assert result.get("error") is None + + +# ───────────────────────────────────────────────────────────────────────────── +# 4. scan_xlsx +# ───────────────────────────────────────────────────────────────────────────── + +class TestScanXlsx: + + def test_detects_cpr_in_cell_with_context(self, xlsx_with_cpr): + result = ds.scan_xlsx(xlsx_with_cpr) + assert len(result["cprs"]) >= 1 + + def test_no_false_positive_on_account_numbers(self, xlsx_no_cpr): + result = ds.scan_xlsx(xlsx_no_cpr) + assert result["cprs"] == [] + + def test_returns_cprs_key(self, xlsx_with_cpr): + result = ds.scan_xlsx(xlsx_with_cpr) + assert "cprs" in result + + +# ───────────────────────────────────────────────────────────────────────────── +# 5. 
# Binary / edge cases via cpr_detector._scan_bytes
# ─────────────────────────────────────────────────────────────────────────────

class TestScanBytes:
    """Byte-level entry point ``cpr_detector._scan_bytes`` plus one text case."""

    def test_binary_garbage_does_not_crash(self, binary_garbage):
        """Arbitrary bytes still produce a dict with a ``cprs`` key."""
        import cpr_detector

        payload = binary_garbage.read_bytes()
        outcome = cpr_detector._scan_bytes(payload, "sample.bin")
        assert isinstance(outcome, dict)
        assert "cprs" in outcome

    def test_empty_bytes_returns_empty(self):
        """Zero bytes of input yield an empty ``cprs`` list."""
        import cpr_detector

        outcome = cpr_detector._scan_bytes(b"", "empty.txt")
        assert outcome["cprs"] == []

    def test_txt_with_cpr_detected(self, txt_with_art9):
        """The text fixture yields at least one hit via ``ds.extract_matches``."""
        import cpr_detector
        import document_scanner as ds

        # Original author's note: scan_text in document_scanner calls an
        # undefined extract_cpr_and_dates, so extract_matches is exercised
        # directly on the file content instead.
        content = txt_with_art9.read_text(encoding='utf-8')
        hits, _ = ds.extract_matches(content, 1, 'test')
        assert len(hits) >= 1

    def test_docx_with_cpr_via_scan_bytes(self, docx_with_cpr):
        """A .docx passed as raw bytes yields at least one hit."""
        import cpr_detector

        payload = docx_with_cpr.read_bytes()
        outcome = cpr_detector._scan_bytes(payload, "sample.docx")
        hits = outcome["cprs"]
        assert len(hits) >= 1

    def test_xlsx_with_cpr_via_scan_bytes(self, xlsx_with_cpr):
        """An .xlsx passed as raw bytes yields at least one hit."""
        import cpr_detector

        payload = xlsx_with_cpr.read_bytes()
        outcome = cpr_detector._scan_bytes(payload, "sample.xlsx")
        hits = outcome["cprs"]
        assert len(hits) >= 1

    def test_unsupported_extension_does_not_crash(self):
        """An unknown file extension still returns a dict."""
        import cpr_detector

        outcome = cpr_detector._scan_bytes(b"some bytes", "file.xyz")
        assert isinstance(outcome, dict)


# ── patch metadata for the next file, preserved verbatim ─────────────────────
# diff --git a/tests/test_routes.py b/tests/test_routes.py
# new file mode 100644
# index 0000000..d909652
# --- /dev/null
# +++ b/tests/test_routes.py
# @@ -0,0 +1,277 @@
"""
Integration tests for Flask routes, driven through the real Flask test client.

Fixtures
--------
- ``flask_app`` (module-scope) — imports gdpr_scanner once and switches the app
                                 into TESTING mode with CSRF checks disabled.
- ``client`` (function-scope)  — a fresh ``test_client()`` for every test.
- ``db_patch`` (function-scope) — swaps ``_get_db`` in routes.database and
                                 routes.export for a ScanDB stored under
                                 ``tmp_path`` and sets ``DB_OK = True`` on both.
- ``mock_connector``           — sets routes.state.connector to a MagicMock so
                                 the ``if not state.connector`` guard passes.
- ``clean_state``              — autouse; after each test it clears
                                 routes.state.flagged_items and probes the scan
                                 lock (a lock that is still held is left alone).
"""
import io
import threading
import time
from unittest.mock import MagicMock

import pytest


# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------

@pytest.fixture(scope="module")
def flask_app():
    """Return the ``gdpr_scanner`` Flask app with TESTING on and CSRF off."""
    import gdpr_scanner

    app = gdpr_scanner.app
    app.config["TESTING"] = True
    app.config["WTF_CSRF_ENABLED"] = False
    return app


@pytest.fixture()
def client(flask_app):
    """Yield a test client that is closed again when the test finishes."""
    with flask_app.test_client() as http:
        yield http


@pytest.fixture()
def db_patch(tmp_path, monkeypatch):
    """Back routes.database and routes.export with a ScanDB under ``tmp_path``.

    Returns the ScanDB instance so a test can inspect it directly.
    """
    from gdpr_db import ScanDB
    import routes.database
    import routes.export

    db = ScanDB(str(tmp_path / "test.db"))
    for module in (routes.database, routes.export):
        monkeypatch.setattr(module, "_get_db", lambda: db)
        monkeypatch.setattr(module, "DB_OK", True)
    return db


@pytest.fixture()
def mock_connector(monkeypatch):
    """Install a MagicMock as ``routes.state.connector`` and return it.

    Original author's note: /api/scan/start is handled exclusively by the
    blueprint (routes/scan.py), which checks ``state.connector``, so patching
    that one attribute is sufficient.
    """
    from routes import state

    fake = MagicMock()
    monkeypatch.setattr(state, "connector", fake)
    return fake


@pytest.fixture(autouse=True)
def clean_state():
    """Clear in-memory scan results after every test and probe the scan lock."""
    from routes import state

    yield

    # Flagged items live in module state; drop them so export tests stay
    # independent of each other.
    state.flagged_items.clear()
    # Non-blocking probe: a free lock is taken and handed straight back. A lock
    # that is still held is deliberately left alone — the test that took it is
    # responsible for releasing it.
    if state._scan_lock.acquire(blocking=False):
        state._scan_lock.release()


# ---------------------------------------------------------------------------
# /api/scan/status
# ---------------------------------------------------------------------------

class TestScanStatus:
    """GET /api/scan/status while no scan is in progress."""

    def test_idle_returns_not_running(self, client):
        """``running`` is exactly ``False``."""
        resp = client.get("/api/scan/status")
        assert resp.status_code == 200
        payload = resp.get_json()
        assert payload["running"] is False

    def test_scan_id_is_none_when_idle(self, client):
        """``scan_id`` is present and ``None``."""
        resp = client.get("/api/scan/status")
        payload = resp.get_json()
        assert "scan_id" in payload
        assert payload["scan_id"] is None


# ---------------------------------------------------------------------------
# /api/scan/start
# ---------------------------------------------------------------------------

class TestScanStart:
    """POST /api/scan/start: auth guard, lock guard and the happy path."""

    def test_unauthenticated_returns_401(self, client, monkeypatch):
        """With no connector the route answers 401."""
        from routes import state

        monkeypatch.setattr(state, "connector", None)
        resp = client.post("/api/scan/start", json={})
        assert resp.status_code == 401
        message = resp.get_json()["error"]
        assert "not authenticated" in message

    def test_lock_held_returns_409(self, client, mock_connector):
        """While the scan lock is held the route answers 409."""
        from routes import state

        # Take the lock ourselves to imitate a scan that is already running.
        got_lock = state._scan_lock.acquire(blocking=False)
        assert got_lock, "Lock should be free at test start"
        try:
            resp = client.post("/api/scan/start", json={})
            assert resp.status_code == 409
            message = resp.get_json()["error"]
            assert "already running" in message
        finally:
            state._scan_lock.release()

    def test_authenticated_returns_started(self, client, mock_connector, monkeypatch):
        """With a connector in place the route reports ``started``."""
        import scan_engine
        from routes import state

        # A no-op run_scan lets the background thread finish immediately.
        monkeypatch.setattr(scan_engine, "run_scan", lambda opts: None)
        resp = client.post("/api/scan/start", json={"sources": ["email"]})
        assert resp.status_code == 200
        assert resp.get_json()["status"] == "started"
        # Poll for up to two seconds until the lock has been released.
        give_up_at = time.time() + 2.0
        while True:
            if state._scan_lock.acquire(blocking=False):
                break
            assert time.time() < give_up_at, "scan lock was never released"
            time.sleep(0.05)
        state._scan_lock.release()


# ---------------------------------------------------------------------------
# /api/scan/stop
# ---------------------------------------------------------------------------

class TestScanStop:
    """POST /api/scan/stop."""

    def test_stop_always_returns_200(self, client):
        """Stopping answers 200 / ``stopping`` even when nothing is running."""
        resp = client.post("/api/scan/stop")
        assert resp.status_code == 200
        assert resp.get_json()["status"] == "stopping"


# ---------------------------------------------------------------------------
# /api/db/stats
# ---------------------------------------------------------------------------

class TestDbStats:
    """GET /api/db/stats with and without a usable database."""

    def test_without_db_returns_503(self, client, monkeypatch):
        """``DB_OK = False`` turns the route into a 503."""
        import routes.database

        monkeypatch.setattr(routes.database, "DB_OK", False)
        resp = client.get("/api/db/stats")
        assert resp.status_code == 503

    def test_with_db_returns_200(self, client, db_patch):
        """With the patched database the route returns a JSON object."""
        # Original author's note: the direct route in gdpr_scanner.py takes
        # precedence over the blueprint and returns get_stats() as-is — an
        # empty dict for a fresh DB.
        resp = client.get("/api/db/stats")
        assert resp.status_code == 200
        payload = resp.get_json()
        assert isinstance(payload, dict)


# ---------------------------------------------------------------------------
# /api/db/disposition
# ---------------------------------------------------------------------------

class TestDisposition:
    """POST /api/db/disposition and GET /api/db/disposition/<item_id>."""

    def test_set_disposition_missing_item_id_returns_400(self, client, db_patch):
        """A body without ``item_id`` is rejected and the error names the field."""
        resp = client.post("/api/db/disposition", json={"status": "retain-legal"})
        assert resp.status_code == 400
        message = resp.get_json()["error"]
        assert "item_id" in message

    def test_set_disposition_saves_and_get_returns_it(self, client, db_patch):
        """A saved disposition can be read back through the GET route."""
        item_id = "test-item-abc123"
        body = {
            "item_id": item_id,
            "status": "retain-legal",
            "legal_basis": "GDPR Art. 6(1)(c)",
            "notes": "Required by law",
        }

        # Write ...
        saved = client.post("/api/db/disposition", json=body)
        assert saved.status_code == 200
        assert saved.get_json()["status"] == "saved"

        # ... then read it back.
        fetched = client.get(f"/api/db/disposition/{item_id}")
        assert fetched.status_code == 200
        payload = fetched.get_json()
        assert payload["status"] == "retain-legal"

    def test_get_disposition_unknown_id_returns_unreviewed(self, client, db_patch):
        """An id that was never stored reads back as ``unreviewed``."""
        resp = client.get("/api/db/disposition/no-such-item")
        assert resp.status_code == 200
        assert resp.get_json()["status"] == "unreviewed"

    def test_without_db_returns_503(self, client, monkeypatch):
        """``DB_OK = False`` turns the POST route into a 503."""
        import routes.database

        monkeypatch.setattr(routes.database, "DB_OK", False)
        body = {"item_id": "x", "status": "retain-legal"}
        resp = client.post("/api/db/disposition", json=body)
        assert resp.status_code == 503


# ---------------------------------------------------------------------------
# /api/export_excel
# ---------------------------------------------------------------------------

class TestExportExcel:
    """GET /api/export_excel."""

    XLSX_MIME = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"

    def test_empty_db_returns_workbook(self, client, db_patch):
        """Even with nothing flagged the route returns an .xlsx payload."""
        resp = client.get("/api/export_excel")
        assert resp.status_code == 200
        assert self.XLSX_MIME in resp.content_type
        # .xlsx is a zip container, so the payload starts with "PK".
        magic = resp.data[:2]
        assert magic == b"PK"

    def test_with_items_in_memory_includes_data(self, client, db_patch):
        """One flagged item in memory yields a workbook above 4096 bytes."""
        from routes import state

        flagged = {
            "id": "item-001",
            "name": "test_file.docx",
            "source": "onedrive",
            "cpr_count": 2,
            "face_count": 0,
            "account_name": "Anna Hansen",
            "user_role": "staff",
            "modified": "2025-01-15T10:00:00",
            "size_kb": 42,
            "url": "https://example.com/file",
        }
        state.flagged_items.append(flagged)
        resp = client.get("/api/export_excel")
        assert resp.status_code == 200
        magic = resp.data[:2]
        assert magic == b"PK"
        # Size threshold taken from the original test: a workbook with data is
        # expected to be larger than a skeleton workbook.
        assert len(resp.data) > 4096


# ---------------------------------------------------------------------------
# /api/export_article30
# ---------------------------------------------------------------------------

class TestExportArticle30:
    """GET /api/export_article30."""

    DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"

    def test_no_items_returns_400(self, client, db_patch):
        """With nothing flagged the export is refused with a 400."""
        resp = client.get("/api/export_article30")
        assert resp.status_code == 400
        message = resp.get_json()["error"].lower()
        assert "scan first" in message

    def test_with_items_returns_docx(self, client, db_patch):
        """One flagged item in memory is enough to get a .docx back."""
        from routes import state

        flagged = {
            "id": "item-002",
            "name": "payroll.xlsx",
            "source": "email",
            "cpr_count": 1,
            "account_name": "Test User",
            "user_role": "staff",
            "modified": "2025-03-01T09:00:00",
            "size_kb": 10,
        }
        state.flagged_items.append(flagged)
        resp = client.get("/api/export_article30")
        assert resp.status_code == 200
        assert self.DOCX_MIME in resp.content_type
        # .docx is a zip container, so the payload starts with "PK".
        magic = resp.data[:2]
        assert magic == b"PK"