GDPRScanner/install_macos.sh
2026-04-11 04:38:11 +02:00

424 lines
17 KiB
Bash
Executable File

#!/usr/bin/env bash
# ══════════════════════════════════════════════════════════════════════════════
# Document Scanner — macOS Installation Script
# ══════════════════════════════════════════════════════════════════════════════
# Installs all dependencies for document_scanner.py, server.py, build.py,
# gdpr_scanner.py and m365_connector.py:
# - Homebrew (if not present)
# - Python 3.11 or 3.12 (3.13+ blocked — spaCy incompatible)
# - Tesseract OCR with Danish + English language packs
# - Poppler (required by pdf2image for PDF rendering)
# - A virtualenv at ./venv with all Python packages
# - spaCy Danish NER model (~500 MB)
#
# All Python packages are installed into a virtualenv (./venv) to avoid the
# "externally-managed-environment" error from Homebrew Python 3.12+.
#
# Usage:
# chmod +x install_macos.sh && ./install_macos.sh
# ══════════════════════════════════════════════════════════════════════════════
set -euo pipefail
# ── Colours ───────────────────────────────────────────────────────────────────
# ANSI escape sequences, stored as literal backslash text; they only become
# control codes when printed with escape processing (echo -e / printf %b).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'
BOLD='\033[1m'
RESET='\033[0m'

# step MESSAGE — print a blank line followed by a cyan "==> MESSAGE" heading.
step() {
  printf '%b\n' "\n${CYAN}==> $1${RESET}"
}

# ok MESSAGE — print MESSAGE behind a green [OK] tag.
ok() {
  printf '%b\n' " ${GREEN}[OK]${RESET} $1"
}

# warn MESSAGE — print MESSAGE behind a yellow [!!] tag; execution continues.
warn() {
  printf '%b\n' " ${YELLOW}[!!]${RESET} $1"
}

# fail MESSAGE — print MESSAGE behind a red [XX] tag, then exit with status 1.
fail() {
  printf '%b\n' " ${RED}[XX]${RESET} $1"
  exit 1
}
# The virtualenv lives in ./venv, in the same directory as this script,
# regardless of the directory the script is invoked from.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
VENV_DIR="$SCRIPT_DIR/venv"

# Banner
printf '\n'
printf '%b\n' "${BOLD} Document Scanner — macOS Setup${RESET}"
printf '%s\n' " -----------------------------------------"
printf '\n'

# ── 0. Detect architecture ────────────────────────────────────────────────────
# BREW_PREFIX is chosen from the machine type reported by `uname -m`:
# arm64 → /opt/homebrew, anything else → /usr/local.
ARCH=$(uname -m)
case "$ARCH" in
  arm64)
    BREW_PREFIX="/opt/homebrew"
    ok "Apple Silicon (M-series) — Homebrew prefix: $BREW_PREFIX"
    ;;
  *)
    BREW_PREFIX="/usr/local"
    ok "Intel Mac — Homebrew prefix: $BREW_PREFIX"
    ;;
esac
# ── 1. Install Homebrew ───────────────────────────────────────────────────────
# Uses an existing `brew` from PATH when there is one; otherwise runs the
# official installer and loads its environment into this shell.
step "Checking Homebrew"
if command -v brew &>/dev/null; then
  ok "Homebrew already installed: $(brew --version | head -1)"
else
  echo " Installing Homebrew..."
  # Download first and check the result: feeding `$(curl …)` straight into
  # `bash -c` would run an empty script (exit status 0) when the download
  # fails, and the failure would go unnoticed.
  brew_installer=$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh) \
    || fail "Could not download the Homebrew installer — check your network connection"
  /bin/bash -c "$brew_installer"
  eval "$("$BREW_PREFIX/bin/brew" shellenv)"
  # `eval` of an empty string succeeds, so confirm brew is really reachable.
  command -v brew &>/dev/null \
    || fail "Homebrew install finished but 'brew' is not on PATH (expected $BREW_PREFIX/bin/brew)"
  ok "Homebrew installed"
fi
# Best effort: make sure the brew environment for the expected prefix is loaded
# even when brew was found on PATH above. Failure here is deliberately ignored.
eval "$("$BREW_PREFIX/bin/brew" shellenv)" 2>/dev/null || true
# ── 2. Find or install Python 3.11 / 3.12 ────────────────────────────────────
# Homebrew Python 3.12+ is "externally managed" — pip installs must go into
# a virtualenv. We find a compatible base interpreter here; all packages will
# be installed into ./venv below, not into the system interpreter.
step "Checking Python (need 3.11 or 3.12 — spaCy incompatible with 3.13+)"
#######################################
# Print the first candidate interpreter that reports Python 3.11 or 3.12.
# Candidates are tried in order: the Homebrew 3.12 and 3.11 binaries under
# $BREW_PREFIX/bin, then python3.12, python3.11, python3 and python from PATH.
# Globals:  BREW_PREFIX (read)
# Outputs:  the matching candidate exactly as listed (full path or bare name)
# Returns:  0 if a candidate was printed, 1 if none qualified
#######################################
find_compatible_python() {
  local candidate banner
  local -r version_re='([0-9]+)\.([0-9]+)\.[0-9]+'
  local -a candidates=(
    "$BREW_PREFIX/bin/python3.12"
    "$BREW_PREFIX/bin/python3.11"
    python3.12 python3.11 python3 python
  )

  for candidate in "${candidates[@]}"; do
    # Skip anything that is not an executable command.
    command -v "$candidate" &>/dev/null || continue

    # The exit status of --version is ignored; only its text matters.
    banner=$("$candidate" --version 2>&1) || true

    # The first X.Y.Z in the output supplies major (X) and minor (Y).
    [[ "$banner" =~ $version_re ]] || continue
    case "${BASH_REMATCH[1]}.${BASH_REMATCH[2]}" in
      3.11 | 3.12)
        printf '%s\n' "$candidate"
        return 0
        ;;
    esac
  done

  return 1
}
# Pick the base interpreter used to build the virtualenv. If no compatible
# one is already present, install Python 3.12 via Homebrew, falling back to
# 3.11 when the 3.12 binary does not appear.
BASE_PYTHON=""
if BASE_PYTHON=$(find_compatible_python); then
  ok "Compatible Python: $("$BASE_PYTHON" --version 2>&1) ($BASE_PYTHON)"
else
  if command -v python3 &>/dev/null; then
    # `|| true`: with pipefail, a grep that finds no version number would
    # otherwise fail the assignment and abort the script under `set -e`.
    EXISTING=$(python3 --version 2>&1 | grep -oE '[0-9]+\.[0-9]+' | head -1) || true
    EXIST_MIN=$(echo "$EXISTING" | cut -d. -f2)
    if [[ "${EXIST_MIN:-0}" -ge 13 ]]; then
      warn "Python $EXISTING is too new (spaCy requires ≤ 3.12)"
    fi
  fi
  echo " Installing Python 3.12 via Homebrew..."
  # Do not let `set -e` abort on a failed install: the 3.11 fallback below
  # has to get its chance, and the -x check decides the final outcome.
  brew install python@3.12 || warn "brew install python@3.12 failed"
  BASE_PYTHON="$BREW_PREFIX/bin/python3.12"
  if [[ ! -x "$BASE_PYTHON" ]]; then
    echo " python3.12 not found, trying python3.11..."
    brew install python@3.11 || warn "brew install python@3.11 failed"
    BASE_PYTHON="$BREW_PREFIX/bin/python3.11"
  fi
  [[ -x "$BASE_PYTHON" ]] || fail "Python install failed. Try: brew install python@3.12"
  ok "Python installed: $("$BASE_PYTHON" --version 2>&1)"
fi
# Confirm version — final guard whichever path selected the interpreter.
"$BASE_PYTHON" --version 2>&1 | grep -qE 'Python 3\.(11|12)' \
  || fail "Unexpected version: $("$BASE_PYTHON" --version 2>&1)"
# ── 3. Create virtualenv ──────────────────────────────────────────────────────
# Reuse ./venv when its interpreter is Python x.11 or x.12; otherwise build it
# (or rebuild it) from $BASE_PYTHON. Every path expansion is quoted so the
# script also works when its directory name contains spaces.
step "Setting up virtualenv at $VENV_DIR"
if [[ -d "$VENV_DIR" && -x "$VENV_DIR/bin/python" ]]; then
  # Validate it was built with a compatible interpreter.
  # `|| true`: an unreadable version must lead to a rebuild, not to an abort
  # caused by pipefail + set -e.
  VENV_VER=$("$VENV_DIR/bin/python" --version 2>&1 | grep -oE '[0-9]+\.[0-9]+' | head -1) || true
  VENV_MIN=$(echo "$VENV_VER" | cut -d. -f2)
  if [[ "$VENV_MIN" == "11" || "$VENV_MIN" == "12" ]]; then
    ok "Existing virtualenv is compatible (Python $VENV_VER) — reusing"
  else
    warn "Existing virtualenv uses Python $VENV_VER — rebuilding"
    # ${VENV_DIR:?} refuses to run rm -rf if the variable is ever empty.
    rm -rf -- "${VENV_DIR:?}"
    "$BASE_PYTHON" -m venv "$VENV_DIR"
    ok "Virtualenv rebuilt"
  fi
else
  "$BASE_PYTHON" -m venv "$VENV_DIR"
  ok "Virtualenv created"
fi
# All subsequent Python/pip commands use the venv
PYTHON="$VENV_DIR/bin/python"
# PIP is a plain string: expanded unquoted it only works when the path has no
# whitespace, so this section calls pip through "$PYTHON" -m pip instead.
PIP="$PYTHON -m pip"
# Upgrade pip inside the venv (no restrictions here)
echo " Upgrading pip..."
"$PYTHON" -m pip install --upgrade pip --quiet
ok "pip up to date: $("$PYTHON" -m pip --version)"
# ── 4. Install Tesseract OCR ──────────────────────────────────────────────────
# Installs the tesseract and tesseract-lang formulae when `brew list` does not
# know them, then checks that the Danish ("dan") language is listed.
step "Installing Tesseract OCR + language packs"

# Engine
if ! brew list tesseract &>/dev/null; then
  brew install tesseract
  ok "Tesseract installed: $(tesseract --version 2>&1 | head -1)"
else
  ok "Tesseract already installed: $(tesseract --version 2>&1 | head -1)"
fi

# Language data
if ! brew list tesseract-lang &>/dev/null; then
  echo " Installing tesseract-lang (~300 MB)..."
  brew install tesseract-lang
  ok "Language packs installed"
else
  ok "Tesseract language packs already installed"
fi

# A line consisting of exactly "dan" in the language listing means Danish OCR
# data is present; its absence is only a warning.
if ! tesseract --list-langs 2>&1 | grep -q -x "dan"; then
  warn "Danish language pack not found — try: brew reinstall tesseract-lang"
else
  ok "Danish (dan) OCR available"
fi
# ── 5. Install Poppler ────────────────────────────────────────────────────────
# Installs the poppler formula when `brew list` does not know it, then reports
# whether its pdftoppm tool can be found on PATH.
step "Installing Poppler (required for PDF rendering)"
if ! brew list poppler &>/dev/null; then
  brew install poppler
  ok "Poppler installed"
else
  ok "Poppler already installed"
fi

# A missing pdftoppm is not fatal here — it only produces a warning.
if command -v pdftoppm &>/dev/null; then
  ok "pdftoppm: $(which pdftoppm)"
else
  warn "pdftoppm not on PATH — launcher will probe Homebrew paths automatically"
fi
# ── 6. Install Python packages into venv ─────────────────────────────────────
# Each package is installed separately so that one failure does not stop the
# rest; failures are collected and summarised at the end of the section.
step "Installing Python packages into virtualenv"
packages=(
  "flask"
  "pdfplumber"
  "pdf2image"
  "pytesseract"
  "pypdf"
  "reportlab"
  "python-docx"
  "openpyxl"
  "img2pdf"
  "opencv-python-headless"
  "numpy"
  "Pillow"
  "spacy"
  "py7zr"
  "pymupdf"
  "pywebview"
  "pystray"
  "pyinstaller"
  "pyinstaller-hooks-contrib"
  # GDPRScanner
  "msal"
  "requests"
  # Optional — File system scanning (#8)
  # smbprotocol: native SMB2/3 without mounting (needed for network share scanning)
  # keyring: OS keychain credential storage for SMB passwords
  # python-dotenv: .env file fallback for headless SMB credentials
  "smbprotocol"
  "keyring"
  "python-dotenv"
  # Scheduler (#19)
  "APScheduler"
  # Google Workspace scanning (#10)
  "google-auth"
  "google-auth-httplib2"
  "google-api-python-client"
)
failed=()
for pkg in "${packages[@]}"; do
  printf " %-36s" "$pkg..."
  # pip is invoked as "$PYTHON" -m pip (quoted) so a virtualenv path that
  # contains spaces still works. pip's output is captured rather than
  # discarded so the reason for a failure can be shown.
  if pip_log=$("$PYTHON" -m pip install "$pkg" --quiet --disable-pip-version-check 2>&1); then
    echo -e "${GREEN}OK${RESET}"
  else
    echo -e "${RED}FAILED${RESET}"
    failed+=("$pkg")
    # Show only the last line of pip's output to keep the listing compact.
    if [[ -n "$pip_log" ]]; then
      printf '      %s\n' "${pip_log##*$'\n'}"
    fi
  fi
done
if [[ ${#failed[@]} -gt 0 ]]; then
  warn "Failed: ${failed[*]}"
  warn "Retry: $PIP install ${failed[*]}"
fi
# ── 7. Install create-dmg ─────────────────────────────────────────────────────
# Optional tool: a failed install only produces a warning.
step "Checking create-dmg (optional — for .dmg packaging)"
if command -v create-dmg &>/dev/null; then
  ok "create-dmg already installed"
elif brew install create-dmg 2>/dev/null; then
  ok "create-dmg installed"
else
  warn "create-dmg unavailable — install manually: brew install create-dmg"
fi
# ── 8. Install spaCy Danish NER model ─────────────────────────────────────────
# Tries the large, medium and small Danish models in that order and stops at
# the first one that installs. A missing model is reported as a warning only.
step "Installing spaCy Danish NER model (~500 MB)"
# spaCy's download command uses shutil.which("pip") to find a package
# installer. Inside a venv the wrapper may be named pip3 only. Ensure a
# `pip` executable exists so spaCy can find it.
if [[ ! -x "$VENV_DIR/bin/pip" ]]; then
  echo " Creating pip wrapper in venv (needed by spaCy download)…"
  cat > "$VENV_DIR/bin/pip" << 'PIPSHIM'
#!/usr/bin/env bash
exec "$(dirname "$0")/python3" -m pip "$@"
PIPSHIM
  chmod +x "$VENV_DIR/bin/pip"
fi
# Verify pip is now visible
if "$VENV_DIR/bin/pip" --version &>/dev/null; then
  ok "pip available: $("$VENV_DIR/bin/pip" --version 2>&1)"
else
  warn "pip wrapper not working — will use direct pip install fallback"
fi
# "$PYTHON" is quoted everywhere below so a virtualenv path containing spaces
# still works.
if "$PYTHON" -c "import da_core_news_lg" &>/dev/null; then
  ok "spaCy Danish model already installed"
else
  installed=false
  for model in da_core_news_lg da_core_news_md da_core_news_sm; do
    echo " Trying $model..."
    # Method 1: spacy download with venv/bin explicitly on PATH
    # (spaCy uses shutil.which("pip") which searches PATH)
    if PATH="$VENV_DIR/bin:$PATH" "$PYTHON" -m spacy download "$model" 2>/dev/null; then
      ok "Installed: $model (via spacy download)"
      installed=true
      break
    fi
    # Method 2: direct pip install by package name.
    # NOTE(review): spaCy pipelines are normally distributed as release
    # archives rather than through PyPI — confirm that this name resolves to
    # the official package before relying on this fallback.
    echo " spacy download failed — trying pip install..."
    if "$PYTHON" -m pip install "$model" 2>&1; then
      # pip reporting success is not enough; the model must be importable.
      if "$PYTHON" -c "import ${model//-/_}" &>/dev/null; then
        ok "Installed: $model (via pip)"
        installed=true
        break
      else
        warn "$model pip install reported success but import failed"
      fi
    fi
  done
  if [[ "$installed" == false ]]; then
    warn "No spaCy model installed — anonymisation unavailable"
    warn "Retry manually: $PIP install da_core_news_sm"
  fi
fi
# ── 9. Verify ─────────────────────────────────────────────────────────────────
# Prints tool versions, then imports every required package inside the venv.
# Sets ALL_OK to 0 when all required imports succeed, otherwise to the
# non-zero exit status of the check. Optional packages are reported only.
step "Verifying installation"
ok "Python (venv): $("$PYTHON" --version 2>&1)"
ok "Tesseract: $(tesseract --version 2>&1 | head -1)"
ok "Poppler: $(pdftoppm -v 2>&1 | head -1 || echo 'available via Homebrew PATH')"
# The status is captured with `|| ALL_OK=$?` because the script runs under
# `set -e`: a bare failing command would abort the script on the spot, before
# a following `ALL_OK=$?` line could ever record the failure.
ALL_OK=0
"$PYTHON" - <<'PYCHECK' || ALL_OK=$?
import sys

# (display name, importable module name)
checks = [
    ('flask', 'flask'),
    ('pdfplumber', 'pdfplumber'),
    ('pdf2image', 'pdf2image'),
    ('pytesseract', 'pytesseract'),
    ('pypdf', 'pypdf'),
    ('reportlab', 'reportlab'),
    ('python-docx', 'docx'),
    ('openpyxl', 'openpyxl'),
    ('opencv-python-headless', 'cv2'),
    ('numpy', 'numpy'),
    ('Pillow', 'PIL'),
    ('spacy', 'spacy'),
    ('img2pdf', 'img2pdf'),
    ('pywebview', 'webview'),
    ('pystray', 'pystray'),
    ('PyInstaller', 'PyInstaller'),
    ('py7zr', 'py7zr'),
    # GDPRScanner
    ('msal', 'msal'),
    ('requests', 'requests'),
]
# (display name, importable module name, description) — absence is not an error
optional_checks = [
    ('smbprotocol', 'smbprotocol', 'SMB/CIFS network share scanning'),
    ('keyring', 'keyring', 'OS keychain credential storage'),
    ('python-dotenv', 'dotenv', '.env file credential fallback'),
    ('APScheduler', 'apscheduler', 'In-process scheduled scans'),
]
missing = []
for name, imp in checks:
    try:
        __import__(imp)
        print(f' \033[32m[OK]\033[0m {name}')
    except ImportError:
        print(f' \033[31m[!!]\033[0m {name} MISSING')
        missing.append(name)
print('\n Optional (file system scanning):')
for name, imp, desc in optional_checks:
    try:
        __import__(imp)
        print(f' \033[32m[OK]\033[0m {name} — {desc}')
    except ImportError:
        print(f' \033[33m[--]\033[0m {name} — {desc} (not installed)')
if missing:
    print(f'\n Missing: {", ".join(missing)}')
    sys.exit(1)
print('\n All packages verified.')
PYCHECK
# ── 10. Shell profile ─────────────────────────────────────────────────────────
# Appends the `brew shellenv` line to the login shell's startup file unless
# the file already mentions it. Shells other than zsh and bash are left alone.
step "Shell PATH configuration"
SHELL_RC=""
# bash is matched first so it takes precedence if $SHELL matches both patterns.
case "$SHELL" in
  *bash*) SHELL_RC="$HOME/.bash_profile" ;;
  *zsh*) SHELL_RC="$HOME/.zshrc" ;;
esac
if [[ -n "$SHELL_RC" ]]; then
  if ! grep -q "brew shellenv" "$SHELL_RC" 2>/dev/null; then
    # Blank separator line, a marker comment, then the eval line itself.
    printf '\n# Homebrew\neval "$(%s/bin/brew shellenv)"\n' "$BREW_PREFIX" >> "$SHELL_RC"
    ok "Homebrew added to $SHELL_RC — restart Terminal or: source $SHELL_RC"
  else
    ok "Homebrew already configured in $SHELL_RC"
  fi
fi
# ── 11. Create launch scripts ─────────────────────────────────────────────────
# Writes two small wrapper scripts next to this installer. The heredocs use a
# quoted delimiter, so their bodies are written out verbatim with no expansion
# at install time.
step "Creating launch scripts"

# start_gdpr.sh — activates ./venv and runs gdpr_scanner.py with all arguments
cat > "$SCRIPT_DIR/start_gdpr.sh" <<'START_GDPR_EOF'
#!/usr/bin/env bash
# GDPRScanner — launch script (uses ./venv)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/venv/bin/activate"
exec python3 "$SCRIPT_DIR/gdpr_scanner.py" "${@}"
START_GDPR_EOF
chmod +x "$SCRIPT_DIR/start_gdpr.sh"
ok "Created: start_gdpr.sh"

# build_gdpr.sh — activates ./venv and runs build_gdpr.py --clean
cat > "$SCRIPT_DIR/build_gdpr.sh" <<'BUILD_GDPR_EOF'
#!/usr/bin/env bash
# GDPRScanner — build .app (uses ./venv)
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/venv/bin/activate"
exec python3 "$SCRIPT_DIR/build_gdpr.py" --clean "$@"
BUILD_GDPR_EOF
chmod +x "$SCRIPT_DIR/build_gdpr.sh"
ok "Created: build_gdpr.sh"
# ── Done ──────────────────────────────────────────────────────────────────────
# Closing summary: overall result (taken from ALL_OK) followed by usage hints.
printf '\n'
echo " -----------------------------------------"
if [[ $ALL_OK -eq 0 ]]; then
  echo -e " ${GREEN}${BOLD}Installation complete!${RESET}"
else
  echo -e " ${YELLOW}${BOLD}Installation complete with warnings — see above${RESET}"
fi
printf '\n'

# How to start the scanner
echo -e " ${BOLD}GDPRScanner:${RESET}"
echo -e " ${CYAN}./start_gdpr.sh${RESET}"
echo " Then open: http://127.0.0.1:5100"
printf '\n'

# Optional file-system / SMB scanning examples
echo -e " ${BOLD}File system scanning (optional):${RESET}"
echo -e " ${CYAN}./start_gdpr.sh --scan-path ~/Documents${RESET}"
echo -e " ${CYAN}./start_gdpr.sh --scan-path //nas/shares --smb-user 'DOMAIN\\user'${RESET}"
echo " Or use the '📁 File sources' panel in the GDPRScanner UI"
printf '\n'

# Packaging
echo -e " ${BOLD}Build standalone app:${RESET}"
echo -e " ${CYAN}./build_gdpr.sh${RESET} → dist/GDPRScanner.app"
printf '\n'
echo " -----------------------------------------"
printf '\n'