# GDPRScanner/routes/export.py
# 2026-04-25 08:48:54 +02:00
#
# 1267 lines
# 60 KiB
# Python

"""
Excel and Article 30 export, bulk delete
"""
from __future__ import annotations
import json, io, re, traceback, logging
from pathlib import Path
from flask import Blueprint, Response, jsonify, request
from routes import state
from app_config import _GUID_RE, _resolve_display_name
# Optional dependency: the scan database. When gdpr_db cannot be imported,
# DB_OK is False and _get_db() is replaced by a stub that always returns None,
# so callers can test DB_OK (or the returned handle) instead of the import.
try:
    from gdpr_db import get_db as _get_db
    DB_OK = True
except ImportError:
    DB_OK = False
    def _get_db(*a, **kw): return None # type: ignore[misc]
# Optional dependency: define a local placeholder exception class so the name
# M365PermissionError always exists even when m365_connector is unavailable.
try:
    from m365_connector import M365PermissionError
except ImportError:
    class M365PermissionError(Exception): pass # type: ignore[no-redef]
# Flask blueprint that the route handlers in this module register on.
bp = Blueprint("export", __name__)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
def _build_excel_bytes(role: str = "") -> tuple[bytes, str]:
    """Build the scan Excel workbook and return ``(bytes, filename)``.

    The workbook contains a "Summary" sheet, one sheet per source type that
    has flagged items, a "GPS locations" sheet when any item carries EXIF GPS
    data, and an "External transfers" sheet when any item has a transfer risk.
    Items are read from ``state.flagged_items``.
    Used by export_excel() and send_report().

    Args:
        role: '' = all items, 'student' = only items whose ``user_role`` is
            "student", 'staff' = every item whose ``user_role`` is not
            "student" (staff + other).

    Returns:
        The serialised .xlsx content and a timestamped file name.

    Raises:
        ImportError: if openpyxl is not installed.
        Exception: anything openpyxl raises while building or saving.
    """
    import datetime as _dt
    from openpyxl import Workbook
    from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
    from openpyxl.utils import get_column_letter

    HEADER_BG = "1F3864"
    HEADER_FG = "FFFFFF"
    ALT_BG = "EEF2FF"
    # source_type -> (sheet/summary label, tab colour)
    SOURCE_MAP = {
        "email": ("📧 Outlook", "D6E4F7"),
        "onedrive": ("💾 OneDrive", "D6F7E4"),
        "sharepoint": ("🌐 SharePoint", "FFF0D6"),
        "teams": ("💬 Teams", "F7D6F0"),
        "gmail": ("📧 Gmail", "D6EAF8"),
        "gdrive": ("💾 Google Drive", "D5F5E3"),
        "local": ("📁 Local", "E6F7E6"),
        "smb": ("🌐 Network", "E0F0FA"),
        "sftp": ("🔒 SFTP", "EDE9F7"),
    }
    # (column header, column width)
    COLS = [
        ("Name / Subject", 45),
        ("CPR Hits", 9),
        ("Face count", 9),
        ("GPS", 6),
        ("Special category", 22),
        ("EXIF author", 18),
        ("Folder", 30),
        ("Account", 24),
        ("Role", 10),
        ("Disposition", 18),
        ("Date Modified", 14),
        ("Size (KB)", 10),
        ("URL", 50),
    ]
    # 1-based index of the URL column, derived from COLS so reordering the
    # columns cannot silently style the wrong cell as a hyperlink.
    URL_COL = [name for name, _ in COLS].index("URL") + 1
    # Tags excluded from the "Special category" column — presumably because
    # GPS and EXIF author are reported in their own columns.
    EXIF_ONLY_TAGS = ("gps_location", "exif_pii")

    thin = Side(style="thin", color="CCCCCC")
    border = Border(left=thin, right=thin, top=thin, bottom=thin)
    now = _dt.datetime.now()

    # Resolve the DB handle once per export instead of once per row.
    db = None
    if DB_OK:
        try:
            db = _get_db()
        except Exception:
            logger.debug("excel export: could not open scan DB", exc_info=True)

    def _fill(hex_col):
        return PatternFill("solid", fgColor=hex_col)

    def _disposition(item) -> str:
        """Return the stored disposition status for *item*, or '' (best effort)."""
        if db is None:
            return ""
        try:
            rec = db.get_disposition(item.get("id", ""))
            return rec.get("status", "") if rec else ""
        except Exception:
            logger.debug("excel export: disposition lookup failed", exc_info=True)
            return ""

    def _special_labels(raw) -> str:
        """Render special_category as a comma-separated string.

        Accepts a list, a JSON-encoded list (the Article 30 builder in this
        file decodes the same string form), a plain string, or None.
        """
        if isinstance(raw, str):
            try:
                raw = json.loads(raw) if raw.strip() else []
            except ValueError:
                raw = [raw]
        if not isinstance(raw, (list, tuple)):
            raw = [raw] if raw else []
        return ", ".join(str(s) for s in raw if s and s not in EXIF_ONLY_TAGS)

    def _write_sheet(ws, rows, tab_color):
        """Write the header, one styled row per item, and a totals row."""
        ws.sheet_properties.tabColor = tab_color
        for col_idx, (col_name, col_w) in enumerate(COLS, 1):
            cell = ws.cell(row=1, column=col_idx, value=col_name)
            cell.font = Font(name="Arial", bold=True, color=HEADER_FG, size=10)
            cell.fill = _fill(HEADER_BG)
            cell.alignment = Alignment(horizontal="center", vertical="center", wrap_text=True)
            cell.border = border
            ws.column_dimensions[get_column_letter(col_idx)].width = col_w
        ws.row_dimensions[1].height = 20
        ws.freeze_panes = "A2"

        for r_idx, item in enumerate(rows, 2):
            row_fill = _fill(ALT_BG if r_idx % 2 == 0 else "FFFFFF")
            _exif = item.get("exif") or {}
            values = [
                item.get("name", ""),
                item.get("cpr_count", 0),
                item.get("face_count", 0),
                # Fix: both branches used to be "", leaving the column blank.
                "Yes" if _exif.get("gps") else "",
                _special_labels(item.get("special_category", [])),
                _exif.get("author") or "",
                item.get("folder", ""),
                item.get("account_name", "") or item.get("source", ""),
                item.get("user_role", ""),
                _disposition(item),
                item.get("modified", ""),
                item.get("size_kb", ""),
                item.get("url", ""),
            ]
            for col_idx, val in enumerate(values, 1):
                is_url = col_idx == URL_COL and val
                cell = ws.cell(row=r_idx, column=col_idx, value=val)
                cell.font = Font(name="Arial", size=10,
                                 color="1155CC" if is_url else "000000",
                                 underline="single" if is_url else None)
                cell.fill = row_fill
                cell.alignment = Alignment(vertical="center", wrap_text=(col_idx == 1))
                cell.border = border
            ws.row_dimensions[r_idx].height = 16

        if rows:
            # Data occupies rows 2..len(rows)+1; the totals row follows it.
            tr = len(rows) + 2
            ws.cell(row=tr, column=1, value="Total").font = Font(name="Arial", bold=True, size=10)
            ws.cell(row=tr, column=2, value=f"=SUM(B2:B{tr-1})").font = Font(name="Arial", bold=True, size=10)
            for col_idx in range(1, len(COLS) + 1):
                ws.cell(row=tr, column=col_idx).fill = _fill("D0D8F0")
                ws.cell(row=tr, column=col_idx).border = border
        ws.auto_filter.ref = f"A1:{get_column_letter(len(COLS))}1"

    # Apply role filter — '' means all roles
    if role == "student":
        _items = [i for i in state.flagged_items if i.get("user_role") == "student"]
    elif role == "staff":
        _items = [i for i in state.flagged_items if i.get("user_role") != "student"]
    else:
        _items = list(state.flagged_items)

    # ── Summary sheet ────────────────────────────────────────────────────────
    wb = Workbook()
    ws_sum = wb.active
    ws_sum.title = "Summary"
    ws_sum.sheet_properties.tabColor = "1F3864"
    _role_label = {"student": " — Elever", "staff": " — Ansatte"}.get(role, "")
    ws_sum["A1"] = f"GDPRScanner — Export{_role_label}"
    ws_sum["A1"].font = Font(name="Arial", bold=True, size=14, color=HEADER_FG)
    ws_sum["A1"].fill = _fill(HEADER_BG)
    ws_sum.merge_cells("A1:D1")
    ws_sum["A1"].alignment = Alignment(horizontal="center", vertical="center")
    ws_sum.row_dimensions[1].height = 28

    ws_sum["A2"] = "Generated:"
    ws_sum["B2"] = now.strftime("%Y-%m-%d %H:%M")
    ws_sum["A3"] = "Total flagged items:"
    ws_sum["B3"] = len(_items)
    gps_items = [i for i in _items if (i.get("exif") or {}).get("gps")]
    if gps_items:
        ws_sum["A4"] = "Items with GPS data:"
        ws_sum["B4"] = len(gps_items)
    for cell in (ws_sum["A2"], ws_sum["A3"], ws_sum["A4"]):
        cell.font = Font(name="Arial", bold=True, size=10)
    for cell in (ws_sum["B2"], ws_sum["B3"], ws_sum["B4"]):
        cell.font = Font(name="Arial", size=10)
    ws_sum.column_dimensions["A"].width = 22
    ws_sum.column_dimensions["B"].width = 20

    for ci, h in enumerate(["Source", "Items", "Total CPR Hits"], 1):
        cell = ws_sum.cell(row=6, column=ci, value=h)
        cell.font = Font(name="Arial", bold=True, color=HEADER_FG, size=10)
        cell.fill = _fill(HEADER_BG)
        cell.border = border
        cell.alignment = Alignment(horizontal="center", vertical="center")
    ws_sum.row_dimensions[6].height = 18
    ws_sum.column_dimensions["C"].width = 16

    by_source: dict = {}
    for item in _items:
        by_source.setdefault(item.get("source_type", "other"), []).append(item)

    # Determine which sources were actually scanned (even if they found nothing)
    scanned_sources: set = set()
    if db is not None:
        try:
            scanned_sources = set(db.get_session_sources() or ())
        except Exception:
            logger.debug("excel export: could not read session sources", exc_info=True)
    # Fall back: treat any source that has items as scanned
    scanned_sources |= set(by_source.keys())

    sum_row = 7
    for src_key, (label, tab_bg) in SOURCE_MAP.items():
        if src_key not in scanned_sources:
            continue
        items = by_source.get(src_key, [])
        ws_sum.cell(row=sum_row, column=1, value=label).font = Font(name="Arial", size=10)
        ws_sum.cell(row=sum_row, column=2, value=len(items)).font = Font(name="Arial", size=10)
        ws_sum.cell(row=sum_row, column=3, value=sum(i.get("cpr_count", 0) for i in items)).font = Font(name="Arial", size=10)
        for ci in range(1, 4):
            ws_sum.cell(row=sum_row, column=ci).border = border
            ws_sum.cell(row=sum_row, column=ci).fill = _fill("EEF2FF" if sum_row % 2 == 0 else "FFFFFF")
        sum_row += 1

    # ── One sheet per source that has items ──────────────────────────────────
    for src_key, (label, tab_bg) in SOURCE_MAP.items():
        items = by_source.get(src_key, [])
        if not items:
            continue
        # Drop the leading emoji from the label to get the sheet title.
        clean_label = label.split(" ", 1)[1]
        _write_sheet(wb.create_sheet(title=clean_label), items, tab_bg)

    # GPS items sheet
    if gps_items:
        ws_gps = wb.create_sheet(title="GPS locations")
        ws_gps.sheet_properties.tabColor = "1A7A6E"
        GPS_COLS = [
            ("Name", 40), ("Latitude", 14), ("Longitude", 14),
            ("Maps link", 50), ("Account", 24), ("Date Modified", 14),
        ]
        for col_idx, (col_name, col_w) in enumerate(GPS_COLS, 1):
            cell = ws_gps.cell(row=1, column=col_idx, value=col_name)
            cell.font = Font(name="Arial", bold=True, color=HEADER_FG, size=10)
            cell.fill = _fill("1A7A6E")
            cell.alignment = Alignment(horizontal="center", vertical="center")
            cell.border = border
            ws_gps.column_dimensions[get_column_letter(col_idx)].width = col_w
        ws_gps.freeze_panes = "A2"
        for r_idx, item in enumerate(gps_items, 2):
            _gps = (item.get("exif") or {}).get("gps") or {}
            row_fill = _fill("E0F7F4" if r_idx % 2 == 0 else "FFFFFF")
            for col_idx, val in enumerate([
                item.get("name", ""),
                _gps.get("lat", ""),
                _gps.get("lon", ""),
                _gps.get("maps_url", ""),
                item.get("account_name", "") or item.get("source", ""),
                item.get("modified", ""),
            ], 1):
                is_link = col_idx == 4 and val
                cell = ws_gps.cell(row=r_idx, column=col_idx, value=val)
                cell.font = Font(name="Arial", size=10,
                                 color="1155CC" if is_link else "000000",
                                 underline="single" if is_link else None)
                cell.fill = row_fill
                cell.border = border
        ws_gps.auto_filter.ref = f"A1:{get_column_letter(len(GPS_COLS))}1"

    # External transfers sheet
    ext_items = [i for i in _items
                 if i.get("transfer_risk") in ("external-recipient", "external-share", "shared")]
    if ext_items:
        # _write_sheet already applies the tab colour.
        _write_sheet(wb.create_sheet(title="External transfers"), ext_items, "E74C3C")
        # Extra summary row, placed directly after the per-source rows.
        warn_font = dict(name="Arial", size=10, bold=True, color="E74C3C")
        ws_sum.cell(row=sum_row, column=1, value="⚠ External transfers").font = Font(**warn_font)
        ws_sum.cell(row=sum_row, column=2, value=len(ext_items)).font = Font(**warn_font)
        ws_sum.cell(row=sum_row, column=3, value=sum(i.get("cpr_count", 0) for i in ext_items)).font = Font(**warn_font)
        for ci in range(1, 4):
            ws_sum.cell(row=sum_row, column=ci).border = border
            ws_sum.cell(row=sum_row, column=ci).fill = _fill("FDE8E8")

    buf = io.BytesIO()
    wb.save(buf)
    _role_suffix = {"student": "_elever", "staff": "_ansatte"}.get(role, "")
    fname = f"m365_scan{_role_suffix}_{now.strftime('%Y%m%d_%H%M%S')}.xlsx"
    return buf.getvalue(), fname
@bp.route("/api/export_excel")
def export_excel():
    """Export flagged items as an Excel workbook with per-source tabs.

    Query parameters:
        role: optional role filter passed through to _build_excel_bytes()
            ('' = all, 'student', 'staff').

    Returns:
        The .xlsx file as an attachment, or a JSON ``{"error": ...}`` body
        with HTTP 500 when the workbook cannot be built.
    """
    # If in-memory list is empty (e.g. after page reload), try loading from DB.
    # Use get_session_items() so concurrent M365 + Google + File scans (each with
    # their own scan_id) are all included, not just the single latest scan_id.
    if not state.flagged_items and DB_OK:
        try:
            db = _get_db()
            db_items = db.get_session_items() if db else None
            if db_items:
                # Slice-assign so the shared list object is updated in place.
                state.flagged_items[:] = db_items
        except Exception:
            # Best effort: the export still proceeds with the (empty) in-memory
            # list, but the failure is now visible in the log instead of silent.
            logger.warning("export_excel: could not reload flagged items from DB",
                           exc_info=True)
    role = request.args.get("role", "")
    try:
        xl_bytes, fname = _build_excel_bytes(role=role)
        return Response(
            xl_bytes,
            mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            headers={"Content-Disposition": f"attachment; filename={fname}"}
        )
    except ImportError:
        return jsonify({"error": "openpyxl not installed — run: pip install openpyxl"}), 500
    except Exception as e:
        # logger.exception appends the active traceback to the log record.
        logger.exception("export_excel error: %s", e)
        return jsonify({"error": str(e)}), 500
# ── Article 30 report ─────────────────────────────────────────────────────────
def _build_article30_docx(role: str = "") -> tuple[bytes, str]:
"""Generate a GDPR Article 30 Register of Processing Activities as .docx.
Returns (bytes, filename). Strings are translated using the active state.LANG dict.
role: '' = all, 'student' = students only, 'staff' = staff + other."""
try:
from docx import Document as _Document
from docx.shared import Pt, RGBColor, Inches, Cm
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
except ImportError:
raise ImportError("python-docx not installed — run: pip install python-docx")
import datetime as _dt
# Translate helper — falls back to English default if key missing
def L(key: str, default: str = "") -> str:
    """Look up *key* in the active ``state.LANG`` dict; return *default* when absent."""
    translations = state.LANG
    return translations.get(key, default)
# ── Data ─────────────────────────────────────────────────────────────────
db = _get_db() if DB_OK else None
stats = db.get_stats() if db else {}
items = db.get_session_items() if db else list(state.flagged_items)
if role == "student":
items = [i for i in items if i.get("user_role") == "student"]
elif role == "staff":
items = [i for i in items if i.get("user_role") != "student"]
trend = db.get_trend(10) if db else []
overdue = db.get_overdue_items(5) if db else []
# Build account_id → display_name map from the scan's stored user_ids
# This lets us resolve GUIDs and "Microsoft Konto" placeholders that
# were stored in account_name before _resolve_display_name was applied.
_acct_map: dict[str, str] = {}
if db:
try:
scan_id = stats.get("scan_id") or db.latest_scan_id()
if scan_id:
row = db._connect().execute(
"SELECT user_count, options FROM scans WHERE id=?", (scan_id,)
).fetchone()
# user_ids are stored in the options JSON column
opts_json = json.loads(row["options"] or "{}") if row else {}
for u in opts_json.get("user_ids", []):
uid = u.get("id", "")
name = u.get("displayName", "")
if uid and name:
_acct_map[uid] = name
except Exception:
pass
# Also seed from in-memory state.flagged_items (catches current scan not yet in DB)
for item in state.flagged_items:
aid = item.get("account_id", "")
name = item.get("account_name", "")
if aid and name and not _GUID_RE.match(name.strip()):
_acct_map.setdefault(aid, name)
def _acct_label(item: dict) -> str:
    """Pick the most readable account label available for *item*.

    The ``_acct_map`` lookup (account_id -> display name) wins when it has an
    entry; otherwise the stored account_name is passed through
    ``_resolve_display_name`` together with the account_id.
    """
    account_id = item.get("account_id", "")
    if account_id and account_id in _acct_map:
        return _acct_map[account_id]
    stored_name = item.get("account_name", "")
    return _resolve_display_name(stored_name, account_id)
overdue_ids = {o["id"] for o in overdue}
now_str = _dt.datetime.now().strftime("%Y-%m-%d %H:%M")
date_str = _dt.datetime.now().strftime("%Y-%m-%d")
_role_suffix = {"student": "_elever", "staff": "_ansatte"}.get(role, "")
fname = f"article30{_role_suffix}_{date_str}.docx"
# Aggregate by source
by_source: dict = {}
for item in items:
st = item.get("source_type", "other")
by_source.setdefault(st, []).append(item)
# Determine which sources were actually scanned (may be empty-hit)
scanned_sources: set = set()
if db:
try:
scanned_sources = db.get_session_sources()
except Exception:
pass
scanned_sources |= set(by_source.keys())
SOURCE_LABELS = {
"email": "Exchange (Outlook)",
"onedrive": "OneDrive",
"sharepoint": "SharePoint",
"teams": "Teams",
"gmail": "Gmail",
"gdrive": "Google Drive",
"local": "Local files",
"smb": "Network / SMB",
"sftp": "SFTP",
}
# ── Colour palette ────────────────────────────────────────────────────────
DARK_BLUE = RGBColor(0x1F, 0x38, 0x64)
MID_BLUE = RGBColor(0x00, 0x78, 0xD4)
LIGHT_GREY = RGBColor(0xF2, 0xF2, 0xF2)
RED = RGBColor(0xC0, 0x39, 0x2B)
ORANGE = RGBColor(0xC5, 0x5A, 0x00)
WHITE = RGBColor(0xFF, 0xFF, 0xFF)
def _hex(c: RGBColor) -> str:
    """Format the three components of *c* as an upper-case ``RRGGBB`` hex string."""
    red, green, blue = c[0], c[1], c[2]
    return "{:02X}{:02X}{:02X}".format(red, green, blue)
# ── Document setup ────────────────────────────────────────────────────────
doc = _Document()
doc.core_properties.title = "GDPR Article 30 — Register of Processing Activities"
doc.core_properties.author = "GDPRScanner"
doc.core_properties.subject = "GDPR Compliance"
# Page margins — A4 with 2.5 cm margins
for section in doc.sections:
section.top_margin = Cm(2.5)
section.bottom_margin = Cm(2.5)
section.left_margin = Cm(2.5)
section.right_margin = Cm(2.5)
# ── Helper: set cell background ──────────────────────────────────────────
def _cell_bg(cell, hex_color: str):
    """Shade *cell* by appending a ``w:shd`` element with fill *hex_color*."""
    cell_props = cell._tc.get_or_add_tcPr()
    shading = OxmlElement("w:shd")
    for attr, attr_value in (("w:val", "clear"),
                             ("w:color", "auto"),
                             ("w:fill", hex_color)):
        shading.set(qn(attr), attr_value)
    cell_props.append(shading)
def _set_cell_border(cell, **kwargs):
    """Append a ``w:tcBorders`` element describing all four edges of *cell*.

    Each of the keyword arguments ``top``, ``left``, ``bottom`` and ``right``
    may be a dict overriding ``val``, ``sz``, ``space`` or ``color`` for that
    edge; omitted keys fall back to a thin single CCCCCC line.
    """
    fallbacks = (("val", "single"), ("sz", "4"), ("space", "0"), ("color", "CCCCCC"))
    cell_props = cell._tc.get_or_add_tcPr()
    borders = OxmlElement("w:tcBorders")
    for side in ("top", "left", "bottom", "right"):
        spec = kwargs.get(side, {})
        edge_el = OxmlElement(f"w:{side}")
        for attr, fallback in fallbacks:
            edge_el.set(qn(f"w:{attr}"), spec.get(attr, fallback))
        borders.append(edge_el)
    cell_props.append(borders)
def _para(text: str = "", bold=False, size=11, color=None,
          align=WD_ALIGN_PARAGRAPH.LEFT, space_before=0, space_after=6) -> object:
    """Append a paragraph to the document and return it.

    Spacing values and *size* are given in points. When *text* is empty the
    paragraph is added without a run (used as a vertical spacer); *color* is
    applied to the run only when truthy.
    """
    para = doc.add_paragraph()
    para.alignment = align
    fmt = para.paragraph_format
    fmt.space_before = Pt(space_before)
    fmt.space_after = Pt(space_after)
    if not text:
        return para
    run = para.add_run(text)
    run.bold = bold
    run.font.size = Pt(size)
    if color:
        run.font.color.rgb = color
    return para
def _heading(text: str, level: int = 1):
    """Append a dark-blue bold heading and return its paragraph.

    Level 1 headings use 16 pt text with 14 pt space before; every other
    level uses 13 pt text with 10 pt space before.
    """
    top_level = level == 1
    para = doc.add_heading(text, level=level)
    # Style the first run; add one if the heading came back without runs.
    run = para.runs[0] if para.runs else para.add_run(text)
    run.font.color.rgb = DARK_BLUE
    run.font.size = Pt(16 if top_level else 13)
    run.bold = True
    fmt = para.paragraph_format
    fmt.space_before = Pt(14 if top_level else 10)
    fmt.space_after = Pt(4)
    return para
def _kv(label: str, value: str, label_width=2.5, bold=False, highlight=False):
    """Append a one-row, two-cell "label | value" table and return it.

    Args:
        label: text for the left cell (always bold, 10 pt).
        value: text for the right cell (10 pt).
        label_width: width of the label cell, passed to ``Inches``.
        bold: render the value run in bold.
        highlight: use the FFF3E0 background and purple text for both cells.
    """
    tbl = doc.add_table(rows=1, cols=2)
    tbl.style = "Table Grid"
    c1, c2 = tbl.rows[0].cells
    _cell_bg(c1, "FFF3E0" if highlight else "F2F2F2")
    _cell_bg(c2, "FFF3E0" if highlight else "FFFFFF")
    c1.width = Inches(label_width)
    # NOTE(review): 16.0 looks like the content width in centimetres (the
    # document setup uses 2.5 cm margins), yet it is combined with an
    # inch-based label width and passed to Inches(). Kept as-is to preserve
    # the current layout — confirm the intended unit.
    c2.width = Inches(16.0 - label_width)
    p1 = c1.paragraphs[0]; p1.clear()
    r1 = p1.add_run(label); r1.bold = True; r1.font.size = Pt(10)
    p2 = c2.paragraphs[0]; p2.clear()
    r2 = p2.add_run(value); r2.font.size = Pt(10); r2.bold = bold
    if highlight:
        r1.font.color.rgb = RGBColor(0x6B, 0x00, 0x6B)
        r2.font.color.rgb = RGBColor(0x6B, 0x00, 0x6B)
    for cell in (c1, c2):
        _set_cell_border(cell, top={"color": "E0E0E0"}, bottom={"color": "E0E0E0"},
                         left={"color": "E0E0E0"}, right={"color": "E0E0E0"})
    return tbl
# ── Cover page ────────────────────────────────────────────────────────────
_para()
title_p = doc.add_paragraph()
title_p.alignment = WD_ALIGN_PARAGRAPH.CENTER
title_p.paragraph_format.space_before = Pt(40)
r = title_p.add_run(L("a30_title", "GDPR Article 30"))
r.bold = True; r.font.size = Pt(28); r.font.color.rgb = DARK_BLUE
sub_p = doc.add_paragraph()
sub_p.alignment = WD_ALIGN_PARAGRAPH.CENTER
r2 = sub_p.add_run(L("a30_subtitle", "Register of Processing Activities"))
r2.font.size = Pt(16); r2.font.color.rgb = MID_BLUE
_para()
meta_p = doc.add_paragraph()
meta_p.alignment = WD_ALIGN_PARAGRAPH.CENTER
r3 = meta_p.add_run(f"{L('a30_generated','Generated')}: {now_str} · GDPRScanner")
r3.font.size = Pt(10); r3.font.color.rgb = RGBColor(0x88, 0x88, 0x88)
# Divider line
_para()
div = doc.add_paragraph()
div_fmt = div.paragraph_format
div_fmt.space_after = Pt(20)
pPr = div._p.get_or_add_pPr()
pBdr = OxmlElement("w:pBdr")
bot = OxmlElement("w:bottom")
bot.set(qn("w:val"), "single"); bot.set(qn("w:sz"), "6")
bot.set(qn("w:color"), _hex(MID_BLUE))
pBdr.append(bot); pPr.append(pBdr)
doc.add_page_break()
# ── Section 1: Summary ────────────────────────────────────────────────────
_heading(L("a30_s1", "1. Summary"))
total_items = len(items)
total_cpr = sum(i.get("cpr_count", 0) for i in items)
special_items = [i for i in items if i.get("special_category") and
i["special_category"] not in ("[]", "", None, [])]
photo_items = [i for i in items if i.get("face_count", 0) > 0]
gps_items = [i for i in items if "gps_location" in (i.get("special_category") or [])]
exif_pii_items = [i for i in items if "exif_pii" in (i.get("special_category") or [])]
unique_subj = stats.get("unique_subjects", 0)
total_scanned = stats.get("total_scanned", 0)
scan_date = _dt.datetime.fromtimestamp(
stats.get("started_at", 0)).strftime("%Y-%m-%d %H:%M") if stats.get("started_at") else ""
special_items = [i for i in items if i.get("special_category") and
i["special_category"] not in ("[]", "", None, [])]
_kv(L("a30_scan_date", "Scan date"), scan_date)
_kv(L("a30_items_scanned", "Items scanned"), str(total_scanned))
_kv(L("a30_flagged", "Flagged items"), str(total_items))
_kv(L("a30_cpr_hits", "Total CPR hits"), str(total_cpr))
_kv(L("a30_data_subjects", "Estimated data subjects"), str(unique_subj))
_kv(L("a30_overdue", "Overdue items (>5 yrs)"), str(len(overdue_ids)))
if gps_items:
_kv(L("a30_gps_items", "Items with GPS location data (Art. 4 — location = personal data)"),
str(len(gps_items)))
if exif_pii_items:
_kv(L("a30_exif_pii_items", "Items with EXIF PII (author, description, keywords)"),
str(len(exif_pii_items)))
if photo_items:
total_faces = sum(i.get("face_count", 0) for i in photo_items)
_kv(L("a30_photo_items", "Photos with detected faces (Art. 9 biometric)"),
f"{len(photo_items)} items / {total_faces} faces")
_para(L("a30_photo_note",
"Photographs of identifiable persons are biometric data under Art. 9 GDPR. "
"Retention requires a documented legal basis under Art. 9(2). "
"For school photographs of pupils under 15, parental consent is required "
"(Databeskyttelsesloven §6). See Datatilsynet guidance on school photography."),
size=9, space_after=4)
if special_items:
_kv(L("a30_special_cat", "Art. 9 special category items"),
str(len(special_items)))
_para(L("a30_special_cat_note",
"These items contain health, criminal, biometric, religious, ethnic, "
"trade union, political, or sexual orientation data. "
"An explicit legal basis (Art. 9(2)) and possibly a DPIA (Art. 35) is required."),
size=9, space_after=4)
_para()
# Per-source breakdown table
_para(L("a30_by_source", "Breakdown by source"), bold=True, size=11, space_before=10)
src_tbl = doc.add_table(rows=1, cols=5)
src_tbl.style = "Table Grid"
hdr_cells = src_tbl.rows[0].cells
for cell, txt in zip(hdr_cells, [L("a30_col_source","Source"), L("a30_col_items","Items"),
L("a30_col_cpr","CPR hits"), L("a30_col_overdue","Overdue"),
L("a30_col_special","Art. 9")]):
_cell_bg(cell, _hex(DARK_BLUE))
p = cell.paragraphs[0]; p.clear()
r = p.add_run(txt); r.bold = True
r.font.size = Pt(10); r.font.color.rgb = WHITE
for src_key in ("email", "onedrive", "sharepoint", "teams", "gmail", "gdrive", "local", "smb", "sftp"):
if src_key not in scanned_sources:
continue
src_items = by_source.get(src_key, [])
row = src_tbl.add_row().cells
n_ov = sum(1 for i in src_items if i.get("id") in overdue_ids)
n_cpr = sum(i.get("cpr_count", 0) for i in src_items)
n_spec = sum(1 for i in src_items if i.get("special_category") and
i["special_category"] not in ("[]", "", None, []))
for cell, val in zip(row, [
SOURCE_LABELS.get(src_key, src_key),
str(len(src_items)), str(n_cpr), str(n_ov),
str(n_spec) if n_spec else ""
]):
p = cell.paragraphs[0]; p.clear()
r = p.add_run(val); r.font.size = Pt(10)
if val != "0" and cell == row[3]:
r.font.color.rgb = ORANGE
if n_spec and cell == row[4]:
r.font.color.rgb = RGBColor(0x7B, 0x00, 0x82)
r.bold = True
# ── Section 2: Data categories ────────────────────────────────────────────
doc.add_page_break()
_heading(L("a30_s2", "2. Personal Data Categories Identified"))
_para(L("a30_s2_intro", "The following categories of personal data were detected during scanning."),
size=10, space_after=8)
# Aggregate PII from DB or from items
pii_totals: dict = {}
if db:
rows = db._connect().execute(
"""SELECT pii_type, SUM(hit_count) FROM pii_hits
WHERE scan_id=? GROUP BY pii_type""",
(stats.get("scan_id") or db.latest_scan_id() or 0,)
).fetchall()
for pii_type, count in rows:
pii_totals[pii_type] = count
PII_LABELS = {
"PHONE": L("a30_pii_phone", "Phone numbers"),
"EMAIL": L("a30_pii_email", "Email addresses"),
"IBAN": L("a30_pii_iban", "IBAN bank numbers"),
"BANK_ACCOUNT": L("a30_pii_bank", "Bank account numbers"),
"NAME": L("a30_pii_name", "Personal names (NER)"),
"ADDRESS": L("a30_pii_address", "Addresses (NER)"),
"ORG": L("a30_pii_org", "Organisations (NER)"),
}
pii_tbl = doc.add_table(rows=1, cols=3)
pii_tbl.style = "Table Grid"
for cell, txt in zip(pii_tbl.rows[0].cells,
[L("a30_col_category","Data category"), L("a30_col_count","Count"), L("a30_col_gdpr_class","GDPR classification")]):
_cell_bg(cell, _hex(DARK_BLUE))
p = cell.paragraphs[0]; p.clear()
r = p.add_run(txt); r.bold = True
r.font.size = Pt(10); r.font.color.rgb = WHITE
# CPR row first — always
cpr_row = pii_tbl.add_row().cells
for cell, val in zip(cpr_row, [L("a30_cpr_label", "CPR numbers (Danish personal ID)"), str(total_cpr),
L("a30_cpr_class", "Art. 9 — national identifier")]):
p = cell.paragraphs[0]; p.clear()
r = p.add_run(val); r.font.size = Pt(10)
_cpr_class = L("a30_cpr_class", "Art. 9 — national identifier")
if val == _cpr_class:
r.font.color.rgb = RED; r.bold = True
for pii_type, label in PII_LABELS.items():
count = pii_totals.get(pii_type, 0)
if not count:
continue
cls = L("a30_pii_class_9", "Art. 9 — health/sensitive") if pii_type in ("NAME", "ADDRESS") else L("a30_pii_class_4", "Art. 4 — personal data")
row = pii_tbl.add_row().cells
for cell, val in zip(row, [label, str(count), cls]):
p = cell.paragraphs[0]; p.clear()
r = p.add_run(val); r.font.size = Pt(10)
# ── Section 3: Data inventory ─────────────────────────────────────────────
doc.add_page_break()
_heading(L("a30_s3", "3. Data Inventory"))
_para(L("a30_s3_intro", "All flagged items are listed below with location, retention status, and compliance disposition."),
size=10, space_after=8)
# Split by user role for separate presentation
student_items = [i for i in items if i.get("user_role") == "student"]
staff_items = [i for i in items if i.get("user_role") != "student"]
_disp_map = {
"unreviewed": L("a30_disp_unreviewed", "Unreviewed"),
"retain-legal": L("a30_disp_retain_legal", "Retain — Legal obligation"),
"retain-legitimate": L("a30_disp_retain_legit", "Retain — Legitimate interest"),
"retain-contract": L("a30_disp_retain_contract", "Retain — Contract"),
"delete-scheduled": L("a30_disp_delete_sched", "Delete — Scheduled"),
"deleted": L("a30_disp_deleted", "Deleted"),
"personal-use": L("a30_disp_personal_use", "Personal use — out of GDPR scope (Art. 2(2)(c))"),
}
def _inv_table(tbl_items: list):
    """Append the data-inventory table for *tbl_items* to the document.

    Rows are sorted with overdue items first, then by descending CPR count,
    and capped at 500; when more items exist a grey note states how many were
    left out. Overdue rows get a tinted background and an orange
    "Modified" value.
    """
    MAX_ROWS = 500
    NAME_LEN = 60

    tbl = doc.add_table(rows=1, cols=6)
    tbl.style = "Table Grid"
    col_hdrs = [L("a30_col_name","Name / Subject"), L("a30_col_source","Source"),
                L("a30_col_account","Account"), L("a30_col_modified","Modified"),
                L("a30_col_cpr_short","CPR"), L("a30_col_disp","Disposition")]
    for cell, txt in zip(tbl.rows[0].cells, col_hdrs):
        _cell_bg(cell, _hex(DARK_BLUE))
        p = cell.paragraphs[0]; p.clear()
        r = p.add_run(txt); r.bold = True
        r.font.size = Pt(9); r.font.color.rgb = WHITE

    # ``or 0`` keeps the sort working when cpr_count is stored as None.
    sorted_tbl = sorted(
        tbl_items,
        key=lambda x: (0 if x.get("id") in overdue_ids else 1,
                       -(x.get("cpr_count") or 0)))

    for idx, item in enumerate(sorted_tbl[:MAX_ROWS]):
        # .get() instead of item["id"]: an item without an id is listed as
        # unreviewed rather than aborting the whole report with KeyError.
        disp_rec = db.get_disposition(item.get("id", "")) if db else None
        raw_disp = (disp_rec.get("status") or "unreviewed") if disp_rec else "unreviewed"
        disp_str = _disp_map.get(raw_disp, raw_disp.replace("-", " ").title())
        is_ov = item.get("id") in overdue_ids
        row = tbl.add_row().cells

        name = str(item.get("name") or "")
        # Fix: the truncation marker used to be "" on both branches.
        short_name = name[:NAME_LEN] + ("…" if len(name) > NAME_LEN else "")
        vals = [
            short_name,
            SOURCE_LABELS.get(item.get("source_type", ""), item.get("source_type", "")),
            _acct_label(item),
            str(item.get("modified") or ""),
            str(item.get("cpr_count", 0)),
            disp_str,
        ]
        bg = "FFF8F0" if is_ov else ("FFFFFF" if idx % 2 == 0 else "F8F8F8")
        for col, (cell, val) in enumerate(zip(row, vals)):
            _cell_bg(cell, bg)
            p = cell.paragraphs[0]; p.clear()
            r = p.add_run(val); r.font.size = Pt(8)
            # Column 3 is "Modified".
            if is_ov and col == 3:
                r.font.color.rgb = ORANGE

    if len(tbl_items) > MAX_ROWS:
        _para(f"{len(tbl_items) - MAX_ROWS} {L('a30_more_items', 'additional items not shown.')}",
              size=9, color=RGBColor(0x88, 0x88, 0x88), space_before=4)
if staff_items:
if student_items:
_para(L("a30_inv_staff", "👔 Staff / Faculty"), bold=True, size=11, space_before=6, space_after=4)
_inv_table(staff_items)
if student_items:
_para(L("a30_inv_students", "🎓 Students"), bold=True, size=11, space_before=14, space_after=2)
_para(L("a30_student_consent_note",
"Note: Student accounts in Danish folkeskole (pupils under age 15) require parental "
"consent for processing of personal data under Databeskyttelsesloven §6. "
"Items in student accounts must not be auto-deleted — any action requires "
"review by school administration and, for pupils under 15, notification of parents "
"or guardians as rights holders under GDPR Article 8."),
size=9, color=RGBColor(0x88, 0x44, 0x00), space_after=6)
_inv_table(student_items)
# ── Section 4: Retention analysis ────────────────────────────────────────
if overdue:
doc.add_page_break()
_heading(L("a30_s4", "4. Retention Analysis"))
_para(L("a30_s4_intro", "The following items exceed the 5-year retention threshold and should be reviewed for deletion under GDPR Article 5(1)(e) — storage limitation."),
size=10, space_after=8)
ret_tbl = doc.add_table(rows=1, cols=5)
ret_tbl.style = "Table Grid"
for cell, txt in zip(ret_tbl.rows[0].cells,
[L("a30_col_name","Name"), L("a30_col_source","Source"), L("a30_col_account","Account"), L("a30_col_modified","Modified"), L("a30_col_cpr","CPR hits")]):
_cell_bg(cell, _hex(ORANGE))
p = cell.paragraphs[0]; p.clear()
r = p.add_run(txt); r.bold = True
r.font.size = Pt(9); r.font.color.rgb = WHITE
for item in overdue[:200]:
row = ret_tbl.add_row().cells
for cell, val in zip(row, [
item.get("name", "")[:55],
SOURCE_LABELS.get(item.get("source_type", ""), ""),
_acct_label(item),
item.get("modified", ""),
str(item.get("cpr_count", 0)),
]):
p = cell.paragraphs[0]; p.clear()
r = p.add_run(val); r.font.size = Pt(8)
# ── Section 5: Scan history ───────────────────────────────────────────────
if trend:
sec_num = "5" if overdue else "4"
doc.add_page_break()
_heading(f"{sec_num}. {L('a30_s5','Compliance Trend').split('. ',1)[-1]}")
_para(L("a30_s5_intro", "Flagged item counts over the last scans (most recent first)."),
size=10, space_after=8)
trend_tbl = doc.add_table(rows=1, cols=4)
trend_tbl.style = "Table Grid"
for cell, txt in zip(trend_tbl.rows[0].cells,
[L("a30_col_scan_date","Scan date"), L("a30_col_flagged","Flagged"), L("a30_col_overdue","Overdue"), L("a30_col_scan_type","Scan type")]):
_cell_bg(cell, _hex(DARK_BLUE))
p = cell.paragraphs[0]; p.clear()
r = p.add_run(txt); r.bold = True
r.font.size = Pt(9); r.font.color.rgb = WHITE
for t in reversed(trend):
row = trend_tbl.add_row().cells
for cell, val in zip(row, [
t.get("scan_date", ""),
str(t.get("flagged_count", 0)),
str(t.get("overdue_count", 0)),
L("a30_scan_delta", "Delta") if t.get("delta") else L("a30_scan_full", "Full"),
]):
p = cell.paragraphs[0]; p.clear()
r = p.add_run(val); r.font.size = Pt(9)
# ── Section: Deletion audit log ───────────────────────────────────────────
del_log = db.get_deletion_log(limit=500) if db else []
del_stats = db.deletion_log_stats() if db else {}
# Running section counter — starts at 3 (summary, categories, inventory always present)
last_sec = 3
last_sec += 1 if overdue else 0 # retention analysis
last_sec += 1 if trend else 0 # compliance trend
if del_log:
del_sec = last_sec
last_sec += 1
doc.add_page_break()
_heading(f"{del_sec}. {L('a30_s_dellog', 'Deletion Audit Log')}")
_para(L("a30_dellog_intro",
f"A total of {del_stats.get('total', len(del_log))} item(s) containing personal data "
f"have been deleted via GDPRScanner. "
f"CPR hits removed: {del_stats.get('cpr_hits_deleted', 0)}. "
f"This log satisfies the accountability obligation under GDPR Article 5(2)."),
size=10, space_after=8)
# Summary by reason
by_reason = del_stats.get("by_reason", {})
if by_reason:
_para(L("a30_dellog_by_reason", "Deletions by reason"), bold=True, size=10, space_before=4, space_after=4)
reason_tbl = doc.add_table(rows=1, cols=2)
reason_tbl.style = "Table Grid"
for cell, txt in zip(reason_tbl.rows[0].cells,
[L("a30_col_reason", "Reason"), L("a30_col_count", "Count")]):
_cell_bg(cell, _hex(DARK_BLUE))
p = cell.paragraphs[0]; p.clear()
r = p.add_run(txt); r.bold = True
r.font.size = Pt(9); r.font.color.rgb = WHITE
REASON_LABELS = {
"manual": L("a30_reason_manual", "Manual (individual card delete)"),
"bulk": L("a30_reason_bulk", "Bulk delete"),
"retention": L("a30_reason_retention", "Retention policy enforcement"),
"data-subject-request": L("a30_reason_dsr", "Data subject erasure request (Art. 17)"),
}
for reason, count in sorted(by_reason.items()):
row = reason_tbl.add_row().cells
for cell, val in zip(row, [REASON_LABELS.get(reason, reason), str(count)]):
p = cell.paragraphs[0]; p.clear()
r = p.add_run(val); r.font.size = Pt(9)
# Full log table
_para(L("a30_dellog_records", "Deletion records"), bold=True, size=10, space_before=10, space_after=4)
log_tbl = doc.add_table(rows=1, cols=7)
log_tbl.style = "Table Grid"
for cell, txt in zip(log_tbl.rows[0].cells, [
L("a30_col_deleted_at", "Deleted at"),
L("a30_col_name", "Name"),
L("a30_col_source", "Source"),
L("a30_col_account", "Account"),
L("a30_col_cpr", "CPR hits"),
L("a30_col_reason", "Reason"),
L("a30_col_deleted_by", "Deleted by"),
]):
_cell_bg(cell, _hex(DARK_BLUE))
p = cell.paragraphs[0]; p.clear()
r = p.add_run(txt); r.bold = True
r.font.size = Pt(8); r.font.color.rgb = WHITE
for idx, entry in enumerate(del_log):
ts = _dt.datetime.fromtimestamp(entry.get("deleted_at", 0)).strftime("%Y-%m-%d %H:%M")
bg = "FFFFFF" if idx % 2 == 0 else "F8F8F8"
row = log_tbl.add_row().cells
for cell, val in zip(row, [
ts,
entry.get("item_name", "")[:40],
SOURCE_LABELS.get(entry.get("source_type", ""), entry.get("source_type", "")),
_acct_map.get(entry.get("account_id", "")) or _resolve_display_name(entry.get("account_name", ""), entry.get("account_id", "")),
str(entry.get("cpr_count", 0)),
REASON_LABELS.get(entry.get("reason", ""), entry.get("reason", "")),
entry.get("deleted_by", "") or "",
]):
_cell_bg(cell, bg)
p = cell.paragraphs[0]; p.clear()
r = p.add_run(val); r.font.size = Pt(7)
# ── Section: Article 9 special categories ────────────────────────────────
if special_items:
last_sec += 1
doc.add_page_break()
_heading(f"{last_sec}. {L('a30_s_special', 'Special Category Data (Article 9)')}")
_para(L("a30_special_intro",
f"{len(special_items)} item(s) were detected as containing special category "
f"data under GDPR Article 9. These require an explicit legal basis beyond "
f"Article 6, and processing should be covered by a Data Protection Impact "
f"Assessment (DPIA) under Article 35."),
size=10, space_after=8)
# Category breakdown table
from collections import Counter as _Counter
cat_counts: dict = _Counter()
for item in special_items:
sc = item.get("special_category", [])
if isinstance(sc, str):
import json as _scjson
try:
sc = _scjson.loads(sc)
except Exception:
sc = []
for c in sc:
cat_counts[c] += 1
if cat_counts:
_para(L("a30_special_by_cat", "Detected categories"), bold=True, size=10,
space_before=4, space_after=4)
cat_tbl = doc.add_table(rows=1, cols=2)
cat_tbl.style = "Table Grid"
for cell, txt in zip(cat_tbl.rows[0].cells,
[L("a30_col_category", "Category"),
L("a30_col_count", "Items")]):
_cell_bg(cell, _hex(DARK_BLUE))
p = cell.paragraphs[0]; p.clear()
r = p.add_run(txt); r.bold = True
r.font.size = Pt(9); r.font.color.rgb = WHITE
CAT_LABELS = {
"health": L("a30_cat_health", "Health data (Art. 9)"),
"mental_health": L("a30_cat_mental", "Mental health (Art. 9)"),
"criminal": L("a30_cat_criminal", "Criminal records (Art. 10)"),
"trade_union": L("a30_cat_union", "Trade union membership (Art. 9)"),
"religion": L("a30_cat_religion", "Religious beliefs (Art. 9)"),
"ethnicity": L("a30_cat_ethnicity", "Racial/ethnic origin (Art. 9)"),
"political": L("a30_cat_political", "Political opinions (Art. 9)"),
"biometric": L("a30_cat_biometric", "Biometric data (Art. 9)"),
"sexual_orientation": L("a30_cat_sexual", "Sexual orientation (Art. 9)"),
}
for cat, count in sorted(cat_counts.items(), key=lambda x: -x[1]):
row = cat_tbl.add_row().cells
for cell, val in zip(row, [CAT_LABELS.get(cat, cat), str(count)]):
p = cell.paragraphs[0]; p.clear()
r = p.add_run(val); r.font.size = Pt(9)
# Item list (capped at 50)
_para(L("a30_special_items", "Affected items (up to 50)"), bold=True, size=10,
space_before=10, space_after=4)
sc_tbl = doc.add_table(rows=1, cols=5)
sc_tbl.style = "Table Grid"
for cell, txt in zip(sc_tbl.rows[0].cells, [
L("a30_col_name", "Name"),
L("a30_col_account", "Account"),
L("a30_col_source", "Source"),
L("a30_col_category", "Category"),
L("a30_col_cpr", "CPR hits"),
]):
_cell_bg(cell, _hex(DARK_BLUE))
p = cell.paragraphs[0]; p.clear()
r = p.add_run(txt); r.bold = True
r.font.size = Pt(8); r.font.color.rgb = WHITE
for idx, item in enumerate(special_items[:50]):
bg = "FFFFFF" if idx % 2 == 0 else "FFF0F8"
sc = item.get("special_category", [])
if isinstance(sc, str):
try:
import json as _scj2; sc = _scj2.loads(sc)
except Exception:
sc = []
row = sc_tbl.add_row().cells
for cell, val in zip(row, [
item.get("name", "")[:35],
_acct_map.get(item.get("account_id", "")) or item.get("account_name", ""),
SOURCE_LABELS.get(item.get("source_type", ""), item.get("source_type", "")),
", ".join(CAT_LABELS.get(c, c) for c in sc)[:45],
str(item.get("cpr_count", 0)),
]):
_cell_bg(cell, bg)
p = cell.paragraphs[0]; p.clear()
r = p.add_run(val); r.font.size = Pt(7)
# ── Section: Photographs / biometric data (#9) ───────────────────────────
if photo_items:
last_sec += 1
doc.add_page_break()
_heading(f"{last_sec}. {L('a30_s_photos', 'Photographs and Biometric Data (Article 9)')}")
total_faces = sum(i.get("face_count", 0) for i in photo_items)
_para(L("a30_photo_intro",
f"{len(photo_items)} image file(s) containing {total_faces} detected face(s) "
f"were found in the scan. Photographs of identifiable persons constitute "
f"biometric data under GDPR Article 9 and are subject to the same "
f"heightened protection as health or criminal records data."),
size=10, space_after=8)
_para(L("a30_photo_guidance", "Retention guidance"), bold=True, size=10,
space_before=4, space_after=4)
for line in [
L("a30_photo_g1",
"Photos may only be retained while the original purpose remains valid "
"(Art. 5(1)(b) — purpose limitation)."),
L("a30_photo_g2",
"Pupils under 15 require parental consent (Databeskyttelsesloven §6). "
"Consent must be freely given, specific, and documented."),
L("a30_photo_g3",
"Photos on public-facing websites must be removed promptly after a person "
"leaves the organisation or withdraws consent (Art. 17 — right to erasure)."),
L("a30_photo_g4",
"Historical/archive use may justify longer retention under Art. 89 only "
"with specific safeguards and case-by-case assessment."),
]:
p = doc.add_paragraph(style="List Bullet")
r = p.add_run(line); r.font.size = Pt(9)
# GPS items sub-section
if gps_items:
_para(L("a30_gps_title", "Items with GPS location data"), bold=True, size=10,
space_before=10, space_after=4)
_para(L("a30_gps_intro",
"The following files contain GPS coordinates embedded in EXIF metadata. "
"Location data constitutes personal data under Art. 4 GDPR. For photos of children "
"or staff, GPS data may reveal sensitive patterns (home address, health institution, "
"religious site). Consider stripping EXIF before sharing or publishing."),
size=9, space_after=6)
gps_tbl = doc.add_table(rows=1, cols=4)
gps_tbl.style = "Table Grid"
for cell, txt in zip(gps_tbl.rows[0].cells, [
L("a30_col_name", "Name"),
L("a30_gps_col_lat", "Latitude"),
L("a30_gps_col_lon", "Longitude"),
L("a30_col_date", "Modified"),
]):
_cell_bg(cell, _hex(DARK_BLUE))
p = cell.paragraphs[0]; p.clear()
r = p.add_run(txt); r.bold = True
r.font.size = Pt(8); r.font.color.rgb = WHITE
for idx, item in enumerate(gps_items[:50]):
bg = "FFFFFF" if idx % 2 == 0 else "E8F7FF"
row = gps_tbl.add_row().cells
exif = item.get("exif") or {}
gps = exif.get("gps") or {}
for cell, val in zip(row, [
item.get("name", "")[:40],
str(gps.get("lat", ""))[:12],
str(gps.get("lon", ""))[:12],
item.get("modified", ""),
]):
_cell_bg(cell, bg)
p = cell.paragraphs[0]; p.clear()
r = p.add_run(val); r.font.size = Pt(7)
# Photo item list (capped at 50)
_para(L("a30_photo_items", "Detected photo items (up to 50)"), bold=True, size=10,
space_before=10, space_after=4)
ph_tbl = doc.add_table(rows=1, cols=6)
ph_tbl.style = "Table Grid"
for cell, txt in zip(ph_tbl.rows[0].cells, [
L("a30_col_name", "Name"),
L("a30_col_account", "Account"),
L("a30_col_source", "Source"),
L("a30_photo_col_faces", "Faces"),
L("a30_gps_col", "GPS"),
L("a30_col_date", "Modified"),
]):
_cell_bg(cell, _hex(DARK_BLUE))
p = cell.paragraphs[0]; p.clear()
r = p.add_run(txt); r.bold = True
r.font.size = Pt(8); r.font.color.rgb = WHITE
for idx, item in enumerate(photo_items[:50]):
bg = "FFFFFF" if idx % 2 == 0 else "E8F7FF"
row = ph_tbl.add_row().cells
for cell, val in zip(row, [
item.get("name", "")[:40],
_acct_map.get(item.get("account_id", "")) or item.get("account_name", ""),
SOURCE_LABELS.get(item.get("source_type", ""), item.get("source_type", "")),
str(item.get("face_count", 0)),
"" if (item.get("exif") or {}).get("gps") else "",
item.get("modified", ""),
]):
_cell_bg(cell, bg)
p = cell.paragraphs[0]; p.clear()
r = p.add_run(val); r.font.size = Pt(7)
# ── Section: Methodology ─────────────────────────────────────────────────
# last_sec already reflects all optional sections that were added above
doc.add_page_break()
_heading(f"{last_sec}. {L('a30_s6_short', 'Methodology and Legal Basis')}")
_para(L("a30_method_title", "Scanning methodology"), bold=True, size=11, space_before=6, space_after=4)
for line in [
L("a30_method_1", "CPR numbers are detected using pattern matching against the official Danish CPR format (DDMMYY-XXXX)."),
L("a30_method_2", "Additional personal data (phone numbers, email addresses, IBANs, bank accounts, names, addresses, and organisations) is detected using regular expressions and spaCy NER."),
L("a30_method_3", "CPR numbers stored in this document's database are SHA-256 hashed and never stored in plaintext."),
L("a30_method_4", "Scanning covers Exchange mailboxes (all folders including Sent Items), OneDrive, SharePoint, and Microsoft Teams channel files via the Microsoft Graph API. When connected, Google Workspace scanning covers Gmail and Google Drive via a service account with domain-wide delegation. Local and network (SMB) file shares are scanned directly."),
L("a30_method_5", "When photo scanning is enabled, image files are analysed using OpenCV Haar cascade face detection to identify photographs of persons (Art. 9 biometric data)."),
]:
p = doc.add_paragraph(style="List Bullet")
r = p.add_run(line); r.font.size = Pt(10)
_para(L("a30_gdpr_title", "GDPR Articles referenced"), bold=True, size=11, space_before=10, space_after=4)
for line in [
L("a30_gdpr_1", "Article 5(1)(c) — Data minimisation: only necessary data should be retained"),
L("a30_gdpr_2", "Article 5(1)(e) — Storage limitation: data must not be kept longer than necessary"),
L("a30_gdpr_3", "Article 9 — Special categories: health, criminal, trade union, and similar data require explicit legal basis"),
L("a30_gdpr_4", "Article 15 — Right of access: data subjects may request information about their data"),
L("a30_gdpr_5", "Article 17 — Right to erasure: data subjects may request deletion"),
L("a30_gdpr_6", "Article 30 — Records of processing activities: this document satisfies the obligation"),
]:
p = doc.add_paragraph(style="List Bullet")
r = p.add_run(line); r.font.size = Pt(10)
_para(f"{L('a30_generated','Generated')}: {now_str} · GDPRScanner · {L('a30_confidential','Confidential — GDPR compliance document')}",
size=9, color=RGBColor(0x88, 0x88, 0x88), align=WD_ALIGN_PARAGRAPH.CENTER, space_before=20)
# ── Serialise ─────────────────────────────────────────────────────────────
buf = io.BytesIO()
doc.save(buf)
buf.seek(0)
return buf.read(), fname
@bp.route("/api/export_article30")
def export_article30():
    """Generate and return an Article 30 Word document.

    Query parameters:
        role: optional role filter, passed unchanged to
            ``_build_article30_docx`` (defaults to ``""``).

    Returns:
        The .docx bytes as an attachment on success. A JSON
        ``{"error": ...}`` body with status 400 when there are no flagged
        items to export, or status 500 when building the document fails.
    """
    # Pre-populate the in-memory list from the DB session so that
    # _build_article30_docx() has state.flagged_items available.
    if not state.flagged_items and DB_OK:
        try:
            db = _get_db()
            if db:
                db_items = db.get_session_items()
                if db_items:
                    # Slice assignment updates the existing list object in
                    # place instead of rebinding the attribute.
                    state.flagged_items[:] = db_items
        except Exception:
            # Best effort: the export can still proceed (or report "no
            # results") without the DB, but the failure must not vanish.
            logger.warning(
                "export_article30: could not load session items from DB",
                exc_info=True,
            )
    if not state.flagged_items:
        return jsonify({"error": "No results to export — run a scan first"}), 400
    try:
        role = request.args.get("role", "")
        docx_bytes, fname = _build_article30_docx(role=role)
        return Response(
            docx_bytes,
            mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            headers={"Content-Disposition": f"attachment; filename={fname}"},
        )
    except Exception as e:
        # ImportError is covered here as well; it yields the same 500
        # response as before and is now logged like every other failure.
        logger.exception("export_article30 error: %s", e)
        return jsonify({"error": str(e)}), 500
def delete_item():
    """Delete a single flagged item. Returns {ok, error}.

    Request JSON fields:
        id: identifier of the item to delete (required).
        source_type: ``"email"`` routes the call to ``delete_message``.
        account_id: owning account; falls back to ``"me"`` when empty.
        drive_id: when set (and the item is not an email) the item is
            deleted via ``delete_drive_item``; otherwise
            ``delete_drive_item_for_user`` is used.

    Returns:
        ``{"ok": True}`` on success. ``{"ok": False, "error": ...}`` with
        status 401 when no connector is available, status 400 when ``id``
        is missing, and the default status code for every other failure.
    """
    # NOTE(review): no @bp.route decorator is visible directly above this
    # function in this part of the file — confirm the route is registered.
    if not state.connector:
        return jsonify({"ok": False, "error": "not authenticated"}), 401
    data = request.get_json() or {}
    item_id = data.get("id", "")
    source_type = data.get("source_type", "")
    account_id = data.get("account_id", "") or "me"
    drive_id = data.get("drive_id", "")
    if not item_id:
        return jsonify({"ok": False, "error": "id required"}), 400
    try:
        if source_type == "email":
            ok = state.connector.delete_message(account_id, item_id)
        elif drive_id:
            ok = state.connector.delete_drive_item(drive_id, item_id)
        else:
            ok = state.connector.delete_drive_item_for_user(account_id, item_id)
        if ok or ok is False:  # False = already gone, treat as success
            # Retrieve full item for audit log before removing it
            item_meta = next(
                (x for x in state.flagged_items if x.get("id") == item_id), {}
            )
            state.flagged_items = [
                x for x in state.flagged_items if x.get("id") != item_id
            ]
            _db = _get_db() if DB_OK else None
            if _db:
                try:
                    _db.log_deletion(
                        item_meta or {"id": item_id, "source_type": source_type},
                        reason="manual",
                    )
                    _db.delete_item_record(item_id)
                except Exception:
                    # The remote item is already deleted, so the request
                    # still succeeds; record the bookkeeping failure
                    # instead of discarding it silently.
                    logger.exception(
                        "delete_item: failed to record deletion of %s in DB",
                        item_id,
                    )
            return jsonify({"ok": True})
        return jsonify({"ok": False, "error": "Delete returned unexpected result"})
    except M365PermissionError:
        return jsonify({"ok": False, "error":
            "Permission denied (403) — deletion requires Mail.ReadWrite / Files.ReadWrite.All / Sites.ReadWrite.All. "
            "Go to Azure → App registrations → API permissions → add these and Grant admin consent."})
    except Exception as e:
        logger.exception("delete_item: deleting %s failed", item_id)
        return jsonify({"ok": False, "error": str(e)})
@bp.route("/api/delete_bulk", methods=["POST"])
def delete_bulk():
    """Delete multiple flagged items and return a JSON summary.

    Request JSON fields:
        ids: explicit list of item ids to delete. When empty or missing,
            ``filters`` selects the targets instead.
        filters: optional mapping with ``source_type`` (exact match),
            ``min_cpr`` (minimum ``cpr_count``) and ``older_than_date``
            (items whose ``modified`` value compares <= this string).
        reason: label stored with each audit record
            (manual/bulk/retention/data-subject-request); defaults to
            ``"bulk"``.

    Returns:
        ``{"ok": True, "deleted": n, "failed": m, "errors": [...]}`` where
        ``errors`` holds at most 10 entries. ``{"ok": False, "error": ...}``
        with status 401 when no connector is available, or status 400 when
        ``min_cpr`` is not an integer.
    """
    if not state.connector:
        return jsonify({"ok": False, "error": "not authenticated"}), 401
    data = request.get_json() or {}
    # ``or`` also covers an explicit JSON null for these fields.
    item_ids = data.get("ids") or []       # explicit list of ids, or empty = use filters
    filters = data.get("filters") or {}
    del_reason = data.get("reason", "bulk")  # manual/bulk/retention/data-subject-request
    # Build target list
    if item_ids:
        wanted_ids = set(item_ids)  # built once, not once per flagged item
        targets = [x for x in state.flagged_items if x.get("id") in wanted_ids]
    else:
        # NOTE(review): with no ids and no filters every flagged item becomes
        # a target — confirm that "delete everything" is intended here.
        targets = list(state.flagged_items)
        # Apply filters
        source_filter = filters.get("source_type")
        if source_filter:
            targets = [x for x in targets if x.get("source_type") == source_filter]
        if filters.get("min_cpr"):
            try:
                min_cpr = int(filters["min_cpr"])
            except (TypeError, ValueError):
                # Reject bad input before anything has been deleted.
                return jsonify({"ok": False, "error": "min_cpr must be an integer"}), 400
            # A missing or null cpr_count is treated as 0.
            targets = [x for x in targets if (x.get("cpr_count") or 0) >= min_cpr]
        cutoff = filters.get("older_than_date")
        if cutoff:
            # Plain string comparison — presumably both sides are ISO-style
            # date strings; verify against the caller. Items without a
            # usable "modified" value get the "9999" sentinel so an item of
            # unknown age is never selected by this filter.
            targets = [x for x in targets if (x.get("modified") or "9999") <= cutoff]
    deleted_ids = []
    failed_items = []
    for item in targets:
        iid = item.get("id", "")
        source_type = item.get("source_type", "")
        account_id = item.get("account_id", "") or "me"
        drive_id = item.get("drive_id", "")
        try:
            if source_type == "email":
                state.connector.delete_message(account_id, iid)
            elif drive_id:
                state.connector.delete_drive_item(drive_id, iid)
            else:
                state.connector.delete_drive_item_for_user(account_id, iid)
            deleted_ids.append(iid)
        except M365PermissionError:
            failed_items.append({"id": iid, "name": item.get("name", ""), "error":
                "403 — requires Mail.ReadWrite / Files.ReadWrite.All / Sites.ReadWrite.All (Azure admin consent)"})
        except Exception as e:
            failed_items.append({"id": iid, "name": item.get("name", ""), "error": str(e)})
    deleted_set = set(deleted_ids)
    # Build id->item map for audit log
    _deleted_meta = {x.get("id"): x for x in targets if x.get("id") in deleted_set}
    state.flagged_items = [x for x in state.flagged_items if x.get("id") not in deleted_set]
    _db = _get_db() if DB_OK else None
    if _db:
        for _did in deleted_ids:
            try:
                _db.log_deletion(_deleted_meta.get(_did, {"id": _did}), reason=del_reason)
                _db.delete_item_record(_did)
            except Exception:
                # The remote delete already happened; keep going with the
                # remaining records but leave a trace of the failure.
                logger.exception(
                    "delete_bulk: failed to record deletion of %s in DB", _did
                )
    return jsonify({
        "ok": True,
        "deleted": len(deleted_ids),
        "failed": len(failed_items),
        "errors": failed_items[:10],  # cap error list
    })