""" Generate binary fixture files for the local-file GDPR scan test suite. Run from repo root: source venv/bin/activate python tests/fixtures/local_files/generate_fixtures.py Fixtures produced ───────────────── Document fixtures (require python-docx + openpyxl): 09_cpr_in_docx.docx — Word document with 2 CPR numbers → Flag 13_cpr_in_xlsx.xlsx — Excel workbook with CPR numbers → Flag Audio fixtures (require mutagen): 14_audio_artist_pii.mp3 — MP3 with artist/title tags (personal name) → Flag 15_audio_artist_pii.flac — FLAC with artist/title Vorbis comments → Flag 16_audio_no_pii.mp3 — MP3 with no metadata tags → No flag 17_audio_no_pii.flac — FLAC with no metadata → No flag Video fixtures (require mutagen): 18_video_gps.mp4 — MP4 with GPS coordinates + artist tag → Flag 19_video_no_pii.mp4 — MP4 with no metadata tags → No flag """ import struct import tempfile import os from pathlib import Path import sys HERE = Path(__file__).parent def _require(pkg): try: return __import__(pkg) except ImportError: print(f"Missing: {pkg} → pip install {pkg}", file=sys.stderr) sys.exit(1) openpyxl = _require("openpyxl") docx = _require("docx") _require("mutagen") from openpyxl import Workbook from openpyxl.styles import Font, PatternFill, Alignment from docx import Document from docx.shared import Pt, RGBColor from docx.enum.text import WD_ALIGN_PARAGRAPH # ── 09_cpr_in_docx.docx ─────────────────────────────────────────────────────── def make_docx(): doc = Document() doc.add_heading("Elevjournal — Gudenaaskolen", level=1) p = doc.add_paragraph() p.add_run("Dette dokument indeholder personoplysninger og er fortroligt.") p.runs[0].italic = True doc.add_heading("Elevoplysninger", level=2) # Use labelled paragraphs so CPR values are always preceded by ": " — # avoids the _CPR_PREFIX_NOISE guard that fires when table-cell runs are # concatenated without a separator. fields = [ ("Navn", "Magnus Lund Eriksen"), ("CPR-nummer", "010172-1019"), ("Klasse", "8B"), ("Adresse", "Egevej 3, 8680 Ry"), ("Telefon", "+45 40 12 34 56"), ("E-mail", "magnus.eriksen@elev.gudenaaskolen.dk"), ] for label, value in fields: p = doc.add_paragraph() run_label = p.add_run(f"{label}: ") run_label.bold = True p.add_run(value + " ") doc.add_heading("Forældrekontakt", level=2) doc.add_paragraph( "Forældrene er orienteret om elevens situation den 15. marts 2026. " "Begge forældre deltog i mødet. Næste opfølgning er planlagt til " "maj 2026." ) doc.add_heading("Anden elev — tabel", level=2) doc.add_paragraph( "Nedenstående tabel viser en anden elev, der deler klasse med Magnus." ) for label, value in [ ("Navn", "Nora Bjerrum Nielsen"), ("Personnummer", "280490-0120"), ("Klasse", "8B"), ]: p = doc.add_paragraph() p.add_run(f"{label}: ").bold = True p.add_run(value + " ") doc.add_heading("Sagsbehandlernote", level=2) doc.add_paragraph( "Sagsbehandler: M. Andersen\n" "Dato: 20. april 2026\n" "Der er ikke fundet grundlag for yderligere foranstaltninger." ) out = HERE / "09_cpr_in_docx.docx" doc.save(str(out)) print(f"Written: {out.name}") # ── 13_cpr_in_xlsx.xlsx ─────────────────────────────────────────────────────── def make_xlsx(): wb = Workbook() # Sheet 1: Elevliste ws1 = wb.active ws1.title = "Elevliste" header_font = Font(bold=True, color="FFFFFF") header_fill = PatternFill("solid", fgColor="2B5F9E") headers = ["Klasse", "Navn", "CPR-nummer", "Adresse", "Forælder tlf", "Bemærkninger"] for col, h in enumerate(headers, 1): cell = ws1.cell(row=1, column=col, value=h) cell.font = header_font cell.fill = header_fill cell.alignment = Alignment(horizontal="center") students = [ ("7A", "Magnus Lund Eriksen", "010172-1019", "Egevej 3, 8680 Ry", "+45 40 12 34 56", ""), ("7A", "Nora Bjerrum Nielsen", "280490-0120", "Møllevej 11, 8680 Ry", "+45 50 23 45 67", "Brillebærer"), ("7A", "Oliver Skov Madsen", "250372-0100", "Kirkegade 2, 8660 Skanderborg", "+45 60 34 56 78", ""), ("7B", "Rasmus Dal Kristensen", "150365-1102", "Rosenvej 5, 8680 Ry", "+45 21 56 78 90", ""), ("7B", "Sofie Holm Thomsen", "111111-1010", "Birkevej 22, 8660 Skanderborg", "+45 31 67 89 01", "Allergi: nødder"), ("7B", "Emil Sand Jensen", "010107-4102", "Hybenvej 7, 8680 Ry", "+45 41 78 90 12", ""), ] for row_i, row_data in enumerate(students, 2): for col_i, val in enumerate(row_data, 1): ws1.cell(row=row_i, column=col_i, value=val) for col in ws1.columns: max_len = max(len(str(c.value or "")) for c in col) ws1.column_dimensions[col[0].column_letter].width = max_len + 4 # Sheet 2: Medarbejdere ws2 = wb.create_sheet("Medarbejdere") emp_headers = ["ID", "Navn", "Personnummer", "Afdeling", "E-mail"] for col, h in enumerate(emp_headers, 1): cell = ws2.cell(row=1, column=col, value=h) cell.font = header_font cell.fill = header_fill cell.alignment = Alignment(horizontal="center") employees = [ ("EMP-001", "Christian Bøgh Hansen", "150365-1102", "Ledelse", "c.hansen@gudenaaskolen.dk"), ("EMP-002", "Mette Dahl Andersen", "280490-0120", "Administration", "m.andersen@gudenaaskolen.dk"), ("EMP-003", "Søren Lykke Jakobsen", "010172-1019", "Pædagogik", "s.jakobsen@gudenaaskolen.dk"), ] for row_i, row_data in enumerate(employees, 2): for col_i, val in enumerate(row_data, 1): ws2.cell(row=row_i, column=col_i, value=val) for col in ws2.columns: max_len = max(len(str(c.value or "")) for c in col) ws2.column_dimensions[col[0].column_letter].width = max_len + 4 out = HERE / "13_cpr_in_xlsx.xlsx" wb.save(str(out)) print(f"Written: {out.name}") # ── Audio / video helpers ───────────────────────────────────────────────────── # Two silent MPEG1 Layer3 frames (128 kbps / 44100 Hz / mono). # mutagen needs at least 2 consecutive frame headers to confirm sync. # 4-byte header + 413 bytes frame body = 417 bytes × 2 = 834 bytes total. _MPEG_FRAMES = (b'\xff\xfb\x90\x00' + b'\x00' * 413) * 2 def _flac_block_header(block_type: int, data_len: int, last: bool = False) -> bytes: first = (0x80 if last else 0x00) | block_type return bytes([first, (data_len >> 16) & 0xFF, (data_len >> 8) & 0xFF, data_len & 0xFF]) def _vorbis_comment_block(comments: dict) -> bytes: vendor = b'GDPRScanner fixture' data = struct.pack(' bytes: """Return bytes for a valid minimal FLAC file with Vorbis comments.""" # STREAMINFO (34 bytes): 44100 Hz, mono, 16-bit, 0 samples, zero MD5. si = bytearray(34) si[0:2] = struct.pack('>H', 4096) # min block size si[2:4] = struct.pack('>H', 4096) # max block size # bytes 4-9: min/max frame sizes = 0 (unknown) # Bits 80-99: sample_rate=44100 (0xAC44 in 20-bit field) # Bits 100-102: channels-1 = 0 (mono) # Bits 103-107: bits_per_sample-1 = 15 (16-bit) # Bits 108-143: total_samples = 0; bytes 14-17 remain zero si[10] = 0x0A # 0000_1010 — top 8 of 44100 in 20-bit field si[11] = 0xC4 # 1100_0100 si[12] = 0x40 # bottom 4 of sample_rate | channels(000) | bps_msb(0) si[13] = 0xF0 # bps remaining 4 bits (1111) | top 4 of total_samples (0) vc = _vorbis_comment_block(comments) return ( b'fLaC' + _flac_block_header(0, 34, last=not comments) # STREAMINFO + bytes(si) + (_flac_block_header(4, len(vc), last=True) + vc if comments else b'') ) def _mp4_atom(name: bytes, data: bytes) -> bytes: return struct.pack('>I', 8 + len(data)) + name + data def _minimal_mp4_base() -> bytes: """Return bytes for the smallest valid MPEG-4 container mutagen can tag.""" # ftyp — identifies the file as M4A ftyp = _mp4_atom( b'ftyp', b'M4A ' + struct.pack('>I', 0) + b'M4A ' + b'mp42' + b'isom', ) # mvhd version 0 — 100 bytes of content (ISO 14496-12 §8.2.2) mvhd = bytearray(100) mvhd[0:4] = b'\x00\x00\x00\x00' # version + flags struct.pack_into('>IIII', mvhd, 4, 0, 0, 1000, 0) # creation, modification, timescale, duration struct.pack_into('>I', mvhd, 16, 0x00010000) # rate = 1.0 struct.pack_into('>H', mvhd, 20, 0x0100) # volume = 1.0 # bytes 22-31: reserved (10 bytes, already zero) struct.pack_into('>9i', mvhd, 32, # unity matrix 0x00010000, 0, 0, 0, 0x00010000, 0, 0, 0, 0x40000000) # bytes 68-91: pre-defined (24 bytes, already zero) struct.pack_into('>I', mvhd, 96, 0xFFFFFFFF) # next_track_ID return ftyp + _mp4_atom(b'moov', _mp4_atom(b'mvhd', bytes(mvhd))) def _mp4_with_tags(tags: dict) -> bytes: """Return bytes for a minimal MP4 with the given mutagen tag dict.""" import mutagen.mp4 tmp = tempfile.mktemp(suffix='.mp4') try: with open(tmp, 'wb') as fh: fh.write(_minimal_mp4_base()) f = mutagen.mp4.MP4(tmp) f.add_tags() for key, value in tags.items(): f.tags[key] = [value] f.save() with open(tmp, 'rb') as fh: return fh.read() finally: if os.path.exists(tmp): os.unlink(tmp) # ── 14_audio_artist_pii.mp3 ─────────────────────────────────────────────────── def make_mp3_pii(): from mutagen.easyid3 import EasyID3 tmp = tempfile.mktemp(suffix='.mp3') try: t = EasyID3() t['artist'] = ['Emma Slot Henriksen'] t['title'] = ['Fortrolig optagelse — personalemøde'] t['date'] = ['2026-04-21'] t.save(tmp) with open(tmp, 'rb') as fh: id3_bytes = fh.read() finally: if os.path.exists(tmp): os.unlink(tmp) out = HERE / '14_audio_artist_pii.mp3' out.write_bytes(id3_bytes + _MPEG_FRAMES) print(f"Written: {out.name}") # ── 15_audio_artist_pii.flac ────────────────────────────────────────────────── def make_flac_pii(): out = HERE / '15_audio_artist_pii.flac' out.write_bytes(_minimal_flac({ 'ARTIST': 'Emma Slot Henriksen', 'TITLE': 'Fortrolig optagelse — personalemøde', 'DATE': '2026-04-21', })) print(f"Written: {out.name}") # ── 16_audio_no_pii.mp3 ─────────────────────────────────────────────────────── def make_mp3_no_pii(): from mutagen.easyid3 import EasyID3 tmp = tempfile.mktemp(suffix='.mp3') try: EasyID3().save(tmp) # empty ID3 header, no tags with open(tmp, 'rb') as fh: id3_bytes = fh.read() finally: if os.path.exists(tmp): os.unlink(tmp) out = HERE / '16_audio_no_pii.mp3' out.write_bytes(id3_bytes + _MPEG_FRAMES) print(f"Written: {out.name}") # ── 17_audio_no_pii.flac ────────────────────────────────────────────────────── def make_flac_no_pii(): out = HERE / '17_audio_no_pii.flac' out.write_bytes(_minimal_flac({})) # no Vorbis comment block print(f"Written: {out.name}") # ── 18_video_gps.mp4 ───────────────────────────────────────────────────────── def make_mp4_gps(): out = HERE / '18_video_gps.mp4' out.write_bytes(_mp4_with_tags({ '©xyz': '+55.6761+012.5683+000.000/', # Copenhagen '©ART': 'Emma Slot Henriksen', '©nam': 'Optagelse fra skolegården', })) print(f"Written: {out.name}") # ── 19_video_no_pii.mp4 ────────────────────────────────────────────────────── def make_mp4_no_pii(): out = HERE / '19_video_no_pii.mp4' out.write_bytes(_minimal_mp4_base()) # no moov/udta/meta/ilst — no tags print(f"Written: {out.name}") if __name__ == "__main__": make_docx() make_xlsx() make_mp3_pii() make_flac_pii() make_mp3_no_pii() make_flac_no_pii() make_mp4_gps() make_mp4_no_pii() print("Done.")