feat: video/audio metadata scanning, profile rename fix, route tests
- Scan .mp4/.mov/.avi/.mkv and .mp3/.flac/.ogg/.m4a/.wma (+ 7 more)
for GPS coordinates, artist/author, title, comment — metadata only,
no frame or audio analysis. Uses mutagen (added to requirements.txt).
GPS-tagged phone recordings now flag with gps_location like photos.
- Fix _extract_audio_metadata silently returning empty results:
mutagen.File() first positional arg is `filename`, not `fileobj` —
was passing BytesIO as the filename. Fixed to keyword args.
- Fix profile copy rename not reflected in left column until modal
reopen: _pmgmtSaveFullEdit called loadProfiles() but never
_renderProfileMgmt(). Added re-render and active-row highlight.
- Add TestProfileRoutes (10 tests) covering all profile API endpoints
including a rename regression test. Total: 182 tests.
- generate_fixtures.py now produces 6 audio/video fixtures (14–19):
2 MP3, 2 FLAC, 2 MP4 — 3 flagged, 3 negative cases.
348 lines
14 KiB
Python
348 lines
14 KiB
Python
"""
|
||
Generate binary fixture files for the local-file GDPR scan test suite.
|
||
|
||
Run from repo root:
|
||
source venv/bin/activate
|
||
python tests/fixtures/local_files/generate_fixtures.py
|
||
|
||
Fixtures produced
|
||
─────────────────
|
||
Document fixtures (require python-docx + openpyxl):
|
||
09_cpr_in_docx.docx — Word document with 2 CPR numbers → Flag
|
||
13_cpr_in_xlsx.xlsx — Excel workbook with CPR numbers → Flag
|
||
|
||
Audio fixtures (require mutagen):
|
||
14_audio_artist_pii.mp3 — MP3 with artist/title tags (personal name) → Flag
|
||
15_audio_artist_pii.flac — FLAC with artist/title Vorbis comments → Flag
|
||
16_audio_no_pii.mp3 — MP3 with no metadata tags → No flag
|
||
17_audio_no_pii.flac — FLAC with no metadata → No flag
|
||
|
||
Video fixtures (require mutagen):
|
||
18_video_gps.mp4 — MP4 with GPS coordinates + artist tag → Flag
|
||
19_video_no_pii.mp4 — MP4 with no metadata tags → No flag
|
||
"""
|
||
import struct
|
||
import tempfile
|
||
import os
|
||
from pathlib import Path
|
||
import sys
|
||
|
||
# Directory containing this script; every fixture file is written next to it.
HERE = Path(__file__).parent
|
||
|
||
def _require(pkg):
|
||
try:
|
||
return __import__(pkg)
|
||
except ImportError:
|
||
print(f"Missing: {pkg} → pip install {pkg}", file=sys.stderr)
|
||
sys.exit(1)
|
||
|
||
openpyxl = _require("openpyxl")
|
||
docx = _require("docx")
|
||
_require("mutagen")
|
||
|
||
from openpyxl import Workbook
|
||
from openpyxl.styles import Font, PatternFill, Alignment
|
||
from docx import Document
|
||
from docx.shared import Pt, RGBColor
|
||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||
|
||
|
||
# ── 09_cpr_in_docx.docx ───────────────────────────────────────────────────────
|
||
def make_docx():
    """Write 09_cpr_in_docx.docx: a fictitious pupil record with two CPR numbers.

    Personal data is emitted as "Label: value" paragraphs so that each CPR
    value is always preceded by ": ". This avoids the _CPR_PREFIX_NOISE guard
    that fires when table-cell runs are concatenated without a separator.
    """
    document = Document()

    def labelled_paragraph(caption, text):
        # Bold "caption: " run followed by a plain value run with trailing space.
        para = document.add_paragraph()
        para.add_run(f"{caption}: ").bold = True
        para.add_run(text + " ")

    document.add_heading("Elevjournal — Gudenaaskolen", level=1)

    # Italic confidentiality notice directly under the title.
    notice = document.add_paragraph()
    notice.add_run(
        "Dette dokument indeholder personoplysninger og er fortroligt."
    ).italic = True

    # First pupil (contains the first CPR number).
    document.add_heading("Elevoplysninger", level=2)
    first_pupil = (
        ("Navn", "Magnus Lund Eriksen"),
        ("CPR-nummer", "010172-1019"),
        ("Klasse", "8B"),
        ("Adresse", "Egevej 3, 8680 Ry"),
        ("Telefon", "+45 40 12 34 56"),
        ("E-mail", "magnus.eriksen@elev.gudenaaskolen.dk"),
    )
    for caption, text in first_pupil:
        labelled_paragraph(caption, text)

    document.add_heading("Forældrekontakt", level=2)
    document.add_paragraph(
        "Forældrene er orienteret om elevens situation den 15. marts 2026. "
        "Begge forældre deltog i mødet. Næste opfølgning er planlagt til "
        "maj 2026."
    )

    # Second pupil (contains the second CPR number).
    document.add_heading("Anden elev — tabel", level=2)
    document.add_paragraph(
        "Nedenstående tabel viser en anden elev, der deler klasse med Magnus."
    )
    second_pupil = (
        ("Navn", "Nora Bjerrum Nielsen"),
        ("Personnummer", "280490-0120"),
        ("Klasse", "8B"),
    )
    for caption, text in second_pupil:
        labelled_paragraph(caption, text)

    document.add_heading("Sagsbehandlernote", level=2)
    document.add_paragraph(
        "Sagsbehandler: M. Andersen\n"
        "Dato: 20. april 2026\n"
        "Der er ikke fundet grundlag for yderligere foranstaltninger."
    )

    target = HERE / "09_cpr_in_docx.docx"
    document.save(str(target))
    print(f"Written: {target.name}")
|
||
|
||
|
||
# ── 13_cpr_in_xlsx.xlsx ───────────────────────────────────────────────────────
|
||
def make_xlsx():
    """Write 13_cpr_in_xlsx.xlsx: a two-sheet workbook containing CPR numbers.

    Sheet "Elevliste" lists pupils and sheet "Medarbejdere" lists employees.
    Both get a styled header row, the data rows below it, and column widths
    sized to the longest cell text plus 4.
    """
    title_font = Font(bold=True, color="FFFFFF")
    title_fill = PatternFill("solid", fgColor="2B5F9E")

    def populate(sheet, titles, records):
        # Header row: white bold text on a blue fill, centred.
        for col_no, title in enumerate(titles, 1):
            head = sheet.cell(row=1, column=col_no, value=title)
            head.font = title_font
            head.fill = title_fill
            head.alignment = Alignment(horizontal="center")
        # Data rows start at row 2.
        for row_no, record in enumerate(records, 2):
            for col_no, item in enumerate(record, 1):
                sheet.cell(row=row_no, column=col_no, value=item)
        # Widen every column to fit its longest value.
        for column in sheet.columns:
            widest = max(len(str(cell.value or "")) for cell in column)
            sheet.column_dimensions[column[0].column_letter].width = widest + 4

    workbook = Workbook()

    # Sheet 1: Elevliste
    pupils_sheet = workbook.active
    pupils_sheet.title = "Elevliste"
    populate(
        pupils_sheet,
        ["Klasse", "Navn", "CPR-nummer", "Adresse", "Forælder tlf", "Bemærkninger"],
        [
            ("7A", "Magnus Lund Eriksen", "010172-1019", "Egevej 3, 8680 Ry", "+45 40 12 34 56", ""),
            ("7A", "Nora Bjerrum Nielsen", "280490-0120", "Møllevej 11, 8680 Ry", "+45 50 23 45 67", "Brillebærer"),
            ("7A", "Oliver Skov Madsen", "250372-0100", "Kirkegade 2, 8660 Skanderborg", "+45 60 34 56 78", ""),
            ("7B", "Rasmus Dal Kristensen", "150365-1102", "Rosenvej 5, 8680 Ry", "+45 21 56 78 90", ""),
            ("7B", "Sofie Holm Thomsen", "111111-1010", "Birkevej 22, 8660 Skanderborg", "+45 31 67 89 01", "Allergi: nødder"),
            ("7B", "Emil Sand Jensen", "010107-4102", "Hybenvej 7, 8680 Ry", "+45 41 78 90 12", ""),
        ],
    )

    # Sheet 2: Medarbejdere
    staff_sheet = workbook.create_sheet("Medarbejdere")
    populate(
        staff_sheet,
        ["ID", "Navn", "Personnummer", "Afdeling", "E-mail"],
        [
            ("EMP-001", "Christian Bøgh Hansen", "150365-1102", "Ledelse", "c.hansen@gudenaaskolen.dk"),
            ("EMP-002", "Mette Dahl Andersen", "280490-0120", "Administration", "m.andersen@gudenaaskolen.dk"),
            ("EMP-003", "Søren Lykke Jakobsen", "010172-1019", "Pædagogik", "s.jakobsen@gudenaaskolen.dk"),
        ],
    )

    target = HERE / "13_cpr_in_xlsx.xlsx"
    workbook.save(str(target))
    print(f"Written: {target.name}")
|
||
|
||
|
||
# ── Audio / video helpers ─────────────────────────────────────────────────────
|
||
|
||
# Two silent MPEG1 Layer3 frames (128 kbps / 44100 Hz / stereo).
|
||
# mutagen needs at least 2 consecutive frame headers to confirm sync.
|
||
# 4-byte header + 413 bytes frame body = 417 bytes × 2 = 834 bytes total.
|
||
_MPEG_FRAMES = (b'\xff\xfb\x90\x00' + b'\x00' * 413) * 2
|
||
|
||
|
||
def _flac_block_header(block_type: int, data_len: int, last: bool = False) -> bytes:
|
||
first = (0x80 if last else 0x00) | block_type
|
||
return bytes([first, (data_len >> 16) & 0xFF, (data_len >> 8) & 0xFF, data_len & 0xFF])
|
||
|
||
|
||
def _vorbis_comment_block(comments: dict) -> bytes:
|
||
vendor = b'GDPRScanner fixture'
|
||
data = struct.pack('<I', len(vendor)) + vendor
|
||
data += struct.pack('<I', len(comments))
|
||
for key, value in comments.items():
|
||
entry = f'{key}={value}'.encode('utf-8')
|
||
data += struct.pack('<I', len(entry)) + entry
|
||
return data
|
||
|
||
|
||
def _minimal_flac(comments: dict) -> bytes:
    """Return the bytes of a minimal FLAC stream (metadata only, no audio frames).

    The stream is the ``fLaC`` marker, a STREAMINFO block and, when *comments*
    is non-empty, a Vorbis comment block holding those comments. Whichever
    block comes last carries the "last metadata block" flag.
    """
    # STREAMINFO payload, 34 bytes: 44100 Hz, mono, 16-bit, 0 samples, zero MD5.
    streaminfo = (
        struct.pack('>HH', 4096, 4096)   # bytes 0-3: min / max block size
        + bytes(6)                       # bytes 4-9: min / max frame size, 0 = unknown
        # bytes 10-13 pack three bit fields:
        #   20 bits sample rate       = 44100 (0x0AC44)
        #    3 bits channels - 1      = 0     (mono)
        #    5 bits bits/sample - 1   = 15    (16-bit)
        #   low nibble of byte 13     = top 4 bits of total_samples (0)
        + bytes([0x0A, 0xC4, 0x40, 0xF0])
        + bytes(20)                      # bytes 14-33: rest of total_samples + MD5, all zero
    )

    comment_payload = _vorbis_comment_block(comments)

    # STREAMINFO is the final block only when no comment block follows it.
    pieces = [b'fLaC', _flac_block_header(0, 34, last=not comments), streaminfo]
    if comments:
        pieces.append(_flac_block_header(4, len(comment_payload), last=True))
        pieces.append(comment_payload)
    return b''.join(pieces)
|
||
|
||
|
||
def _mp4_atom(name: bytes, data: bytes) -> bytes:
|
||
return struct.pack('>I', 8 + len(data)) + name + data
|
||
|
||
|
||
def _minimal_mp4_base() -> bytes:
    """Return the bytes of a minimal MPEG-4 container: ``ftyp`` + ``moov/mvhd``.

    The container holds no tracks and no ``udta/meta/ilst`` tag atoms; it is
    intended as the smallest file mutagen can attach tags to.

    The ``mvhd`` fields are laid out at the offsets given by ISO 14496-12
    §8.2.2 for a version-0 box. Earlier offsets were 4 bytes too low for
    rate, volume and matrix, so the rate write overwrote the duration field.
    """
    # ftyp: major brand 'M4A ', minor version 0, compatible brands M4A/mp42/isom.
    ftyp = _mp4_atom(
        b'ftyp',
        b'M4A ' + struct.pack('>I', 0) + b'M4A ' + b'mp42' + b'isom',
    )

    # mvhd version 0 — 100 bytes of content.
    mvhd = bytearray(100)
    # bytes 0-3: version + flags (already zero)
    # bytes 4-19: creation, modification, timescale, duration
    struct.pack_into('>IIII', mvhd, 4, 0, 0, 1000, 0)
    struct.pack_into('>I', mvhd, 20, 0x00010000)          # bytes 20-23: rate = 1.0
    struct.pack_into('>H', mvhd, 24, 0x0100)              # bytes 24-25: volume = 1.0
    # bytes 26-35: reserved (10 bytes, already zero)
    struct.pack_into('>9i', mvhd, 36,                     # bytes 36-71: unity matrix
                     0x00010000, 0, 0, 0, 0x00010000, 0, 0, 0, 0x40000000)
    # bytes 72-95: pre_defined (24 bytes, already zero)
    struct.pack_into('>I', mvhd, 96, 0xFFFFFFFF)          # bytes 96-99: next_track_ID

    return ftyp + _mp4_atom(b'moov', _mp4_atom(b'mvhd', bytes(mvhd)))
|
||
|
||
|
||
def _mp4_with_tags(tags: dict) -> bytes:
    """Return bytes for a minimal MP4 carrying the given mutagen tags.

    Args:
        tags: Mapping of mutagen MP4 tag key to a single value; each value is
            stored as a one-element list.

    Returns:
        The complete file contents after mutagen has saved the tags.
    """
    import mutagen.mp4

    # mutagen tags files on disk, so stage the container in a private
    # temporary directory. This replaces the deprecated tempfile.mktemp(),
    # whose returned name can be claimed by another process before use.
    # The directory and its contents are removed when the block exits.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp = os.path.join(tmp_dir, 'fixture.mp4')
        with open(tmp, 'wb') as fh:
            fh.write(_minimal_mp4_base())
        f = mutagen.mp4.MP4(tmp)
        f.add_tags()
        for key, value in tags.items():
            f.tags[key] = [value]
        f.save()
        with open(tmp, 'rb') as fh:
            return fh.read()
|
||
|
||
|
||
# ── 14_audio_artist_pii.mp3 ───────────────────────────────────────────────────
|
||
def make_mp3_pii():
    """Write 14_audio_artist_pii.mp3: ID3 artist/title/date tags plus MPEG frames.

    The ID3 tag is produced by mutagen's EasyID3 and then prepended to the
    module-level ``_MPEG_FRAMES`` bytes.
    """
    from mutagen.easyid3 import EasyID3

    t = EasyID3()
    t['artist'] = ['Emma Slot Henriksen']
    t['title'] = ['Fortrolig optagelse — personalemøde']
    t['date'] = ['2026-04-21']

    # Save the tag into a private temporary directory and read it back.
    # This replaces the deprecated tempfile.mktemp(), whose returned name can
    # be claimed by another process before use. Cleanup is automatic.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp = os.path.join(tmp_dir, 'tags.mp3')
        t.save(tmp)
        with open(tmp, 'rb') as fh:
            id3_bytes = fh.read()

    out = HERE / '14_audio_artist_pii.mp3'
    out.write_bytes(id3_bytes + _MPEG_FRAMES)
    print(f"Written: {out.name}")
|
||
|
||
|
||
# ── 15_audio_artist_pii.flac ──────────────────────────────────────────────────
|
||
def make_flac_pii():
    """Write 15_audio_artist_pii.flac with ARTIST/TITLE/DATE Vorbis comments."""
    vorbis_tags = {
        'ARTIST': 'Emma Slot Henriksen',
        'TITLE': 'Fortrolig optagelse — personalemøde',
        'DATE': '2026-04-21',
    }
    target = HERE / '15_audio_artist_pii.flac'
    target.write_bytes(_minimal_flac(vorbis_tags))
    print(f"Written: {target.name}")
|
||
|
||
|
||
# ── 16_audio_no_pii.mp3 ───────────────────────────────────────────────────────
|
||
def make_mp3_no_pii():
    """Write 16_audio_no_pii.mp3: an EasyID3 save with no tags set, plus MPEG frames.

    Whatever bytes mutagen writes for the empty tag are prepended to the
    module-level ``_MPEG_FRAMES`` bytes.
    """
    from mutagen.easyid3 import EasyID3

    # Save into a private temporary directory and read the result back.
    # This replaces the deprecated tempfile.mktemp(), whose returned name can
    # be claimed by another process before use. Cleanup is automatic.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp = os.path.join(tmp_dir, 'tags.mp3')
        EasyID3().save(tmp)  # empty ID3 header, no tags
        with open(tmp, 'rb') as fh:
            id3_bytes = fh.read()

    out = HERE / '16_audio_no_pii.mp3'
    out.write_bytes(id3_bytes + _MPEG_FRAMES)
    print(f"Written: {out.name}")
|
||
|
||
|
||
# ── 17_audio_no_pii.flac ──────────────────────────────────────────────────────
|
||
def make_flac_no_pii():
    """Write 17_audio_no_pii.flac: a minimal FLAC without a Vorbis comment block."""
    target = HERE / '17_audio_no_pii.flac'
    # An empty mapping makes _minimal_flac omit the comment block entirely.
    flac_bytes = _minimal_flac({})
    target.write_bytes(flac_bytes)
    print(f"Written: {target.name}")
|
||
|
||
|
||
# ── 18_video_gps.mp4 ─────────────────────────────────────────────────────────
|
||
def make_mp4_gps():
    """Write 18_video_gps.mp4 tagged with a GPS position, an artist and a title."""
    mp4_tags = {
        '©xyz': '+55.6761+012.5683+000.000/',  # Copenhagen
        '©ART': 'Emma Slot Henriksen',
        '©nam': 'Optagelse fra skolegården',
    }
    target = HERE / '18_video_gps.mp4'
    target.write_bytes(_mp4_with_tags(mp4_tags))
    print(f"Written: {target.name}")
|
||
|
||
|
||
# ── 19_video_no_pii.mp4 ──────────────────────────────────────────────────────
|
||
def make_mp4_no_pii():
    """Write 19_video_no_pii.mp4: the bare container with no tag atoms."""
    target = HERE / '19_video_no_pii.mp4'
    # The base container has no moov/udta/meta/ilst atom, hence no tags.
    container = _minimal_mp4_base()
    target.write_bytes(container)
    print(f"Written: {target.name}")
|
||
|
||
|
||
if __name__ == "__main__":
    # Build every fixture in numeric order (09, 13, 14 … 19).
    for build_fixture in (
        make_docx,
        make_xlsx,
        make_mp3_pii,
        make_flac_pii,
        make_mp3_no_pii,
        make_flac_no_pii,
        make_mp4_gps,
        make_mp4_no_pii,
    ):
        build_fixture()
    print("Done.")
|