916 lines
35 KiB
Python
916 lines
35 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
google_connector.py — Google Workspace connector for GDPR Scanner.
|
|
|
|
Handles service-account authentication with domain-wide delegation and exposes
|
|
iterators for:
|
|
- Gmail messages (body + attachments) via the Gmail API
|
|
- Google Drive files (with export for native Docs/Sheets/Slides) via Drive API
|
|
|
|
All file content is yielded as (metadata_dict, bytes_content) tuples, matching
|
|
the same contract used by m365_connector so the scan engine can reuse _scan_bytes.
|
|
|
|
Authentication:
|
|
Service account JSON key with domain-wide delegation enabled in Google Workspace
|
|
Admin Console → Security → API Controls → Domain-wide delegation.
|
|
|
|
Required OAuth scopes (add to the service account's delegation entry):
|
|
https://www.googleapis.com/auth/gmail.readonly
|
|
https://www.googleapis.com/auth/drive.readonly
|
|
https://www.googleapis.com/auth/admin.directory.user.readonly (user listing)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import base64
|
|
import io
|
|
import json
|
|
import logging
|
|
import time
|
|
import threading
|
|
from pathlib import Path
|
|
from typing import Iterator, Optional
|
|
|
|
# ── google-auth / google-api-python-client ────────────────────────────────────
|
|
try:
|
|
from google.oauth2 import service_account
|
|
from googleapiclient.discovery import build
|
|
from googleapiclient.errors import HttpError
|
|
from googleapiclient.http import MediaIoBaseDownload
|
|
GOOGLE_AUTH_OK = True
|
|
|
|
# Suppress the googleapiclient.http WARNING that fires before raising
|
|
# HttpError for exportSizeLimitExceeded — we handle it ourselves below.
|
|
class _SuppressExportSizeWarning(logging.Filter):
|
|
def filter(self, record: logging.LogRecord) -> bool:
|
|
return "exportSizeLimitExceeded" not in record.getMessage()
|
|
|
|
logging.getLogger("googleapiclient.http").addFilter(_SuppressExportSizeWarning())
|
|
|
|
except ImportError:
|
|
GOOGLE_AUTH_OK = False
|
|
|
|
_DATA_DIR = Path.home() / ".gdprscanner"
|
|
_DATA_DIR.mkdir(exist_ok=True)
|
|
_SA_KEY_FILE = _DATA_DIR / "google_sa.json"
|
|
_GOOGLE_TOKEN_FILE = _DATA_DIR / "google_token.json"
|
|
|
|
PERSONAL_SCOPES = [
|
|
"https://www.googleapis.com/auth/gmail.readonly",
|
|
"https://www.googleapis.com/auth/drive.readonly",
|
|
]
|
|
_DEVICE_AUTH_URL = "https://oauth2.googleapis.com/device/code"
|
|
_TOKEN_URL = "https://oauth2.googleapis.com/token"
|
|
_USERINFO_URL = "https://www.googleapis.com/oauth2/v2/userinfo"
|
|
_DEVICE_GRANT = "urn:ietf:params:oauth:grant-type:device_code"
|
|
|
|
GMAIL_SCOPES = [
|
|
"https://www.googleapis.com/auth/gmail.readonly",
|
|
]
|
|
DRIVE_SCOPES = [
|
|
"https://www.googleapis.com/auth/drive.readonly",
|
|
]
|
|
DRIVE_WRITE_SCOPES = [
|
|
"https://www.googleapis.com/auth/drive",
|
|
]
|
|
ADMIN_SCOPES = [
|
|
"https://www.googleapis.com/auth/admin.directory.user.readonly",
|
|
]
|
|
|
|
# Google-native MIME types and the export format we request
|
|
_EXPORT_MAP = {
|
|
"application/vnd.google-apps.document": ("application/vnd.openxmlformats-officedocument.wordprocessingml.document", ".docx"),
|
|
"application/vnd.google-apps.spreadsheet": ("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", ".xlsx"),
|
|
"application/vnd.google-apps.presentation": ("application/vnd.openxmlformats-officedocument.presentationml.presentation", ".pptx"),
|
|
"application/vnd.google-apps.drawing": ("application/pdf", ".pdf"),
|
|
"application/vnd.google-apps.form": ("application/pdf", ".pdf"),
|
|
}
|
|
|
|
# Maximum export size for native Google files (bytes) — skip larger ones
|
|
_MAX_EXPORT_BYTES = 20 * 1024 * 1024 # 20 MB
|
|
|
|
# ── OU role mapping ───────────────────────────────────────────────────────────
|
|
_OU_ROLES_PATH = Path(__file__).parent / "classification" / "google_ou_roles.json"
|
|
|
|
def _load_ou_roles() -> tuple[list, list]:
|
|
"""Load student/staff OU prefix lists from skus/google_ou_roles.json.
|
|
Returns (student_prefixes, staff_prefixes) — both lowercased."""
|
|
try:
|
|
import json as _j
|
|
data = _j.loads(_OU_ROLES_PATH.read_text(encoding="utf-8"))
|
|
students = [p.lower() for p in data.get("student_ou_prefixes", [])]
|
|
staff = [p.lower() for p in data.get("staff_ou_prefixes", [])]
|
|
return students, staff
|
|
except Exception:
|
|
return ["/elever", "/students"], ["/personale", "/staff", "/lærere", "/ansatte"]
|
|
|
|
def classify_ou_role(org_unit_path: str) -> str:
|
|
"""Return 'student', 'staff', or 'other' based on orgUnitPath prefix."""
|
|
if not org_unit_path:
|
|
return "other"
|
|
path_lower = org_unit_path.lower()
|
|
students, staff = _load_ou_roles()
|
|
for prefix in students:
|
|
if path_lower.startswith(prefix):
|
|
return "student"
|
|
for prefix in staff:
|
|
if path_lower.startswith(prefix):
|
|
return "staff"
|
|
return "other"
|
|
|
|
|
|
|
|
class GoogleError(Exception):
|
|
pass
|
|
|
|
|
|
class GoogleConnector:
|
|
"""
|
|
Wraps service-account + domain-wide delegation auth for Gmail and Drive.
|
|
|
|
Usage:
|
|
conn = GoogleConnector(key_dict, admin_email="admin@domain.com")
|
|
for meta, data in conn.iter_gmail_messages("user@domain.com"):
|
|
...
|
|
"""
|
|
|
|
def __init__(self, key_dict: dict, admin_email: str = ""):
|
|
if not GOOGLE_AUTH_OK:
|
|
raise GoogleError(
|
|
"google-auth not installed — run: "
|
|
"pip install google-auth google-auth-httplib2 google-api-python-client"
|
|
)
|
|
self._key_dict = key_dict
|
|
self._admin_email = admin_email.strip()
|
|
self._lock = threading.Lock()
|
|
# Validate the key looks sane
|
|
if key_dict.get("type") != "service_account":
|
|
raise GoogleError("Key file must be a service_account JSON — found type: " + str(key_dict.get("type")))
|
|
|
|
# ── Credential factories ──────────────────────────────────────────────────
|
|
|
|
def _creds_for(self, user_email: str, scopes: list):
|
|
"""Return delegated credentials impersonating user_email."""
|
|
base = service_account.Credentials.from_service_account_info(
|
|
self._key_dict, scopes=scopes
|
|
)
|
|
return base.with_subject(user_email)
|
|
|
|
def _admin_creds(self):
|
|
"""Admin Directory API credentials (impersonating admin_email)."""
|
|
if not self._admin_email:
|
|
raise GoogleError("admin_email required to list workspace users")
|
|
return self._creds_for(self._admin_email, ADMIN_SCOPES + GMAIL_SCOPES + DRIVE_SCOPES)
|
|
|
|
# ── Connectivity check ────────────────────────────────────────────────────
|
|
|
|
def is_authenticated(self) -> bool:
|
|
"""Light check — verifies credentials refresh without making API calls."""
|
|
try:
|
|
creds = service_account.Credentials.from_service_account_info(
|
|
self._key_dict, scopes=GMAIL_SCOPES
|
|
)
|
|
return bool(creds)
|
|
except Exception:
|
|
return False
|
|
|
|
def get_service_account_email(self) -> str:
|
|
return self._key_dict.get("client_email", "")
|
|
|
|
def get_project_id(self) -> str:
|
|
return self._key_dict.get("project_id", "")
|
|
|
|
# ── User listing ─────────────────────────────────────────────────────────
|
|
|
|
def list_users(self, domain: str = "") -> list[dict]:
|
|
"""
|
|
Return [{id, email, displayName}] for all active users in the domain.
|
|
Requires Admin Directory API scope on the service account delegation.
|
|
Falls back gracefully if admin_email is not set.
|
|
"""
|
|
if not self._admin_email:
|
|
return []
|
|
try:
|
|
creds = self._admin_creds()
|
|
service = build("admin", "directory_v1", credentials=creds, cache_discovery=False)
|
|
results = []
|
|
page_token = None
|
|
params: dict = {"customer": "my_customer", "maxResults": 500, "orderBy": "email", "projection": "full"}
|
|
if domain:
|
|
params["domain"] = domain
|
|
while True:
|
|
if page_token:
|
|
params["pageToken"] = page_token
|
|
resp = service.users().list(**params).execute()
|
|
for u in resp.get("users", []):
|
|
if not u.get("suspended") and not u.get("archived"):
|
|
ou_path = u.get("orgUnitPath", "")
|
|
results.append({
|
|
"id": u.get("id", ""),
|
|
"email": u.get("primaryEmail", ""),
|
|
"displayName": u.get("name", {}).get("fullName", ""),
|
|
"orgUnitPath": ou_path,
|
|
"userRole": classify_ou_role(ou_path),
|
|
})
|
|
page_token = resp.get("nextPageToken")
|
|
if not page_token:
|
|
break
|
|
return results
|
|
except HttpError as e:
|
|
raise GoogleError(f"Admin Directory API error: {e}") from e
|
|
|
|
# ── Gmail iterator ────────────────────────────────────────────────────────
|
|
|
|
def iter_gmail_messages(
|
|
self,
|
|
user_email: str,
|
|
max_messages: int = 2000,
|
|
scan_body: bool = True,
|
|
scan_attachments: bool = True,
|
|
max_attach_mb: float = 20.0,
|
|
) -> Iterator[tuple[dict, bytes]]:
|
|
"""
|
|
Yield (metadata, content_bytes) for each Gmail message / attachment.
|
|
|
|
For messages with only inline text body: yields one item with the body text.
|
|
For attachments: yields one item per attachment (skips if > max_attach_mb).
|
|
"""
|
|
try:
|
|
creds = self._creds_for(user_email, GMAIL_SCOPES)
|
|
service = build("gmail", "v1", credentials=creds, cache_discovery=False)
|
|
except HttpError as e:
|
|
raise GoogleError(f"Gmail auth failed for {user_email}: {e}") from e
|
|
yield from _gmail_iter(service, user_email, max_messages, scan_body, scan_attachments, max_attach_mb)
|
|
|
|
# ── Drive iterator ────────────────────────────────────────────────────────
|
|
|
|
def iter_drive_files(
|
|
self,
|
|
user_email: str,
|
|
max_files: int = 5000,
|
|
max_file_mb: float = 50.0,
|
|
) -> Iterator[tuple[dict, bytes]]:
|
|
"""
|
|
Yield (metadata, content_bytes) for each Drive file.
|
|
|
|
Native Google formats (Docs/Sheets/Slides) are exported to Office format.
|
|
Binary files are downloaded directly (skipped if > max_file_mb).
|
|
"""
|
|
try:
|
|
creds = self._creds_for(user_email, DRIVE_SCOPES)
|
|
service = build("drive", "v3", credentials=creds, cache_discovery=False)
|
|
except HttpError as e:
|
|
raise GoogleError(f"Drive auth failed for {user_email}: {e}") from e
|
|
yield from _drive_iter(service, user_email, max_files, max_file_mb)
|
|
|
|
def get_drive_start_token(self, user_email: str) -> str:
|
|
"""Return the current Changes API start page token for user's Drive."""
|
|
try:
|
|
creds = self._creds_for(user_email, DRIVE_SCOPES)
|
|
service = build("drive", "v3", credentials=creds, cache_discovery=False)
|
|
except HttpError as e:
|
|
raise GoogleError(f"Drive auth failed for {user_email}: {e}") from e
|
|
return _drive_get_start_page_token(service)
|
|
|
|
def get_drive_changes(
|
|
self,
|
|
user_email: str,
|
|
page_token: str,
|
|
max_files: int = 5000,
|
|
max_file_mb: float = 50.0,
|
|
) -> "tuple[list[tuple[dict, bytes]], str]":
|
|
"""Return (changed_files, new_page_token) since page_token."""
|
|
try:
|
|
creds = self._creds_for(user_email, DRIVE_SCOPES)
|
|
service = build("drive", "v3", credentials=creds, cache_discovery=False)
|
|
except HttpError as e:
|
|
raise GoogleError(f"Drive auth failed for {user_email}: {e}") from e
|
|
return _drive_changes_collect(service, user_email, page_token, max_files, max_file_mb)
|
|
|
|
# ── Drive write-back (redaction) ──────────────────────────────────────────
|
|
|
|
def get_drive_file_mime(self, user_email: str, file_id: str) -> str:
|
|
"""Return the mimeType of a Drive file."""
|
|
creds = self._creds_for(user_email, DRIVE_WRITE_SCOPES)
|
|
service = build("drive", "v3", credentials=creds, cache_discovery=False)
|
|
return _get_drive_file_mime(service, file_id)
|
|
|
|
def download_drive_file_by_id(self, user_email: str, file_id: str) -> bytes:
|
|
"""Download raw bytes of a non-Google-native Drive file by ID."""
|
|
creds = self._creds_for(user_email, DRIVE_WRITE_SCOPES)
|
|
service = build("drive", "v3", credentials=creds, cache_discovery=False)
|
|
return _download_drive_file_by_id(service, file_id)
|
|
|
|
def update_drive_file(self, user_email: str, file_id: str, content: bytes, mime_type: str) -> None:
|
|
"""Replace Drive file content in-place. Requires drive (not drive.readonly) scope."""
|
|
creds = self._creds_for(user_email, DRIVE_WRITE_SCOPES)
|
|
service = build("drive", "v3", credentials=creds, cache_discovery=False)
|
|
_update_drive_file_content(service, file_id, content, mime_type)
|
|
|
|
|
|
# ── Persistence helpers ───────────────────────────────────────────────────────
|
|
|
|
def load_saved_key() -> Optional[dict]:
|
|
"""Load service account key from disk. Returns None if not found."""
|
|
if _SA_KEY_FILE.exists():
|
|
try:
|
|
return json.loads(_SA_KEY_FILE.read_text())
|
|
except Exception:
|
|
return None
|
|
return None
|
|
|
|
|
|
def save_key(key_dict: dict) -> None:
|
|
"""Persist service account key to disk (chmod 600)."""
|
|
_SA_KEY_FILE.write_text(json.dumps(key_dict, indent=2))
|
|
try:
|
|
_SA_KEY_FILE.chmod(0o600)
|
|
except Exception:
|
|
pass
|
|
|
|
|
|
def delete_key() -> None:
|
|
"""Remove persisted service account key."""
|
|
try:
|
|
if _SA_KEY_FILE.exists():
|
|
_SA_KEY_FILE.unlink()
|
|
except Exception:
|
|
pass
|
|
|
|
|
|
# ── Internal helpers ──────────────────────────────────────────────────────────
|
|
|
|
def _epoch_to_iso(epoch_secs: int) -> str:
|
|
from datetime import datetime, timezone
|
|
try:
|
|
return datetime.fromtimestamp(epoch_secs, tz=timezone.utc).isoformat()
|
|
except Exception:
|
|
return ""
|
|
|
|
|
|
def _extract_body(payload: dict) -> bytes:
|
|
"""Recursively extract plain-text (or HTML) body from a Gmail message payload."""
|
|
mime = payload.get("mimeType", "")
|
|
body_data = payload.get("body", {}).get("data", "")
|
|
|
|
if mime == "text/plain" and body_data:
|
|
return base64.urlsafe_b64decode(body_data)
|
|
if mime == "text/html" and body_data:
|
|
# Return raw HTML bytes — _scan_bytes handles HTML stripping
|
|
return base64.urlsafe_b64decode(body_data)
|
|
|
|
# Recurse into multipart
|
|
for part in payload.get("parts", []):
|
|
result = _extract_body(part)
|
|
if result:
|
|
return result
|
|
return b""
|
|
|
|
|
|
def _iter_parts(payload: dict):
|
|
"""Yield all leaf parts (for attachment scanning)."""
|
|
parts = payload.get("parts", [])
|
|
if not parts:
|
|
yield payload
|
|
else:
|
|
for part in parts:
|
|
yield from _iter_parts(part)
|
|
|
|
|
|
# ── Shared iteration helpers (used by both GoogleConnector and PersonalGoogleConnector) ──
|
|
|
|
def _gmail_iter(
|
|
service,
|
|
user_email: str,
|
|
max_messages: int,
|
|
scan_body: bool,
|
|
scan_attachments: bool,
|
|
max_attach_mb: float,
|
|
) -> Iterator[tuple[dict, bytes]]:
|
|
"""Paginate Gmail messages and yield (metadata, bytes) tuples."""
|
|
ids: list[str] = []
|
|
page_token = None
|
|
while len(ids) < max_messages:
|
|
params: dict = {"userId": "me", "maxResults": min(500, max_messages - len(ids))}
|
|
if page_token:
|
|
params["pageToken"] = page_token
|
|
try:
|
|
resp = service.users().messages().list(**params).execute()
|
|
except HttpError as e:
|
|
raise GoogleError(f"Gmail list error for {user_email}: {e}") from e
|
|
ids.extend(m["id"] for m in resp.get("messages", []))
|
|
page_token = resp.get("nextPageToken")
|
|
if not page_token:
|
|
break
|
|
|
|
max_attach_bytes = int(max_attach_mb * 1024 * 1024)
|
|
|
|
for msg_id in ids:
|
|
try:
|
|
msg = service.users().messages().get(
|
|
userId="me", id=msg_id, format="full"
|
|
).execute()
|
|
except HttpError:
|
|
continue
|
|
|
|
headers = {h["name"].lower(): h["value"] for h in msg.get("payload", {}).get("headers", [])}
|
|
meta = {
|
|
"id": f"gmail:{msg_id}",
|
|
"name": headers.get("subject", "(no subject)"),
|
|
"_source": "gmail",
|
|
"_source_type": "gmail",
|
|
"_account": user_email,
|
|
"_account_id": user_email,
|
|
"_url": f"https://mail.google.com/mail/u/0/#inbox/{msg_id}",
|
|
"receivedDateTime": _epoch_to_iso(int(msg.get("internalDate", 0)) // 1000),
|
|
"size": msg.get("sizeEstimate", 0),
|
|
}
|
|
|
|
payload = msg.get("payload", {})
|
|
|
|
if scan_body:
|
|
body_bytes = _extract_body(payload)
|
|
if body_bytes:
|
|
yield (meta, body_bytes)
|
|
|
|
if scan_attachments:
|
|
for part in _iter_parts(payload):
|
|
filename = part.get("filename", "")
|
|
body = part.get("body", {})
|
|
att_id = body.get("attachmentId")
|
|
size = body.get("size", 0)
|
|
if not att_id or not filename:
|
|
continue
|
|
if size > max_attach_bytes:
|
|
continue
|
|
try:
|
|
att = service.users().messages().attachments().get(
|
|
userId="me", messageId=msg_id, id=att_id
|
|
).execute()
|
|
data = base64.urlsafe_b64decode(att.get("data", ""))
|
|
except HttpError:
|
|
continue
|
|
att_meta = {
|
|
**meta,
|
|
"id": f"gmail:{msg_id}:{att_id}",
|
|
"name": filename,
|
|
"size": len(data),
|
|
}
|
|
yield (att_meta, data)
|
|
|
|
|
|
def _download_drive_file(
|
|
service,
|
|
f: dict,
|
|
user_email: str,
|
|
max_bytes: int,
|
|
) -> "tuple[dict, bytes] | None":
|
|
"""Download one Drive file entry. Returns (meta, data) or None if skipped."""
|
|
mime = f.get("mimeType", "")
|
|
fid = f.get("id", "")
|
|
fname = f.get("name", "")
|
|
size = int(f.get("size", 0) or 0)
|
|
|
|
meta = {
|
|
"id": f"gdrive:{fid}",
|
|
"name": fname,
|
|
"_source": "gdrive",
|
|
"_source_type": "gdrive",
|
|
"_account": user_email,
|
|
"_account_id": user_email,
|
|
"_url": f.get("webViewLink", ""),
|
|
"lastModifiedDateTime": f.get("modifiedTime", "")[:10],
|
|
"size": size,
|
|
}
|
|
|
|
if mime in _EXPORT_MAP:
|
|
export_mime, ext = _EXPORT_MAP[mime]
|
|
try:
|
|
req = service.files().export_media(fileId=fid, mimeType=export_mime)
|
|
buf = io.BytesIO()
|
|
dl = MediaIoBaseDownload(buf, req, chunksize=4 * 1024 * 1024)
|
|
done = False
|
|
total = 0
|
|
while not done:
|
|
_, done = dl.next_chunk()
|
|
total = buf.tell()
|
|
if total > _MAX_EXPORT_BYTES:
|
|
break
|
|
if total > _MAX_EXPORT_BYTES:
|
|
return None
|
|
meta["name"] = fname + ext
|
|
meta["size"] = total
|
|
data = buf.getvalue()
|
|
del buf
|
|
return (meta, data)
|
|
except HttpError as e:
|
|
if "exportSizeLimitExceeded" in str(e):
|
|
print(
|
|
f"[gdrive] skip '{fname}' — file too large for Google export API"
|
|
f" (exportSizeLimitExceeded); fid={fid}",
|
|
flush=True,
|
|
)
|
|
return None
|
|
else:
|
|
if mime.startswith("application/vnd.google-apps."):
|
|
return None
|
|
if size == 0 or size > max_bytes:
|
|
return None
|
|
try:
|
|
req = service.files().get_media(fileId=fid)
|
|
buf = io.BytesIO()
|
|
dl = MediaIoBaseDownload(buf, req, chunksize=4 * 1024 * 1024)
|
|
done = False
|
|
while not done:
|
|
_, done = dl.next_chunk()
|
|
data = buf.getvalue()
|
|
del buf
|
|
return (meta, data)
|
|
except HttpError:
|
|
return None
|
|
|
|
|
|
def _get_drive_file_mime(service, file_id: str) -> str:
|
|
"""Return the mimeType of a Drive file."""
|
|
info = service.files().get(fileId=file_id, fields="mimeType").execute()
|
|
return info.get("mimeType", "")
|
|
|
|
|
|
def _download_drive_file_by_id(service, file_id: str) -> bytes:
|
|
"""Download raw bytes of a non-Google-native Drive file by ID."""
|
|
req = service.files().get_media(fileId=file_id)
|
|
buf = io.BytesIO()
|
|
dl = MediaIoBaseDownload(buf, req, chunksize=4 * 1024 * 1024)
|
|
done = False
|
|
while not done:
|
|
_, done = dl.next_chunk()
|
|
return buf.getvalue()
|
|
|
|
|
|
def _update_drive_file_content(service, file_id: str, content: bytes, mime_type: str) -> None:
|
|
"""Replace a Drive file's content in-place."""
|
|
from googleapiclient.http import MediaInMemoryUpload
|
|
media = MediaInMemoryUpload(content, mimetype=mime_type, resumable=False)
|
|
service.files().update(fileId=file_id, media_body=media).execute()
|
|
|
|
|
|
def _drive_iter(
|
|
service,
|
|
user_email: str,
|
|
max_files: int,
|
|
max_file_mb: float,
|
|
) -> Iterator[tuple[dict, bytes]]:
|
|
"""Paginate Drive files and yield (metadata, bytes) tuples."""
|
|
max_bytes = int(max_file_mb * 1024 * 1024)
|
|
fields = "nextPageToken,files(id,name,mimeType,size,webViewLink,modifiedTime,owners,parents)"
|
|
page_token = None
|
|
fetched = 0
|
|
|
|
while fetched < max_files:
|
|
params: dict = {
|
|
"pageSize": min(1000, max_files - fetched),
|
|
"fields": fields,
|
|
"q": "trashed = false",
|
|
}
|
|
if page_token:
|
|
params["pageToken"] = page_token
|
|
try:
|
|
resp = service.files().list(**params).execute()
|
|
except HttpError as e:
|
|
raise GoogleError(f"Drive list error for {user_email}: {e}") from e
|
|
|
|
for f in resp.get("files", []):
|
|
fetched += 1
|
|
result = _download_drive_file(service, f, user_email, max_bytes)
|
|
if result:
|
|
yield result
|
|
|
|
page_token = resp.get("nextPageToken")
|
|
if not page_token:
|
|
break
|
|
|
|
|
|
def _drive_get_start_page_token(service) -> str:
|
|
"""Return the current Changes API start page token for this Drive."""
|
|
resp = service.changes().getStartPageToken().execute()
|
|
return resp["startPageToken"]
|
|
|
|
|
|
def _drive_changes_collect(
|
|
service,
|
|
user_email: str,
|
|
page_token: str,
|
|
max_files: int,
|
|
max_file_mb: float,
|
|
) -> "tuple[list[tuple[dict, bytes]], str]":
|
|
"""
|
|
Collect Drive changes since page_token using the Changes API.
|
|
Returns (list_of_(meta, data)_tuples, new_start_page_token).
|
|
Skips removed/trashed files.
|
|
Raises GoogleError on API failure so the caller can fall back to a full scan.
|
|
"""
|
|
max_bytes = int(max_file_mb * 1024 * 1024)
|
|
fields = (
|
|
"nextPageToken,newStartPageToken,"
|
|
"changes(removed,file(id,name,mimeType,size,webViewLink,modifiedTime,owners,parents))"
|
|
)
|
|
results: list = []
|
|
new_token = page_token
|
|
fetched = 0
|
|
|
|
while fetched < max_files:
|
|
params: dict = {
|
|
"pageToken": page_token,
|
|
"spaces": "drive",
|
|
"fields": fields,
|
|
"includeRemoved": True,
|
|
"pageSize": min(1000, max_files - fetched),
|
|
}
|
|
try:
|
|
resp = service.changes().list(**params).execute()
|
|
except HttpError as e:
|
|
raise GoogleError(f"Drive changes error for {user_email}: {e}") from e
|
|
|
|
for change in resp.get("changes", []):
|
|
if change.get("removed"):
|
|
continue
|
|
f = change.get("file")
|
|
if not f:
|
|
continue
|
|
fetched += 1
|
|
result = _download_drive_file(service, f, user_email, max_bytes)
|
|
if result:
|
|
results.append(result)
|
|
|
|
if "newStartPageToken" in resp:
|
|
new_token = resp["newStartPageToken"]
|
|
break
|
|
page_token = resp.get("nextPageToken")
|
|
if not page_token:
|
|
break
|
|
|
|
return results, new_token
|
|
|
|
|
|
# ── Personal Google account (OAuth device-code) connector ────────────────────
|
|
|
|
class PersonalGoogleConnector:
|
|
"""
|
|
OAuth 2.0 device-code connector for personal Google accounts.
|
|
|
|
Provides the same public interface as GoogleConnector so the scan engine
|
|
can use either transparently via state.google_connector.
|
|
|
|
Authentication:
|
|
GCP project with an OAuth 2.0 Desktop App credential.
|
|
Required scopes: gmail.readonly, drive.readonly.
|
|
"""
|
|
|
|
def __init__(self, token_data: dict):
|
|
"""
|
|
Construct from a stored token dict with keys:
|
|
access_token, refresh_token, client_id, client_secret, token_uri, scopes
|
|
"""
|
|
if not GOOGLE_AUTH_OK:
|
|
raise GoogleError(
|
|
"google-auth not installed — run: "
|
|
"pip install google-auth google-auth-httplib2 google-api-python-client"
|
|
)
|
|
self._token_data = token_data
|
|
self._creds = self._build_creds()
|
|
|
|
def _build_creds(self):
|
|
from google.oauth2.credentials import Credentials
|
|
return Credentials(
|
|
token=self._token_data.get("access_token"),
|
|
refresh_token=self._token_data.get("refresh_token"),
|
|
token_uri=self._token_data.get("token_uri", _TOKEN_URL),
|
|
client_id=self._token_data.get("client_id"),
|
|
client_secret=self._token_data.get("client_secret"),
|
|
scopes=self._token_data.get("scopes", PERSONAL_SCOPES),
|
|
)
|
|
|
|
def _refresh_if_needed(self) -> None:
|
|
from google.auth.transport.requests import Request
|
|
if not self._creds.valid:
|
|
if self._creds.expired and self._creds.refresh_token:
|
|
self._creds.refresh(Request())
|
|
updated = dict(self._token_data)
|
|
updated["access_token"] = self._creds.token
|
|
save_personal_token(updated)
|
|
self._token_data = updated
|
|
|
|
def is_authenticated(self) -> bool:
|
|
try:
|
|
self._refresh_if_needed()
|
|
return bool(self._creds.token)
|
|
except Exception:
|
|
return False
|
|
|
|
def get_user_info(self) -> dict:
|
|
"""Return {id, email, displayName} for the authenticated user."""
|
|
if not REQUESTS_OK:
|
|
raise GoogleError("requests library required")
|
|
self._refresh_if_needed()
|
|
resp = _requests.get(
|
|
_USERINFO_URL,
|
|
headers={"Authorization": f"Bearer {self._creds.token}"},
|
|
timeout=10,
|
|
)
|
|
resp.raise_for_status()
|
|
data = resp.json()
|
|
return {
|
|
"id": data.get("id", ""),
|
|
"email": data.get("email", ""),
|
|
"displayName": data.get("name", ""),
|
|
}
|
|
|
|
def list_users(self, domain: str = "") -> list[dict]:
|
|
"""Return a single-item list for the signed-in user (no admin access needed)."""
|
|
info = self.get_user_info()
|
|
return [{
|
|
"id": info["email"],
|
|
"email": info["email"],
|
|
"displayName": info["displayName"],
|
|
"orgUnitPath": "",
|
|
"userRole": "other",
|
|
}]
|
|
|
|
def iter_gmail_messages(
|
|
self,
|
|
user_email: str,
|
|
max_messages: int = 2000,
|
|
scan_body: bool = True,
|
|
scan_attachments: bool = True,
|
|
max_attach_mb: float = 20.0,
|
|
) -> Iterator[tuple[dict, bytes]]:
|
|
"""Yield (metadata, bytes) for each Gmail message / attachment."""
|
|
self._refresh_if_needed()
|
|
try:
|
|
service = build("gmail", "v1", credentials=self._creds, cache_discovery=False)
|
|
except HttpError as e:
|
|
raise GoogleError(f"Gmail auth failed: {e}") from e
|
|
yield from _gmail_iter(service, user_email, max_messages, scan_body, scan_attachments, max_attach_mb)
|
|
|
|
def iter_drive_files(
|
|
self,
|
|
user_email: str,
|
|
max_files: int = 5000,
|
|
max_file_mb: float = 50.0,
|
|
) -> Iterator[tuple[dict, bytes]]:
|
|
"""Yield (metadata, bytes) for each Drive file."""
|
|
self._refresh_if_needed()
|
|
try:
|
|
service = build("drive", "v3", credentials=self._creds, cache_discovery=False)
|
|
except HttpError as e:
|
|
raise GoogleError(f"Drive auth failed: {e}") from e
|
|
yield from _drive_iter(service, user_email, max_files, max_file_mb)
|
|
|
|
def get_drive_start_token(self, user_email: str) -> str:
|
|
"""Return the current Changes API start page token for this Drive."""
|
|
self._refresh_if_needed()
|
|
try:
|
|
service = build("drive", "v3", credentials=self._creds, cache_discovery=False)
|
|
except HttpError as e:
|
|
raise GoogleError(f"Drive auth failed: {e}") from e
|
|
return _drive_get_start_page_token(service)
|
|
|
|
def get_drive_changes(
|
|
self,
|
|
user_email: str,
|
|
page_token: str,
|
|
max_files: int = 5000,
|
|
max_file_mb: float = 50.0,
|
|
) -> "tuple[list[tuple[dict, bytes]], str]":
|
|
"""Return (changed_files, new_page_token) since page_token."""
|
|
self._refresh_if_needed()
|
|
try:
|
|
service = build("drive", "v3", credentials=self._creds, cache_discovery=False)
|
|
except HttpError as e:
|
|
raise GoogleError(f"Drive auth failed: {e}") from e
|
|
return _drive_changes_collect(service, user_email, page_token, max_files, max_file_mb)
|
|
|
|
# ── Drive write-back (redaction) ──────────────────────────────────────────
|
|
|
|
def get_drive_file_mime(self, user_email: str, file_id: str) -> str:
|
|
"""Return the mimeType of a Drive file."""
|
|
self._refresh_if_needed()
|
|
service = build("drive", "v3", credentials=self._creds, cache_discovery=False)
|
|
return _get_drive_file_mime(service, file_id)
|
|
|
|
def download_drive_file_by_id(self, user_email: str, file_id: str) -> bytes:
|
|
"""Download raw bytes of a non-Google-native Drive file by ID."""
|
|
self._refresh_if_needed()
|
|
service = build("drive", "v3", credentials=self._creds, cache_discovery=False)
|
|
return _download_drive_file_by_id(service, file_id)
|
|
|
|
def update_drive_file(self, user_email: str, file_id: str, content: bytes, mime_type: str) -> None:
|
|
"""Replace Drive file content in-place. Requires drive (not drive.readonly) scope."""
|
|
self._refresh_if_needed()
|
|
service = build("drive", "v3", credentials=self._creds, cache_discovery=False)
|
|
_update_drive_file_content(service, file_id, content, mime_type)
|
|
|
|
@staticmethod
|
|
def get_device_code_flow(client_id: str, client_secret: str) -> dict:
|
|
"""
|
|
Initiate a Google device-code flow.
|
|
Returns a flow dict containing user_code, verification_url, device_code, etc.
|
|
"""
|
|
if not REQUESTS_OK:
|
|
raise GoogleError("requests library required — run: pip install requests")
|
|
resp = _requests.post(_DEVICE_AUTH_URL, data={
|
|
"client_id": client_id,
|
|
"scope": " ".join(PERSONAL_SCOPES),
|
|
}, timeout=10)
|
|
data = resp.json()
|
|
if "device_code" not in data:
|
|
raise GoogleError(
|
|
f"Failed to start device flow: {data.get('error_description', data)}"
|
|
)
|
|
return {
|
|
"device_code": data["device_code"],
|
|
"user_code": data["user_code"],
|
|
"verification_url": data.get("verification_url", "https://www.google.com/device"),
|
|
"expires_in": data.get("expires_in", 1800),
|
|
"interval": data.get("interval", 5),
|
|
"client_id": client_id,
|
|
"client_secret": client_secret,
|
|
}
|
|
|
|
@staticmethod
|
|
def complete_device_code_flow(flow: dict) -> "PersonalGoogleConnector":
|
|
"""
|
|
Poll until the user completes sign-in at verification_url.
|
|
Blocks the calling thread. Returns a ready PersonalGoogleConnector.
|
|
"""
|
|
if not REQUESTS_OK:
|
|
raise GoogleError("requests library required — run: pip install requests")
|
|
client_id = flow["client_id"]
|
|
client_secret = flow["client_secret"]
|
|
device_code = flow["device_code"]
|
|
interval = flow.get("interval", 5)
|
|
expires_in = flow.get("expires_in", 1800)
|
|
deadline = time.time() + expires_in
|
|
|
|
while time.time() < deadline:
|
|
time.sleep(interval)
|
|
resp = _requests.post(_TOKEN_URL, data={
|
|
"client_id": client_id,
|
|
"client_secret": client_secret,
|
|
"device_code": device_code,
|
|
"grant_type": _DEVICE_GRANT,
|
|
}, timeout=10)
|
|
data = resp.json()
|
|
if "access_token" in data:
|
|
token_data = {
|
|
"access_token": data["access_token"],
|
|
"refresh_token": data.get("refresh_token", ""),
|
|
"client_id": client_id,
|
|
"client_secret": client_secret,
|
|
"token_uri": _TOKEN_URL,
|
|
"scopes": PERSONAL_SCOPES,
|
|
}
|
|
save_personal_token(token_data)
|
|
return PersonalGoogleConnector(token_data)
|
|
err = data.get("error", "")
|
|
if err == "authorization_pending":
|
|
continue
|
|
if err == "slow_down":
|
|
interval = max(interval + 5, 5)
|
|
continue
|
|
raise GoogleError(
|
|
f"Device flow error: {data.get('error_description', err)}"
|
|
)
|
|
|
|
raise GoogleError("Device code flow timed out")
|
|
|
|
|
|
# ── Personal token persistence ────────────────────────────────────────────────
|
|
|
|
def save_personal_token(data: dict) -> None:
|
|
"""Persist OAuth token to disk (chmod 600)."""
|
|
_GOOGLE_TOKEN_FILE.write_text(json.dumps(data, indent=2))
|
|
try:
|
|
_GOOGLE_TOKEN_FILE.chmod(0o600)
|
|
except Exception:
|
|
pass
|
|
|
|
|
|
def load_personal_token() -> Optional[dict]:
|
|
"""Load OAuth token from disk. Returns None if not found."""
|
|
if _GOOGLE_TOKEN_FILE.exists():
|
|
try:
|
|
return json.loads(_GOOGLE_TOKEN_FILE.read_text())
|
|
except Exception:
|
|
return None
|
|
return None
|
|
|
|
|
|
def delete_personal_token() -> None:
|
|
"""Remove persisted OAuth token."""
|
|
try:
|
|
if _GOOGLE_TOKEN_FILE.exists():
|
|
_GOOGLE_TOKEN_FILE.unlink()
|
|
except Exception:
|
|
pass
|