refactor(deploy): rewrite left4me-script-sandbox to systemd-only — drop bwrap
Replaces the systemd-run --scope + bwrap composition with systemd-run in
service-unit mode (--pipe --wait, transient .service unit). Same cgroup
limits and walltime kill, plus the hardening directives that --scope
units cannot carry: NoNewPrivileges, ProtectSystem=strict, ProtectHome,
ProtectKernel{Tunables,Modules,Logs,ControlGroups}, RestrictNamespaces,
RestrictAddressFamilies, RestrictSUIDSGID, LockPersonality,
MemoryDenyWriteExecute, SystemCallFilter (seccomp), and an empty
CapabilityBoundingSet (drops all caps). UID drop via User=/Group=.
The TemporaryFileSystem="/etc /var/lib" pair is the gotcha:
ProtectSystem=strict makes /var/lib *read-only* but still visible, so the
host DB at /var/lib/left4me/left4me.db (mode 0644) was readable from inside
the sandbox. Masking /var/lib with a tmpfs hides the entire subtree; the
BindPaths mount is unaffected because its target, /overlay, lies outside
the masked tree.
The Python side (ScriptBuilder, run_sandboxed_script, routes) is
unchanged — same sudo-helper invocation, same argv shape.
Trade-off: this loses the PID-namespace isolation that bwrap's --unshare-pid
provided (no PrivatePID= directive in systemd). Host PIDs are visible via
/proc and ps -ef but cannot be signalled because of the UID mismatch — an
information disclosure only, not a privilege-boundary breach.
Smoke-tested on ckn@10.0.4.128 prior to this commit; all isolation
invariants were reproduced, and the hardening directives were confirmed to
block unshare(2), mount(2), personality(2), bpf(2), and sysctl writes.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
efaaf84cd9
commit
4ee8f6af44
2 changed files with 88 additions and 41 deletions
|
|
@@ -7,12 +7,16 @@
|
||||||
# <script_path> absolute path to a bash file already written by the web app;
|
# <script_path> absolute path to a bash file already written by the web app;
|
||||||
# bind-mounted read-only at /script.sh inside the sandbox.
|
# bind-mounted read-only at /script.sh inside the sandbox.
|
||||||
#
|
#
|
||||||
# The script runs under bubblewrap inside a transient systemd scope so we get
|
# The script runs as a transient systemd .service with the full hardening
|
||||||
# cgroup-v2 limits (memory / tasks / cpu) and a wallclock kill via
|
# surface: cgroup limits + walltime kill, NoNewPrivileges, ProtectSystem,
|
||||||
# RuntimeMaxSec. The sandbox drops to the unprivileged l4d2-sandbox UID;
|
# ProtectHome, kernel-tunable / -module / -log protection, namespace
|
||||||
# host filesystems are exposed read-only except /overlay (rw) and tmpfs
|
# restriction, address-family restriction, capability bounding (empty),
|
||||||
# /tmp + /run. Network namespace is *not* unshared — scripts must reach the
|
# seccomp filter (@system-service @network-io), MemoryDenyWriteExecute,
|
||||||
# public internet to download workshop / l4d2center / cedapug content.
|
# LockPersonality, RestrictSUIDSGID. Network namespace is *not* restricted —
|
||||||
|
# scripts must reach the public internet to download workshop / l4d2center
|
||||||
|
# / cedapug content. PID namespace is shared with the host (no
|
||||||
|
# PrivatePID= directive in systemd); host PIDs are visible via /proc but
|
||||||
|
# not signal-able due to UID mismatch.
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
[[ $# -eq 2 ]] || { echo "usage: $0 <overlay_id> <script>" >&2; exit 64; }
|
[[ $# -eq 2 ]] || { echo "usage: $0 <overlay_id> <script>" >&2; exit 64; }
|
||||||
|
|
@@ -33,33 +37,31 @@ fi
|
||||||
# Make sure the sandbox UID owns the overlay dir so the script can write there.
|
# Make sure the sandbox UID owns the overlay dir so the script can write there.
|
||||||
# Idempotent: a no-op when the dir is already l4d2-sandbox-owned (re-run case),
|
# Idempotent: a no-op when the dir is already l4d2-sandbox-owned (re-run case),
|
||||||
# and corrects the ownership the first time the dir was created by the web app
|
# and corrects the ownership the first time the dir was created by the web app
|
||||||
# under the left4me UID. Group-readable so the gameserver process (left4me)
|
# under the left4me UID. World-readable so the gameserver process (left4me)
|
||||||
# can read the overlay contents via the kernel-overlayfs lowerdir at runtime.
|
# can read the overlay contents via the kernel-overlayfs lowerdir at runtime.
|
||||||
chown -R l4d2-sandbox:l4d2-sandbox "$OVERLAY_DIR"
|
chown -R l4d2-sandbox:l4d2-sandbox "$OVERLAY_DIR"
|
||||||
chmod 0755 "$OVERLAY_DIR"
|
chmod 0755 "$OVERLAY_DIR"
|
||||||
|
|
||||||
# UID/GID drop happens via systemd-run --uid/--gid before bwrap is invoked.
|
exec systemd-run --quiet --collect --wait --pipe \
|
||||||
# bwrap then runs unprivileged as l4d2-sandbox; --unshare-user-try gives it
|
--unit="left4me-script-${OVERLAY_ID}-$$" \
|
||||||
# the user-namespace context it needs for bind-mounts as a regular user.
|
-p User=l4d2-sandbox -p Group=l4d2-sandbox \
|
||||||
exec systemd-run --quiet --scope --collect \
|
-p NoNewPrivileges=yes \
|
||||||
--uid=l4d2-sandbox --gid=l4d2-sandbox \
|
-p ProtectSystem=strict -p ProtectHome=yes \
|
||||||
|
-p PrivateTmp=yes -p PrivateDevices=yes -p PrivateIPC=yes \
|
||||||
|
-p ProtectKernelTunables=yes -p ProtectKernelModules=yes \
|
||||||
|
-p ProtectKernelLogs=yes -p ProtectControlGroups=yes \
|
||||||
|
-p RestrictNamespaces=yes \
|
||||||
|
-p RestrictAddressFamilies="AF_INET AF_INET6 AF_UNIX" \
|
||||||
|
-p RestrictSUIDSGID=yes -p LockPersonality=yes \
|
||||||
|
-p MemoryDenyWriteExecute=yes \
|
||||||
|
-p SystemCallFilter="@system-service @network-io" \
|
||||||
|
-p SystemCallArchitectures=native \
|
||||||
|
-p CapabilityBoundingSet= -p AmbientCapabilities= \
|
||||||
|
-p TemporaryFileSystem="/etc /var/lib" \
|
||||||
|
-p BindReadOnlyPaths="/etc/resolv.conf /etc/ssl /etc/ca-certificates /etc/nsswitch.conf /etc/alternatives ${SCRIPT}:/script.sh" \
|
||||||
|
-p BindPaths="${OVERLAY_DIR}:/overlay" \
|
||||||
|
-p WorkingDirectory=/overlay \
|
||||||
|
-p Environment="HOME=/tmp PATH=/usr/bin:/usr/sbin OVERLAY=/overlay" \
|
||||||
-p MemoryMax=4G -p MemorySwapMax=0 -p TasksMax=512 \
|
-p MemoryMax=4G -p MemorySwapMax=0 -p TasksMax=512 \
|
||||||
-p CPUQuota=200% -p RuntimeMaxSec=3600 \
|
-p CPUQuota=200% -p RuntimeMaxSec=3600 \
|
||||||
-- bwrap \
|
-- /bin/bash /script.sh
|
||||||
--die-with-parent --new-session \
|
|
||||||
--unshare-user-try \
|
|
||||||
--unshare-pid --unshare-ipc --unshare-uts --unshare-cgroup \
|
|
||||||
--proc /proc --dev /dev --tmpfs /tmp --tmpfs /run \
|
|
||||||
--ro-bind /usr /usr --ro-bind /lib /lib --ro-bind /lib64 /lib64 \
|
|
||||||
--symlink usr/bin /bin --symlink usr/sbin /sbin \
|
|
||||||
--ro-bind /etc/resolv.conf /etc/resolv.conf \
|
|
||||||
--ro-bind /etc/ssl /etc/ssl \
|
|
||||||
--ro-bind /etc/ca-certificates /etc/ca-certificates \
|
|
||||||
--ro-bind /etc/nsswitch.conf /etc/nsswitch.conf \
|
|
||||||
--ro-bind /etc/alternatives /etc/alternatives \
|
|
||||||
--bind "$OVERLAY_DIR" /overlay \
|
|
||||||
--chdir /overlay \
|
|
||||||
--setenv HOME /tmp --setenv PATH /usr/bin:/usr/sbin \
|
|
||||||
--setenv OVERLAY /overlay \
|
|
||||||
--ro-bind "$SCRIPT" /script.sh \
|
|
||||||
/bin/bash /script.sh
|
|
||||||
|
|
|
||||||
|
|
@@ -321,22 +321,67 @@ def test_script_sandbox_helper_passes_shell_syntax_check():
|
||||||
subprocess.run(["bash", "-n", str(SCRIPT_SANDBOX_HELPER)], check=True)
|
subprocess.run(["bash", "-n", str(SCRIPT_SANDBOX_HELPER)], check=True)
|
||||||
|
|
||||||
|
|
||||||
def test_script_sandbox_helper_invokes_systemd_run_and_bwrap():
|
def test_script_sandbox_helper_invokes_systemd_run_with_hardening():
|
||||||
text = SCRIPT_SANDBOX_HELPER.read_text()
|
text = SCRIPT_SANDBOX_HELPER.read_text()
|
||||||
|
|
||||||
|
# systemd-run service mode (no --scope), with synchronous I/O to caller.
|
||||||
assert "systemd-run" in text
|
assert "systemd-run" in text
|
||||||
assert "--scope" in text
|
assert "--scope" not in text, "v2 uses transient service units, not scopes"
|
||||||
|
assert "--pipe" in text
|
||||||
|
assert "--wait" in text
|
||||||
assert "--collect" in text
|
assert "--collect" in text
|
||||||
|
assert "--unit=" in text
|
||||||
|
|
||||||
|
# No bwrap.
|
||||||
|
assert "bwrap" not in text
|
||||||
|
assert "bubblewrap" not in text
|
||||||
|
|
||||||
|
# UID drop via systemd directives.
|
||||||
|
assert "User=l4d2-sandbox" in text
|
||||||
|
assert "Group=l4d2-sandbox" in text
|
||||||
|
|
||||||
|
# Cgroup limits unchanged from v1.
|
||||||
assert "MemoryMax=4G" in text
|
assert "MemoryMax=4G" in text
|
||||||
assert "RuntimeMaxSec=3600" in text
|
assert "MemorySwapMax=0" in text
|
||||||
assert "TasksMax=512" in text
|
assert "TasksMax=512" in text
|
||||||
assert "bwrap" in text
|
assert "CPUQuota=200%" in text
|
||||||
assert "--unshare-pid" in text
|
assert "RuntimeMaxSec=3600" in text
|
||||||
assert "--unshare-net" not in text, "scripts must keep host network access"
|
|
||||||
# UID drop happens at systemd-run, not inside bwrap (modern bwrap requires
|
# Hardening directives that v1 (scope mode) couldn't carry.
|
||||||
# --unshare-user for --uid; doing the drop earlier keeps file ownership
|
assert "NoNewPrivileges=yes" in text
|
||||||
# straight on the host bind-mount).
|
assert "ProtectSystem=strict" in text
|
||||||
assert "--uid=l4d2-sandbox" in text
|
assert "ProtectHome=yes" in text
|
||||||
assert "--gid=l4d2-sandbox" in text
|
assert "PrivateTmp=yes" in text
|
||||||
|
assert "PrivateDevices=yes" in text
|
||||||
|
assert "PrivateIPC=yes" in text
|
||||||
|
assert "ProtectKernelTunables=yes" in text
|
||||||
|
assert "ProtectKernelModules=yes" in text
|
||||||
|
assert "ProtectKernelLogs=yes" in text
|
||||||
|
assert "ProtectControlGroups=yes" in text
|
||||||
|
assert "RestrictNamespaces=yes" in text
|
||||||
|
assert "RestrictSUIDSGID=yes" in text
|
||||||
|
assert "LockPersonality=yes" in text
|
||||||
|
assert "MemoryDenyWriteExecute=yes" in text
|
||||||
|
assert "SystemCallFilter=" in text
|
||||||
|
assert "@system-service" in text
|
||||||
|
assert "@network-io" in text
|
||||||
|
assert "CapabilityBoundingSet=" in text
|
||||||
|
assert "AmbientCapabilities=" in text
|
||||||
|
assert 'RestrictAddressFamilies="AF_INET AF_INET6 AF_UNIX"' in text
|
||||||
|
|
||||||
|
# Network namespace stays shared with host.
|
||||||
|
assert "PrivateNetwork=" not in text
|
||||||
|
|
||||||
|
# Mount setup: /etc and /var/lib masked with tmpfs; selective binds back.
|
||||||
|
assert 'TemporaryFileSystem="/etc /var/lib"' in text
|
||||||
|
assert "BindReadOnlyPaths=" in text
|
||||||
|
assert "/etc/resolv.conf" in text
|
||||||
|
assert "/etc/ssl" in text
|
||||||
|
assert "/etc/ca-certificates" in text
|
||||||
|
assert "/etc/nsswitch.conf" in text
|
||||||
|
assert "/etc/alternatives" in text
|
||||||
|
assert "${SCRIPT}:/script.sh" in text
|
||||||
|
assert 'BindPaths="${OVERLAY_DIR}:/overlay"' in text
|
||||||
|
|
||||||
|
|
||||||
def test_script_sandbox_helper_validates_overlay_id():
|
def test_script_sandbox_helper_validates_overlay_id():
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue