diff --git a/deploy/files/usr/local/lib/systemd/system/left4me-server@.service b/deploy/files/usr/local/lib/systemd/system/left4me-server@.service
index 80158b8..9f64825 100644
--- a/deploy/files/usr/local/lib/systemd/system/left4me-server@.service
+++ b/deploy/files/usr/local/lib/systemd/system/left4me-server@.service
@@ -1,10 +1,21 @@
+# left4me gameserver — system unit, one instance per gameserver.
+#
+# This is the REFERENCE COPY of the deployed unit. The live source is
+# the systemd/units reactor at ~/Projekte/ckn-bw/bundles/left4me/metadata.py
+# (look for 'left4me-server@.service'). Hardening directives live in
+# the HARDENING_SERVER constant near the top of the same file.
+# This file is hand-synced; edit both together.
+#
+# Threat model: docs/superpowers/specs/2026-05-15-hardening-threat-model.md
+# Defenses survey: docs/superpowers/specs/2026-05-15-hardening-defenses-survey.md
+# Test plan + results: docs/superpowers/specs/2026-05-15-hardening-test-plan.md
+
 [Unit]
 Description=left4me server instance %i
 After=network-online.target
 Wants=network-online.target
 # Bound the restart loop. Without these, a persistent ExecStartPre or
-# ExecStart failure spins indefinitely. Note: these are [Unit]-section
-# directives (systemd 230+), not [Service].
+# ExecStart failure spins indefinitely.
 StartLimitBurst=5
 StartLimitIntervalSec=60s
 
@@ -14,49 +25,25 @@ User=left4me
 Group=left4me
 EnvironmentFile=/etc/left4me/host.env
 EnvironmentFile=/var/lib/left4me/instances/%i/instance.env
-# `-` prefix: chdir failure is non-fatal. systemd applies WorkingDirectory
-# before every Exec line — including ExecStartPre — but the merged dir only
-# exists once ExecStartPre's overlay mount succeeds. With `-`, ExecStartPre
-# runs in the unit's home (cwd doesn't matter for the mount helper); the
-# ExecStart re-applies WorkingDirectory after the mount and finds the dir.
+# `-` prefix: chdir failure is non-fatal. The merged dir only exists
+# once ExecStartPre's overlay mount succeeds.
 WorkingDirectory=-/var/lib/left4me/runtime/%i/merged/left4dead2
-# Single source of truth for the kernel-overlayfs mount lifecycle: the web
-# app's start_instance only stages cfg files and asks systemd to enable+
-# start this unit; the actual `mount -t overlay` lives here so reboot
-# auto-start works the same as a UI-driven start. ExecStopPost mirrors it
-# so the unmount lives in the same place — no Python-side _mounter needed
-# in stop/delete/reset paths. Both helper verbs are idempotent.
-#
-# `+` prefix runs the helper as PID 1 (root, no sandbox). Required because
-# the unit has NoNewPrivileges=true, which blocks sudo's setuid escalation
-# — and the helper itself needs root for the mount/umount syscalls.
-#
-# `nsenter --mount=/proc/1/ns/mnt --` runs the helper Python interpreter
-# in PID 1's mount namespace. Without this, the `+` prefix removes the
-# sandbox/credentials but does NOT detach from the unit's per-service
-# mount namespace (created by PrivateTmp/Protect*) — so the helper
-# process itself would hold a reference to that namespace, keeping the
-# slave-mount tree alive after the cgroup empties, and umount in PID 1
-# would return EBUSY for as long as the helper ran. Putting nsenter at
-# the unit-level (as opposed to inside the helper, where only the
-# umount syscall escaped) is what actually frees the namespace. Once
-# the helper is in PID 1's namespace, ExecStopPost's umount succeeds
-# on the first try with no retry/race window. ExecStopPost (not
-# ExecStop) so unmount runs after the cgroup is cleared; ExecStop runs
-# while srcds is still alive and would EBUSY.
+# `+` prefix runs the helper as PID 1 (root, all caps, host
+# namespaces) — required because the unit has NoNewPrivileges=true
+# AND PrivateUsers=true; both block sudo's setuid path. nsenter into
+# PID 1's mount namespace ensures the umount in ExecStopPost succeeds
+# without EBUSY from the unit's own slave-mount tree.
 ExecStartPre=+/usr/bin/nsenter --mount=/proc/1/ns/mnt -- /usr/local/libexec/left4me/left4me-overlay mount %i
-# Run from the merged overlay, NOT installation/. srcds_run is a shell
-# script that `cd`s to its own dirname before exec'ing srcds_linux, so the
-# binary's path determines where the engine reads gameinfo.txt and addons
-# from — WorkingDirectory has no effect. Invoking installation/srcds_run
-# would resolve everything against the lower layer and never see overlay-
-# provided plugins (Metamod/SourceMod) or cfgs (zonemod, confogl).
+# Run from the merged overlay, NOT installation/. srcds_run cds to its
+# own dirname before exec'ing srcds_linux; the binary's path determines
+# gameinfo + addons lookup.
 ExecStart=/var/lib/left4me/runtime/%i/merged/srcds_run -game left4dead2 +hostport ${L4D2_PORT} $L4D2_ARGS
 ExecStopPost=+/usr/bin/nsenter --mount=/proc/1/ns/mnt -- /usr/local/libexec/left4me/left4me-overlay umount %i
 Restart=on-failure
 RestartSec=5
 
-# Resource control baseline — see docs/superpowers/specs/2026-05-09-l4d2-server-host-perf-baseline-design.md
+# === Resource control baseline ===
+# See docs/superpowers/specs/2026-05-09-l4d2-server-host-perf-baseline-design.md
 Slice=l4d2-game.slice
 Nice=-5
 IOSchedulingClass=best-effort
@@ -70,16 +57,101 @@ KillSignal=SIGINT
 TimeoutStopSec=15s
 LogRateLimitIntervalSec=0
 
-# Hardening (unchanged from previous baseline).
-NoNewPrivileges=true
+# NOTE: systemd does not support trailing comments: in "Key=value # text"
+# the "# text" becomes part of the value. Keep every comment on its own line.
+
+# === Identity / privilege drop ===
+# Block setuid escalation (defense: D3).
+NoNewPrivileges=true
+# Deny setting SUID/SGID bits on files; this does not filter the
+# setuid()/setgid() syscalls themselves.
+RestrictSUIDSGID=true
+# Drop all caps — no privilege to escalate.
+CapabilityBoundingSet=
+AmbientCapabilities=
+
+# === Filesystem virtualization ===
+# Mask /var/lib, /etc, /opt, etc. with empty tmpfs; bind back only
+# what srcds needs. The DB (/var/lib/left4me/left4me.db) and web.env
+# (/etc/left4me/web.env) are intentionally not bound — they don't
+# exist in this unit's filesystem view (defenses: D1.a, D1.b).
+TemporaryFileSystem=/var/lib /etc /opt /home /root /srv /mnt /media
+BindReadOnlyPaths=/var/lib/left4me/installation
+BindReadOnlyPaths=/var/lib/left4me/overlays
+BindReadOnlyPaths=/etc/left4me/host.env
+BindReadOnlyPaths=/etc/ssl
+BindReadOnlyPaths=/etc/ca-certificates
+BindReadOnlyPaths=/etc/resolv.conf
+BindReadOnlyPaths=/etc/nsswitch.conf
+BindReadOnlyPaths=/etc/alternatives
+BindPaths=/var/lib/left4me/runtime/%i
+# Belt-and-braces with TemporaryFileSystem.
+ProtectSystem=strict
+ProtectHome=true
+
+# === Process namespacing ===
+# Own user namespace; cross-uid ptrace blocked (D2).
+PrivateUsers=true
+# Own PID namespace; hides peer-srcds + gunicorn (D2.b, D5).
+PrivatePIDs=true
 PrivateTmp=true
 PrivateDevices=true
-ProtectHome=true
-ProtectSystem=strict
-ReadOnlyPaths=/var/lib/left4me/installation /var/lib/left4me/overlays
-ReadWritePaths=/var/lib/left4me/runtime/%i
-RestrictSUIDSGID=true
-LockPersonality=true
+PrivateIPC=true
+# Block unshare()/clone(CLONE_NEW*).
+RestrictNamespaces=true
+
+# === /proc and /sys ===
+# Foreign-uid /proc hidden (paired with PrivatePIDs for full hide).
+ProtectProc=invisible
+# /proc shows only PID dirs, no kallsyms/cpuinfo.
+ProcSubset=pid
+# /proc/sys, /sys read-only.
+ProtectKernelTunables=true
+# No module load/unload.
+ProtectKernelModules=true
+# No /dev/kmsg or syslog().
+ProtectKernelLogs=true
+# No settimeofday().
+ProtectClock=true
+# /sys/fs/cgroup read-only.
+ProtectControlGroups=true
+# No sethostname().
+ProtectHostname=true
+# No personality() switches.
+LockPersonality=true
+
+# === Syscall filter ===
+# srcds_linux is i386 (Source 2007 engine). 'native x86' allows both
+# x86_64 (from srcds_run + the dynamic linker) and i386 (from srcds_linux).
+# Bare 'native' traps srcds_run in a respawn loop.
+SystemCallArchitectures=native x86
+SystemCallFilter=@system-service
+SystemCallFilter=~@debug @mount @raw-io @reboot @swap @cpu-emulation @obsolete @privileged process_vm_readv process_vm_writev
+# ~@debug is the load-bearing block for D2.a: drops ptrace().
+# process_vm_readv/writev() are not members of @debug, so they are
+# denied by name to keep the D2.a memory-read/write path closed.
+# ~@privileged blocks anything requiring CAP_*, redundant with empty bounding set.
+# MemoryDenyWriteExecute=true is NOT set — Source engine i386 .so files
+# have text relocations that need mprotect(W+X) during dynamic-linker pass.
+
+# === Network ===
+# AF_UNIX needed for journald.
+RestrictAddressFamilies=AF_INET AF_INET6 AF_UNIX
+# Intended to lock srcds bindable sockets to the game port range.
+# NOTE(review): SocketBindAllow= alone restricts nothing — a bind() that
+# matches no rule is allowed. Enforcing this needs SocketBindDeny=any;
+# add it once every port srcds binds is confirmed to be in this range.
+SocketBindAllow=udp:27000-27999
+SocketBindAllow=tcp:27000-27999
+
+# === Misc hygiene ===
+# No real-time scheduling.
+RestrictRealtime=true
+# Clean up SysV IPC on unit stop.
+RemoveIPC=true
+# Private kernel keyring.
+KeyringMode=private
+UMask=0027
 
 [Install]
 WantedBy=multi-user.target
diff --git a/deploy/files/usr/local/lib/systemd/system/left4me-web.service b/deploy/files/usr/local/lib/systemd/system/left4me-web.service
index 46b375e..52f869e 100644
--- a/deploy/files/usr/local/lib/systemd/system/left4me-web.service
+++ b/deploy/files/usr/local/lib/systemd/system/left4me-web.service
@@ -1,3 +1,25 @@
+# left4me web application — system unit.
+#
+# This is the REFERENCE COPY of the deployed unit. The live source is
+# the systemd/units reactor at ~/Projekte/ckn-bw/bundles/left4me/metadata.py
+# (look for 'left4me-web.service'). Hardening directives live in
+# the HARDENING_WEB constant near the top of the same file.
+# This file is hand-synced; edit both together.
+#
+# Several directives that the gameserver uses are intentionally absent
+# from this unit:
+#   NoNewPrivileges — blocks sudo's setuid escalation
+#   PrivateUsers — breaks sudo's host-root mapping
+#   RestrictSUIDSGID — denies setting SUID/SGID bits on files (NOTE(review): confirm sudo really needs this off)
+#   CapabilityBoundingSet= — empty value would deny sudo's caps
+#   ~@privileged in SystemCallFilter — blocks sudo's setuid syscall
+# The web app invokes privileged helpers (left4me-systemctl,
+# left4me-overlay, left4me-script-sandbox) via sudo, so these
+# directives can't be applied here. A future refactor replacing sudo
+# with systemctl-managed transient units would unlock them.
+#
+# Threat model + defenses + tests: see docs/superpowers/specs/2026-05-15-hardening-*
+
 [Unit]
 Description=left4me web application
 After=network-online.target
@@ -7,25 +29,61 @@ Wants=network-online.target
 Type=simple
 User=left4me
 Group=left4me
-WorkingDirectory=/opt/left4me
-Environment=HOME=/var/lib/left4me
-Environment=PATH=/opt/left4me/.venv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+WorkingDirectory=/opt/left4me/src
+Environment=HOME=/var/lib/left4me PATH=/opt/left4me/.venv/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
 EnvironmentFile=/etc/left4me/host.env
 EnvironmentFile=/etc/left4me/web.env
-ExecStart=/opt/left4me/.venv/bin/gunicorn --workers 1 --threads 32 --bind 0.0.0.0:8000 'l4d2web.app:create_app()'
+# Placeholder values for --workers / --threads. Live emission interpolates
+# from metadata.get('left4me/gunicorn_workers') and gunicorn_threads.
+ExecStart=/opt/left4me/.venv/bin/gunicorn --workers 4 --threads 4 --bind 127.0.0.1:8000 'l4d2web.app:create_app()'
 Restart=on-failure
 RestartSec=3
-# NoNewPrivileges intentionally not set: the worker invokes sudo to run
-# the left4me-systemctl, left4me-journalctl, and left4me-overlay
-# privileged helpers, all setuid via sudo.
-# ProtectSystem=full + ReadWritePaths implicitly give this unit a private
-# mount namespace, but mount visibility no longer depends on it: overlay
-# mounts are performed by the left4me-overlay helper, which nsenters into
-# PID 1's mount namespace, so the resulting mount lives in the host
-# namespace where the per-instance gameserver units can see it.
-ProtectSystem=full
+
+# Web writes broadly under /var/lib/left4me (DB, instance configs,
+# overlays, runtime). Kept inline because it's web-specific
+# (server@ uses BindPaths to bind only its instance dir).
 ReadWritePaths=/var/lib/left4me
+
+# === Filesystem ===
+# Tightened from prior 'full'; via HARDENING_COMMON.
+# (Own line on purpose: systemd has no trailing-comment syntax, so
+# "Key=value # text" would make "# text" part of the value.)
+ProtectSystem=strict
+ProtectHome=true
 PrivateTmp=true
 
+# === /proc + kernel ===
+# Foreign-uid /proc hidden (defense: D4).
+ProtectProc=invisible
+ProcSubset=pid
+ProtectKernelTunables=true
+ProtectKernelModules=true
+ProtectKernelLogs=true
+ProtectClock=true
+ProtectControlGroups=true
+ProtectHostname=true
+LockPersonality=true
+
+# === Syscall filter (sudo-compatible — note absence of ~@privileged) ===
+SystemCallArchitectures=native
+SystemCallFilter=@system-service
+SystemCallFilter=~@debug @mount @raw-io @reboot @swap @cpu-emulation @obsolete process_vm_readv process_vm_writev
+# ~@debug blocks ptrace (D4). process_vm_readv/writev are not members
+# of @debug, so they are denied by name.
+# ~@privileged intentionally omitted — sudo needs setuid().
+# NOTE(review): seccomp filters (and RestrictNamespaces= below) are inherited
+# by helpers started through sudo. Confirm left4me-overlay and
+# left4me-script-sandbox need no mount(2)/setns(2)/unshare(2) from here.
+
+# === Network ===
+RestrictAddressFamilies=AF_INET AF_INET6 AF_UNIX
+
+# === Misc hygiene ===
+RestrictNamespaces=true
+RestrictRealtime=true
+RemoveIPC=true
+KeyringMode=private
+UMask=0027
+
 [Install]
 WantedBy=multi-user.target