left4me: vendor privileged helpers + sudoers/sysctl/sandbox-resolv

Copied verbatim from left4me/deploy/files/. Helpers are the trust unit
the sudoers rules grant access to; left as static files (not generated)
so the audit trail stays grep-able. Modes/owners are set via items.py
in the next commit.
This commit is contained in:
CroneKorkN 2026-05-10 17:10:17 +02:00
parent 7547d041a2
commit 6db792ce6a
Signed by: cronekorkn
SSH key fingerprint: SHA256:v0410ZKfuO1QHdgKBsdQNF64xmTxOF8osF1LIqwTcVw
7 changed files with 468 additions and 0 deletions

View file

@ -0,0 +1,6 @@
# Sandbox-only resolver config — bind-mounted into script-overlay sandboxes
# at /etc/resolv.conf. The host's resolver (often a private/LAN DNS server)
# is unreachable from inside the sandbox because IPAddressDeny= blocks
# egress to RFC1918 / loopback. Public resolvers keep DNS working.
nameserver 1.1.1.1
nameserver 8.8.8.8

View file

@ -0,0 +1,5 @@
Defaults:left4me !requiretty
left4me ALL=(root) NOPASSWD: /usr/local/libexec/left4me/left4me-systemctl *
left4me ALL=(root) NOPASSWD: /usr/local/libexec/left4me/left4me-journalctl *
left4me ALL=(root) NOPASSWD: /usr/local/libexec/left4me/left4me-overlay mount *, /usr/local/libexec/left4me/left4me-overlay umount *
left4me ALL=(root) NOPASSWD: /usr/local/libexec/left4me/left4me-script-sandbox

View file

@ -0,0 +1,36 @@
# Host-side perf baseline for left4me — see
# docs/superpowers/specs/2026-05-09-l4d2-server-host-perf-baseline-design.md
#
# UDP socket buffers: distro defaults of ~128 KiB are too small for sustained
# Source-engine UDP across multiple instances. 8 MiB matches the standard
# 1 Gbit recommendation; rmem_default/wmem_default protect sockets that don't
# explicitly enlarge their buffers.
net.core.rmem_max = 8388608
net.core.wmem_max = 8388608
net.core.rmem_default = 524288
net.core.wmem_default = 524288
# Kernel softirq UDP path: the per-CPU backlog queue starts dropping packets
# at the default 1000 under multi-instance burst; 5000 absorbs realistic peaks.
# netdev_budget = 600 gives softirq more drain headroom per pass.
net.core.netdev_max_backlog = 5000
net.core.netdev_budget = 600
# Latency-sensitive default: avoid swap unless the box is really under
# pressure. Harmless on swapless hosts.
vm.swappiness = 10
# Per-socket UDP buffer floors: protect game-server sockets that don't bump
# their own SO_RCVBUF/SO_SNDBUF when softirq drains lag briefly.
net.ipv4.udp_rmem_min = 16384
net.ipv4.udp_wmem_min = 16384
# Default qdisc for ifaces we don't explicitly shape with CAKE. Debian Trixie
# already defaults to fq_codel; setting it explicitly is belt-and-suspenders
# and survives kernel-default churn.
net.core.default_qdisc = fq_codel
# TCP congestion control: BBR for any bulk TCP egress on the host (admin SSH,
# backups, package fetches, web-app responses) so a long flow does not push
# the bottleneck queue ahead of game UDP. UDP srcds is unaffected.
net.ipv4.tcp_congestion_control = bbr

View file

@ -0,0 +1,53 @@
#!/bin/sh
set -eu
usage() {
printf '%s\n' "usage: left4me-journalctl <server-name> --lines <n> --follow|--no-follow" >&2
exit 2
}
validate_name() {
name=$1
[ -n "$name" ] || usage
case "$name" in
.*|*..*|*/*|*\\*) usage ;;
esac
case "$name" in
*[!A-Za-z0-9_.-]*) usage ;;
esac
}
[ "$#" -eq 4 ] || usage
name=$1
lines_flag=$2
lines=$3
follow_flag=$4
validate_name "$name"
[ "$lines_flag" = "--lines" ] || usage
case "$lines" in
''|*[!0-9]*) usage ;;
esac
follow_arg=
case "$follow_flag" in
--follow) follow_arg=-f ;;
--no-follow) ;;
*) usage ;;
esac
unit="left4me-server@${name}.service"
if [ -x /bin/journalctl ]; then
journalctl=/bin/journalctl
elif [ -x /usr/bin/journalctl ]; then
journalctl=/usr/bin/journalctl
else
printf '%s\n' 'journalctl not found at /bin/journalctl or /usr/bin/journalctl' >&2
exit 69
fi
if [ -n "$follow_arg" ]; then
exec "$journalctl" -u "$unit" -n "$lines" -o cat "$follow_arg"
fi
exec "$journalctl" -u "$unit" -n "$lines" -o cat

View file

@ -0,0 +1,242 @@
#!/usr/bin/python3
"""Privileged overlay mount helper for left4me.
Invoked from the systemd unit's ExecStartPre / ExecStopPost via
`+/usr/bin/nsenter --mount=/proc/1/ns/mnt -- …`. The unit-level
nsenter is what makes this work: it runs the helper Python interpreter
inside PID 1's mount namespace. Without it, the `+` Exec prefix
removes the sandbox/credentials but does NOT detach from the unit's
per-service mount namespace, and the helper process itself would pin
that namespace alive — turning every umount into a multi-second EBUSY
race with the kernel's deferred namespace cleanup. With the unit-level
nsenter the helper has no such reference and umount succeeds first try.
Validates inputs strictly, then performs `mount -t overlay` /
`umount` directly — no internal nsenter, since the helper is already
running where the syscalls need to take effect.
Verbs:
mount <name> Reads ${LEFT4ME_ROOT}/instances/<name>/instance.env
for L4D2_LOWERDIRS, validates every lowerdir is
under one of installation/overlays/workshop_cache/
global_overlay_cache, then mounts the kernel
overlay at runtime/<name>/merged.
umount <name> Unmounts runtime/<name>/merged and cleans up the
kernel-overlayfs `work/work` orphan.
Set LEFT4ME_OVERLAY_PRINT_ONLY=1 to print the would-be argv (one line,
shell-quoted) and exit 0 instead of execv. Used by tests.
"""
import os
import re
import shlex
import shutil
import subprocess
import sys
from pathlib import Path
NAME_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{0,63}$")
DEFAULT_ROOT = "/var/lib/left4me"
LOWERDIR_ALLOWLIST = (
"installation",
"overlays",
"global_overlay_cache",
"workshop_cache",
)
MAX_LOWERDIRS = 500
MOUNT_BIN = "/bin/mount"
UMOUNT_BIN = "/bin/umount"
def die(msg: str) -> None:
sys.stderr.write(f"left4me-overlay: {msg}\n")
sys.exit(1)
def root() -> Path:
return Path(os.environ.get("LEFT4ME_ROOT") or DEFAULT_ROOT)
def validate_name(name: str) -> str:
if not NAME_RE.fullmatch(name):
die(f"invalid instance name: {name!r}")
return name
def parse_lowerdirs(env_path: Path) -> list[str]:
if not env_path.is_file():
die(f"instance.env not found: {env_path}")
raw = None
for line in env_path.read_text().splitlines():
if "=" not in line:
continue
key, value = line.split("=", 1)
if key.strip() == "L4D2_LOWERDIRS":
raw = value
break
if raw is None:
die(f"L4D2_LOWERDIRS not set in {env_path}")
if raw == "":
die(f"L4D2_LOWERDIRS is empty in {env_path}")
parts = raw.split(":")
if any(p == "" for p in parts):
die(f"L4D2_LOWERDIRS contains an empty entry: {raw!r}")
if len(parts) > MAX_LOWERDIRS:
die(f"L4D2_LOWERDIRS has {len(parts)} entries (cap {MAX_LOWERDIRS})")
return parts
def canonical_under(allowed_roots: list[Path], path: Path) -> Path:
try:
canonical = path.resolve(strict=True)
except (FileNotFoundError, RuntimeError):
die(f"path does not exist or has a symlink loop: {path}")
for r in allowed_roots:
if canonical == r or r in canonical.parents:
return canonical
die(f"path is outside the permitted roots: {path} (resolved: {canonical})")
_LISTXATTR = getattr(os, "listxattr", None)
def _entry_has_fuse_xattr(path: str) -> str | None:
if _LISTXATTR is None:
return None
try:
attrs = _LISTXATTR(path, follow_symlinks=False)
except OSError:
return None
for a in attrs:
if a.startswith("user.fuseoverlayfs."):
return a
return None
def assert_no_fuse_xattrs(upper: Path) -> None:
if not upper.exists() or _LISTXATTR is None:
return
for dirpath, dirnames, filenames in os.walk(upper):
for entry in (dirpath, *(os.path.join(dirpath, n) for n in dirnames),
*(os.path.join(dirpath, n) for n in filenames)):
tainted = _entry_has_fuse_xattr(entry)
if tainted:
die(
f"upperdir contains fuse-overlayfs xattr {tainted!r} on {entry}; "
"wipe upper/ and work/ before mounting"
)
def exec_or_print(argv: list[str]) -> None:
if os.environ.get("LEFT4ME_OVERLAY_PRINT_ONLY") == "1":
print(" ".join(shlex.quote(a) for a in argv))
sys.exit(0)
os.execv(argv[0], argv)
def cmd_mount(name: str) -> None:
name = validate_name(name)
r = root()
runtime_name_dir = (r / "runtime" / name).resolve(strict=True)
merged_for_check = (runtime_name_dir / "merged").resolve(strict=True)
# Idempotency for unit restart cycles: if a previous start mounted
# successfully but ExecStart failed afterwards (and Restart=on-failure
# fires another cycle), the second ExecStartPre would otherwise refuse
# to mount-on-top. Short-circuit here so the second cycle just gets
# straight to ExecStart. PRINT_ONLY (test mode) bypasses this so the
# tests can exercise the full nsenter argv regardless of mount state.
if (
os.environ.get("LEFT4ME_OVERLAY_PRINT_ONLY") != "1"
and os.path.ismount(merged_for_check)
):
return
instance_env = r / "instances" / name / "instance.env"
raw_lowerdirs = parse_lowerdirs(instance_env)
allowed_roots = [(r / sub).resolve() for sub in LOWERDIR_ALLOWLIST]
canonical_lowerdirs = [str(canonical_under(allowed_roots, Path(p))) for p in raw_lowerdirs]
upper = (runtime_name_dir / "upper").resolve(strict=True)
work = (runtime_name_dir / "work").resolve(strict=True)
merged = merged_for_check
for label, path in (("upper", upper), ("work", work), ("merged", merged)):
if path.parent != runtime_name_dir:
die(f"{label} resolved outside runtime/{name}: {path}")
assert_no_fuse_xattrs(upper)
options = f"lowerdir={':'.join(canonical_lowerdirs)},upperdir={upper},workdir={work}"
argv = [
MOUNT_BIN,
"-t", "overlay",
"overlay",
"-o", options,
str(merged),
]
exec_or_print(argv)
def cmd_umount(name: str) -> None:
name = validate_name(name)
r = root()
runtime_name_dir = (r / "runtime" / name).resolve(strict=True)
merged_path = runtime_name_dir / "merged"
work_inner = runtime_name_dir / "work" / "work"
argv = [
UMOUNT_BIN,
# Resolve only if it exists; PRINT_ONLY tests always pre-create it.
str(merged_path.resolve(strict=True) if merged_path.exists() else merged_path),
]
# PRINT_ONLY: emit the umount argv and exit. Tests assert exact shape
# of this dry-run; the post-umount cleanup of work_inner is a runtime
# behaviour exercised on the host, not in unit tests.
if os.environ.get("LEFT4ME_OVERLAY_PRINT_ONLY") == "1":
print(" ".join(shlex.quote(a) for a in argv))
sys.exit(0)
if merged_path.exists():
merged = merged_path.resolve(strict=True)
if merged.parent != runtime_name_dir:
die(f"merged resolved outside runtime/{name}: {merged}")
# Idempotency: only umount if currently a mount point. Mirrors
# cmd_mount's symmetric check; a redundant cleanup pass — or a
# call after a partial _purge_instance — must be a no-op.
#
# No retry loop here: with the helper running in PID 1's mount
# namespace (via the unit-level `nsenter --mount=/proc/1/ns/mnt`
# in ExecStopPost), it holds no reference to the unit's
# per-service mount namespace, so the cgroup-empty → namespace
# reaped → umount-clears sequence happens without any race
# window for us to ride out. EBUSY here is a real error.
if os.path.ismount(merged):
subprocess.run(argv, check=True)
# Kernel-overlayfs creates work_inner during mount with root:root mode
# 0/0. After unmount it's an orphan that the unit's User= (left4me)
# cannot traverse via shutil.rmtree, so reset/delete in instances.py
# blows up with EACCES on `runtime/<name>/work/work`. The helper is
# the only code path with root that knows about this directory, so
# the cleanup belongs here. Safe to nuke — the kernel re-creates it
# on the next mount. Run unconditionally — covers both "we just
# unmounted" and "previous teardown didn't finish" cases.
if work_inner.exists():
shutil.rmtree(work_inner)
def main(argv: list[str]) -> None:
if len(argv) != 3 or argv[1] not in ("mount", "umount"):
sys.stderr.write("usage: left4me-overlay mount|umount <name>\n")
sys.exit(2)
if argv[1] == "mount":
cmd_mount(argv[2])
else:
cmd_umount(argv[2])
if __name__ == "__main__":
main(sys.argv)

View file

@ -0,0 +1,82 @@
#!/bin/bash
# Privileged sandbox launcher for left4me script overlays.
#
# Invoked via sudo by the web user with two arguments:
# <overlay_id> numeric overlay id; bind-mounts /var/lib/left4me/overlays/<id>
# read-write at /overlay inside the sandbox.
# <script_path> absolute path to a bash file already written by the web app;
# bind-mounted read-only at /script.sh inside the sandbox.
#
# The script runs as a transient systemd .service with the full hardening
# surface: cgroup limits + walltime kill, NoNewPrivileges, ProtectSystem,
# ProtectHome, kernel-tunable / -module / -log protection, namespace
# restriction, address-family restriction, capability bounding (empty),
# seccomp filter (@system-service @network-io), MemoryDenyWriteExecute,
# LockPersonality, RestrictSUIDSGID. Network namespace is *not* restricted —
# scripts must reach the public internet to download workshop / l4d2center
# / cedapug content. PID namespace is shared with the host (no
# PrivatePID= directive in systemd); host PIDs are visible via /proc but
# not signal-able due to UID mismatch.
set -euo pipefail
[[ $# -eq 2 ]] || { echo "usage: $0 <overlay_id> <script>" >&2; exit 64; }
OVERLAY_ID=$1
SCRIPT=$2
[[ "$OVERLAY_ID" =~ ^[0-9]+$ ]] || { echo "bad overlay id" >&2; exit 64; }
OVERLAY_DIR=/var/lib/left4me/overlays/$OVERLAY_ID
[[ -d $OVERLAY_DIR ]] || { echo "no overlay dir at $OVERLAY_DIR" >&2; exit 65; }
[[ -f $SCRIPT ]] || { echo "no script at $SCRIPT" >&2; exit 65; }
if [[ "${LEFT4ME_SCRIPT_SANDBOX_DRY_RUN:-}" == "1" ]]; then
echo "DRY RUN: overlay_id=$OVERLAY_ID script=$SCRIPT overlay_dir=$OVERLAY_DIR"
exit 0
fi
# Make sure the sandbox UID owns the overlay dir so the script can write there.
# Idempotent: a no-op when the dir is already l4d2-sandbox-owned (re-run case),
# and corrects the ownership the first time the dir was created by the web app
# under the left4me UID. World-readable so the gameserver process (left4me)
# can read the overlay contents via the kernel-overlayfs lowerdir at runtime.
chown -R l4d2-sandbox:l4d2-sandbox "$OVERLAY_DIR"
chmod 0755 "$OVERLAY_DIR"
SCRIPT_RC=0
systemd-run --quiet --collect --wait --pipe \
--unit="left4me-script-${OVERLAY_ID}-$$" \
--slice=l4d2-build.slice \
-p OOMScoreAdjust=500 \
-p User=l4d2-sandbox -p Group=l4d2-sandbox \
-p UMask=0022 \
-p NoNewPrivileges=yes \
-p ProtectSystem=strict -p ProtectHome=yes \
-p PrivateTmp=yes -p PrivateDevices=yes -p PrivateIPC=yes \
-p ProtectKernelTunables=yes -p ProtectKernelModules=yes \
-p ProtectKernelLogs=yes -p ProtectControlGroups=yes \
-p RestrictNamespaces=yes \
-p RestrictAddressFamilies="AF_INET AF_INET6 AF_UNIX" \
-p RestrictSUIDSGID=yes -p LockPersonality=yes \
-p MemoryDenyWriteExecute=yes \
-p SystemCallFilter="@system-service @network-io" \
-p SystemCallArchitectures=native \
-p CapabilityBoundingSet= -p AmbientCapabilities= \
-p IPAddressDeny="127.0.0.0/8 ::1/128 169.254.0.0/16 fe80::/10 224.0.0.0/4 ff00::/8 10.0.0.0/8 172.16.0.0/12 192.168.0.0/16 100.64.0.0/10 fc00::/7" \
-p TemporaryFileSystem="/etc /var/lib" \
-p BindReadOnlyPaths="/etc/left4me/sandbox-resolv.conf:/etc/resolv.conf /etc/ssl /etc/ca-certificates /etc/nsswitch.conf /etc/alternatives ${SCRIPT}:/script.sh" \
-p BindPaths="${OVERLAY_DIR}:/overlay" \
-p WorkingDirectory=/overlay \
-p Environment="HOME=/tmp PATH=/usr/bin:/usr/sbin OVERLAY=/overlay" \
-p MemoryMax=4G -p MemorySwapMax=0 -p TasksMax=512 \
-p CPUQuota=200% -p RuntimeMaxSec=3600 \
-- /bin/bash /script.sh || SCRIPT_RC=$?
# Normalize perms so the web service (left4me uid) can read overlay files
# directly via Python open() — needed by the file tree's download endpoint.
# UMask=0022 above takes care of *new* writes; this catches anything the
# script created with a tighter mode (e.g. cedapug_maps writes its
# .cedapug/manifest.tsv as 0600 by default).
find "$OVERLAY_DIR" -type f ! -perm -o+r -exec chmod o+r {} + 2>/dev/null || true
find "$OVERLAY_DIR" -type d ! -perm -o+rx -exec chmod o+rx {} + 2>/dev/null || true
exit $SCRIPT_RC

View file

@ -0,0 +1,44 @@
#!/bin/sh
set -eu
usage() {
printf '%s\n' "usage: left4me-systemctl enable|disable|show <server-name>" >&2
exit 2
}
validate_name() {
name=$1
[ -n "$name" ] || usage
case "$name" in
.*|*..*|*/*|*\\*) usage ;;
esac
case "$name" in
*[!A-Za-z0-9_.-]*) usage ;;
esac
}
[ "$#" -eq 2 ] || usage
action=$1
name=$2
case "$action" in
enable|disable|show) ;;
*) usage ;;
esac
validate_name "$name"
unit="left4me-server@${name}.service"
if [ -x /bin/systemctl ]; then
systemctl=/bin/systemctl
elif [ -x /usr/bin/systemctl ]; then
systemctl=/usr/bin/systemctl
else
printf '%s\n' 'systemctl not found at /bin/systemctl or /usr/bin/systemctl' >&2
exit 69
fi
case "$action" in
enable) exec "$systemctl" enable --now "$unit" ;;
disable) exec "$systemctl" disable --now "$unit" ;;
show) exec "$systemctl" show --property=ActiveState --property=SubState "$unit" ;;
esac