From 6db792ce6a9fa56ce938293066593d55f14f24ce Mon Sep 17 00:00:00 2001 From: CroneKorkN Date: Sun, 10 May 2026 17:10:17 +0200 Subject: [PATCH] left4me: vendor privileged helpers + sudoers/sysctl/sandbox-resolv Copied verbatim from left4me/deploy/files/. Helpers are the trust unit the sudoers rules grant access to; left as static files (not generated) so the audit trail stays grep-able. Modes/owners are set via items.py in the next commit. --- .../files/etc/left4me/sandbox-resolv.conf | 6 + bundles/left4me/files/etc/sudoers.d/left4me | 5 + .../files/etc/sysctl.d/99-left4me.conf | 36 +++ .../local/libexec/left4me/left4me-journalctl | 53 ++++ .../usr/local/libexec/left4me/left4me-overlay | 242 ++++++++++++++++++ .../libexec/left4me/left4me-script-sandbox | 82 ++++++ .../local/libexec/left4me/left4me-systemctl | 44 ++++ 7 files changed, 468 insertions(+) create mode 100644 bundles/left4me/files/etc/left4me/sandbox-resolv.conf create mode 100644 bundles/left4me/files/etc/sudoers.d/left4me create mode 100644 bundles/left4me/files/etc/sysctl.d/99-left4me.conf create mode 100755 bundles/left4me/files/usr/local/libexec/left4me/left4me-journalctl create mode 100644 bundles/left4me/files/usr/local/libexec/left4me/left4me-overlay create mode 100755 bundles/left4me/files/usr/local/libexec/left4me/left4me-script-sandbox create mode 100755 bundles/left4me/files/usr/local/libexec/left4me/left4me-systemctl diff --git a/bundles/left4me/files/etc/left4me/sandbox-resolv.conf b/bundles/left4me/files/etc/left4me/sandbox-resolv.conf new file mode 100644 index 0000000..bd86c70 --- /dev/null +++ b/bundles/left4me/files/etc/left4me/sandbox-resolv.conf @@ -0,0 +1,6 @@ +# Sandbox-only resolver config — bind-mounted into script-overlay sandboxes +# at /etc/resolv.conf. The host's resolver (often a private/LAN DNS server) +# is unreachable from inside the sandbox because IPAddressDeny= blocks +# egress to RFC1918 / loopback. Public resolvers keep DNS working. +nameserver 1.1.1.1 +nameserver 8.8.8.8 diff --git a/bundles/left4me/files/etc/sudoers.d/left4me b/bundles/left4me/files/etc/sudoers.d/left4me new file mode 100644 index 0000000..5aa94eb --- /dev/null +++ b/bundles/left4me/files/etc/sudoers.d/left4me @@ -0,0 +1,5 @@ +Defaults:left4me !requiretty +left4me ALL=(root) NOPASSWD: /usr/local/libexec/left4me/left4me-systemctl * +left4me ALL=(root) NOPASSWD: /usr/local/libexec/left4me/left4me-journalctl * +left4me ALL=(root) NOPASSWD: /usr/local/libexec/left4me/left4me-overlay mount *, /usr/local/libexec/left4me/left4me-overlay umount * +left4me ALL=(root) NOPASSWD: /usr/local/libexec/left4me/left4me-script-sandbox diff --git a/bundles/left4me/files/etc/sysctl.d/99-left4me.conf b/bundles/left4me/files/etc/sysctl.d/99-left4me.conf new file mode 100644 index 0000000..0860833 --- /dev/null +++ b/bundles/left4me/files/etc/sysctl.d/99-left4me.conf @@ -0,0 +1,36 @@ +# Host-side perf baseline for left4me — see +# docs/superpowers/specs/2026-05-09-l4d2-server-host-perf-baseline-design.md +# +# UDP socket buffers: distro defaults of ~128 KiB are too small for sustained +# Source-engine UDP across multiple instances. 8 MiB matches the standard +# 1 Gbit recommendation; rmem_default/wmem_default protect sockets that don't +# explicitly enlarge their buffers. +net.core.rmem_max = 8388608 +net.core.wmem_max = 8388608 +net.core.rmem_default = 524288 +net.core.wmem_default = 524288 + +# Kernel softirq UDP path: the per-CPU backlog queue starts dropping packets +# at the default 1000 under multi-instance burst; 5000 absorbs realistic peaks. +# netdev_budget = 600 gives softirq more drain headroom per pass. +net.core.netdev_max_backlog = 5000 +net.core.netdev_budget = 600 + +# Latency-sensitive default: avoid swap unless the box is really under +# pressure. Harmless on swapless hosts. +vm.swappiness = 10 + +# Per-socket UDP buffer floors: protect game-server sockets that don't bump +# their own SO_RCVBUF/SO_SNDBUF when softirq drains lag briefly. +net.ipv4.udp_rmem_min = 16384 +net.ipv4.udp_wmem_min = 16384 + +# Default qdisc for ifaces we don't explicitly shape with CAKE. Debian Trixie +# already defaults to fq_codel; setting it explicitly is belt-and-suspenders +# and survives kernel-default churn. +net.core.default_qdisc = fq_codel + +# TCP congestion control: BBR for any bulk TCP egress on the host (admin SSH, +# backups, package fetches, web-app responses) so a long flow does not push +# the bottleneck queue ahead of game UDP. UDP srcds is unaffected. +net.ipv4.tcp_congestion_control = bbr diff --git a/bundles/left4me/files/usr/local/libexec/left4me/left4me-journalctl b/bundles/left4me/files/usr/local/libexec/left4me/left4me-journalctl new file mode 100755 index 0000000..2e5d3df --- /dev/null +++ b/bundles/left4me/files/usr/local/libexec/left4me/left4me-journalctl @@ -0,0 +1,53 @@ +#!/bin/sh +set -eu + +usage() { + printf '%s\n' "usage: left4me-journalctl --lines --follow|--no-follow" >&2 + exit 2 +} + +validate_name() { + name=$1 + [ -n "$name" ] || usage + case "$name" in + .*|*..*|*/*|*\\*) usage ;; + esac + case "$name" in + *[!A-Za-z0-9_.-]*) usage ;; + esac +} + +[ "$#" -eq 4 ] || usage +name=$1 +lines_flag=$2 +lines=$3 +follow_flag=$4 + +validate_name "$name" +[ "$lines_flag" = "--lines" ] || usage +case "$lines" in + ''|*[!0-9]*) usage ;; +esac + +follow_arg= +case "$follow_flag" in + --follow) follow_arg=-f ;; + --no-follow) ;; + *) usage ;; +esac + +unit="left4me-server@${name}.service" +if [ -x /bin/journalctl ]; then + journalctl=/bin/journalctl +elif [ -x /usr/bin/journalctl ]; then + journalctl=/usr/bin/journalctl +else + printf '%s\n' 'journalctl not found at /bin/journalctl or /usr/bin/journalctl' >&2 + exit 69 +fi + +if [ -n "$follow_arg" ]; then + exec "$journalctl" -u "$unit" -n "$lines" -o cat "$follow_arg" +fi + +exec "$journalctl" -u "$unit" -n "$lines" -o cat diff --git a/bundles/left4me/files/usr/local/libexec/left4me/left4me-overlay b/bundles/left4me/files/usr/local/libexec/left4me/left4me-overlay new file mode 100644 index 0000000..ccbb4c0 --- /dev/null +++ b/bundles/left4me/files/usr/local/libexec/left4me/left4me-overlay @@ -0,0 +1,242 @@ +#!/usr/bin/python3 +"""Privileged overlay mount helper for left4me. + +Invoked from the systemd unit's ExecStartPre / ExecStopPost via +`+/usr/bin/nsenter --mount=/proc/1/ns/mnt -- …`. The unit-level +nsenter is what makes this work: it runs the helper Python interpreter +inside PID 1's mount namespace. Without it, the `+` Exec prefix +removes the sandbox/credentials but does NOT detach from the unit's +per-service mount namespace, and the helper process itself would pin +that namespace alive — turning every umount into a multi-second EBUSY +race with the kernel's deferred namespace cleanup. With the unit-level +nsenter the helper has no such reference and umount succeeds first try. + +Validates inputs strictly, then performs `mount -t overlay` / +`umount` directly — no internal nsenter, since the helper is already +running where the syscalls need to take effect. + +Verbs: + mount Reads ${LEFT4ME_ROOT}/instances//instance.env + for L4D2_LOWERDIRS, validates every lowerdir is + under one of installation/overlays/workshop_cache/ + global_overlay_cache, then mounts the kernel + overlay at runtime//merged. + umount Unmounts runtime//merged and cleans up the + kernel-overlayfs `work/work` orphan. + +Set LEFT4ME_OVERLAY_PRINT_ONLY=1 to print the would-be argv (one line, +shell-quoted) and exit 0 instead of execv. Used by tests. +""" + +import os +import re +import shlex +import shutil +import subprocess +import sys +from pathlib import Path + +NAME_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{0,63}$") +DEFAULT_ROOT = "/var/lib/left4me" +LOWERDIR_ALLOWLIST = ( + "installation", + "overlays", + "global_overlay_cache", + "workshop_cache", +) +MAX_LOWERDIRS = 500 +MOUNT_BIN = "/bin/mount" +UMOUNT_BIN = "/bin/umount" + + +def die(msg: str) -> None: + sys.stderr.write(f"left4me-overlay: {msg}\n") + sys.exit(1) + + +def root() -> Path: + return Path(os.environ.get("LEFT4ME_ROOT") or DEFAULT_ROOT) + + +def validate_name(name: str) -> str: + if not NAME_RE.fullmatch(name): + die(f"invalid instance name: {name!r}") + return name + + +def parse_lowerdirs(env_path: Path) -> list[str]: + if not env_path.is_file(): + die(f"instance.env not found: {env_path}") + raw = None + for line in env_path.read_text().splitlines(): + if "=" not in line: + continue + key, value = line.split("=", 1) + if key.strip() == "L4D2_LOWERDIRS": + raw = value + break + if raw is None: + die(f"L4D2_LOWERDIRS not set in {env_path}") + if raw == "": + die(f"L4D2_LOWERDIRS is empty in {env_path}") + parts = raw.split(":") + if any(p == "" for p in parts): + die(f"L4D2_LOWERDIRS contains an empty entry: {raw!r}") + if len(parts) > MAX_LOWERDIRS: + die(f"L4D2_LOWERDIRS has {len(parts)} entries (cap {MAX_LOWERDIRS})") + return parts + + +def canonical_under(allowed_roots: list[Path], path: Path) -> Path: + try: + canonical = path.resolve(strict=True) + except (FileNotFoundError, RuntimeError): + die(f"path does not exist or has a symlink loop: {path}") + for r in allowed_roots: + if canonical == r or r in canonical.parents: + return canonical + die(f"path is outside the permitted roots: {path} (resolved: {canonical})") + + +_LISTXATTR = getattr(os, "listxattr", None) + + +def _entry_has_fuse_xattr(path: str) -> str | None: + if _LISTXATTR is None: + return None + try: + attrs = _LISTXATTR(path, follow_symlinks=False) + except OSError: + return None + for a in attrs: + if a.startswith("user.fuseoverlayfs."): + return a + return None + + +def assert_no_fuse_xattrs(upper: Path) -> None: + if not upper.exists() or _LISTXATTR is None: + return + for dirpath, dirnames, filenames in os.walk(upper): + for entry in (dirpath, *(os.path.join(dirpath, n) for n in dirnames), + *(os.path.join(dirpath, n) for n in filenames)): + tainted = _entry_has_fuse_xattr(entry) + if tainted: + die( + f"upperdir contains fuse-overlayfs xattr {tainted!r} on {entry}; " + "wipe upper/ and work/ before mounting" + ) + + +def exec_or_print(argv: list[str]) -> None: + if os.environ.get("LEFT4ME_OVERLAY_PRINT_ONLY") == "1": + print(" ".join(shlex.quote(a) for a in argv)) + sys.exit(0) + os.execv(argv[0], argv) + + +def cmd_mount(name: str) -> None: + name = validate_name(name) + r = root() + runtime_name_dir = (r / "runtime" / name).resolve(strict=True) + merged_for_check = (runtime_name_dir / "merged").resolve(strict=True) + + # Idempotency for unit restart cycles: if a previous start mounted + # successfully but ExecStart failed afterwards (and Restart=on-failure + # fires another cycle), the second ExecStartPre would otherwise refuse + # to mount-on-top. Short-circuit here so the second cycle just gets + # straight to ExecStart. PRINT_ONLY (test mode) bypasses this so the + # tests can exercise the full nsenter argv regardless of mount state. + if ( + os.environ.get("LEFT4ME_OVERLAY_PRINT_ONLY") != "1" + and os.path.ismount(merged_for_check) + ): + return + + instance_env = r / "instances" / name / "instance.env" + raw_lowerdirs = parse_lowerdirs(instance_env) + + allowed_roots = [(r / sub).resolve() for sub in LOWERDIR_ALLOWLIST] + canonical_lowerdirs = [str(canonical_under(allowed_roots, Path(p))) for p in raw_lowerdirs] + + upper = (runtime_name_dir / "upper").resolve(strict=True) + work = (runtime_name_dir / "work").resolve(strict=True) + merged = merged_for_check + for label, path in (("upper", upper), ("work", work), ("merged", merged)): + if path.parent != runtime_name_dir: + die(f"{label} resolved outside runtime/{name}: {path}") + + assert_no_fuse_xattrs(upper) + + options = f"lowerdir={':'.join(canonical_lowerdirs)},upperdir={upper},workdir={work}" + argv = [ + MOUNT_BIN, + "-t", "overlay", + "overlay", + "-o", options, + str(merged), + ] + exec_or_print(argv) + + +def cmd_umount(name: str) -> None: + name = validate_name(name) + r = root() + runtime_name_dir = (r / "runtime" / name).resolve(strict=True) + merged_path = runtime_name_dir / "merged" + work_inner = runtime_name_dir / "work" / "work" + + argv = [ + UMOUNT_BIN, + # Resolve only if it exists; PRINT_ONLY tests always pre-create it. + str(merged_path.resolve(strict=True) if merged_path.exists() else merged_path), + ] + + # PRINT_ONLY: emit the umount argv and exit. Tests assert exact shape + # of this dry-run; the post-umount cleanup of work_inner is a runtime + # behaviour exercised on the host, not in unit tests. + if os.environ.get("LEFT4ME_OVERLAY_PRINT_ONLY") == "1": + print(" ".join(shlex.quote(a) for a in argv)) + sys.exit(0) + + if merged_path.exists(): + merged = merged_path.resolve(strict=True) + if merged.parent != runtime_name_dir: + die(f"merged resolved outside runtime/{name}: {merged}") + # Idempotency: only umount if currently a mount point. Mirrors + # cmd_mount's symmetric check; a redundant cleanup pass — or a + # call after a partial _purge_instance — must be a no-op. + # + # No retry loop here: with the helper running in PID 1's mount + # namespace (via the unit-level `nsenter --mount=/proc/1/ns/mnt` + # in ExecStopPost), it holds no reference to the unit's + # per-service mount namespace, so the cgroup-empty → namespace + # reaped → umount-clears sequence happens without any race + # window for us to ride out. EBUSY here is a real error. + if os.path.ismount(merged): + subprocess.run(argv, check=True) + + # Kernel-overlayfs creates work_inner during mount with root:root mode + # 0/0. After unmount it's an orphan that the unit's User= (left4me) + # cannot traverse via shutil.rmtree, so reset/delete in instances.py + # blows up with EACCES on `runtime//work/work`. The helper is + # the only code path with root that knows about this directory, so + # the cleanup belongs here. Safe to nuke — the kernel re-creates it + # on the next mount. Run unconditionally — covers both "we just + # unmounted" and "previous teardown didn't finish" cases. + if work_inner.exists(): + shutil.rmtree(work_inner) + + +def main(argv: list[str]) -> None: + if len(argv) != 3 or argv[1] not in ("mount", "umount"): + sys.stderr.write("usage: left4me-overlay mount|umount \n") + sys.exit(2) + if argv[1] == "mount": + cmd_mount(argv[2]) + else: + cmd_umount(argv[2]) + + +if __name__ == "__main__": + main(sys.argv) diff --git a/bundles/left4me/files/usr/local/libexec/left4me/left4me-script-sandbox b/bundles/left4me/files/usr/local/libexec/left4me/left4me-script-sandbox new file mode 100755 index 0000000..5e6458b --- /dev/null +++ b/bundles/left4me/files/usr/local/libexec/left4me/left4me-script-sandbox @@ -0,0 +1,82 @@ +#!/bin/bash +# Privileged sandbox launcher for left4me script overlays. +# +# Invoked via sudo by the web user with two arguments: +# numeric overlay id; bind-mounts /var/lib/left4me/overlays/ +# read-write at /overlay inside the sandbox. +# absolute path to a bash file already written by the web app; +# bind-mounted read-only at /script.sh inside the sandbox. +# +# The script runs as a transient systemd .service with the full hardening +# surface: cgroup limits + walltime kill, NoNewPrivileges, ProtectSystem, +# ProtectHome, kernel-tunable / -module / -log protection, namespace +# restriction, address-family restriction, capability bounding (empty), +# seccomp filter (@system-service @network-io), MemoryDenyWriteExecute, +# LockPersonality, RestrictSUIDSGID. Network namespace is *not* restricted — +# scripts must reach the public internet to download workshop / l4d2center +# / cedapug content. PID namespace is shared with the host (no +# PrivatePID= directive in systemd); host PIDs are visible via /proc but +# not signal-able due to UID mismatch. +set -euo pipefail + +[[ $# -eq 2 ]] || { echo "usage: $0