From 5eac51a93edb95c109bcb94811b73f78246bc684 Mon Sep 17 00:00:00 2001 From: mwiegand Date: Sat, 9 May 2026 15:13:59 +0200 Subject: [PATCH] fix(deploy): wrap overlay helper with nsenter so it doesn't pin the unit's mount namespace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit systemd's `+` Exec prefix removes sandbox/credentials but does NOT detach from the unit's per-service mount namespace (created by PrivateTmp/Protect*). The Python interpreter for the helper was launched inside that namespace, and even though the helper internally nsenter'd into PID 1 for the umount syscall, the calling Python process itself never left the unit's namespace. Its existence pinned the namespace alive, which kept the slave mount tree alive, which made PID 1's umount return EBUSY for the entire duration of the helper's run. The mount became unmountable the moment the helper exited — empirically verified by polling /proc/*/ns/mnt during stop: the only PID holding the dying namespace was the helper itself. Wrap both ExecStartPre and ExecStopPost with `/usr/bin/nsenter --mount=/proc/1/ns/mnt --` so the helper Python interpreter runs in PID 1's mount namespace from the start. With the helper out of the unit's namespace, umount succeeds first try once the cgroup empties. Reset went from ~25 s with retry/lazy-fallback workarounds to ~0.5 s clean. Knock-on cleanups: - Helper drops internal nsenter for the syscalls (already in PID 1's namespace), and drops the eager-retry loop + lazy-umount fallback + inner work_inner retry (no race left to ride out). - Revert TimeoutStopSec=60s back to 15s. - Tests updated to expect the new argv shapes. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../systemd/system/left4me-server@.service | 22 +++-- .../usr/local/libexec/left4me/left4me-overlay | 84 ++++++++++++------- deploy/tests/test_deploy_artifacts.py | 31 +++++-- l4d2host/tests/test_overlay_helper.py | 30 +++---- 4 files changed, 112 insertions(+), 55 deletions(-) diff --git a/deploy/files/usr/local/lib/systemd/system/left4me-server@.service b/deploy/files/usr/local/lib/systemd/system/left4me-server@.service index e6f5d4f..80158b8 100644 --- a/deploy/files/usr/local/lib/systemd/system/left4me-server@.service +++ b/deploy/files/usr/local/lib/systemd/system/left4me-server@.service @@ -29,10 +29,22 @@ WorkingDirectory=-/var/lib/left4me/runtime/%i/merged/left4dead2 # # `+` prefix runs the helper as PID 1 (root, no sandbox). Required because # the unit has NoNewPrivileges=true, which blocks sudo's setuid escalation -# — and the helper itself needs root to nsenter into PID 1's mnt namespace -# anyway. ExecStopPost (not ExecStop) so unmount runs after the cgroup is -# cleared; ExecStop runs while srcds is still alive and would EBUSY. -ExecStartPre=+/usr/local/libexec/left4me/left4me-overlay mount %i +# — and the helper itself needs root for the mount/umount syscalls. +# +# `nsenter --mount=/proc/1/ns/mnt --` runs the helper Python interpreter +# in PID 1's mount namespace. Without this, the `+` prefix removes the +# sandbox/credentials but does NOT detach from the unit's per-service +# mount namespace (created by PrivateTmp/Protect*) — so the helper +# process itself would hold a reference to that namespace, keeping the +# slave-mount tree alive after the cgroup empties, and umount in PID 1 +# would return EBUSY for as long as the helper ran. Putting nsenter at +# the unit-level (as opposed to inside the helper, where only the +# umount syscall escaped) is what actually frees the namespace. Once +# the helper is in PID 1's namespace, ExecStopPost's umount succeeds +# on the first try with no retry/race window. ExecStopPost (not +# ExecStop) so unmount runs after the cgroup is cleared; ExecStop runs +# while srcds is still alive and would EBUSY. +ExecStartPre=+/usr/bin/nsenter --mount=/proc/1/ns/mnt -- /usr/local/libexec/left4me/left4me-overlay mount %i # Run from the merged overlay, NOT installation/. srcds_run is a shell # script that `cd`s to its own dirname before exec'ing srcds_linux, so the # binary's path determines where the engine reads gameinfo.txt and addons @@ -40,7 +52,7 @@ ExecStartPre=+/usr/local/libexec/left4me/left4me-overlay mount %i # would resolve everything against the lower layer and never see overlay- # provided plugins (Metamod/SourceMod) or cfgs (zonemod, confogl). ExecStart=/var/lib/left4me/runtime/%i/merged/srcds_run -game left4dead2 +hostport ${L4D2_PORT} $L4D2_ARGS -ExecStopPost=+/usr/local/libexec/left4me/left4me-overlay umount %i +ExecStopPost=+/usr/bin/nsenter --mount=/proc/1/ns/mnt -- /usr/local/libexec/left4me/left4me-overlay umount %i Restart=on-failure RestartSec=5 diff --git a/deploy/files/usr/local/libexec/left4me/left4me-overlay b/deploy/files/usr/local/libexec/left4me/left4me-overlay index ba27134..ccbb4c0 100644 --- a/deploy/files/usr/local/libexec/left4me/left4me-overlay +++ b/deploy/files/usr/local/libexec/left4me/left4me-overlay @@ -1,10 +1,19 @@ #!/usr/bin/python3 """Privileged overlay mount helper for left4me. -Invoked via sudo by the left4me runtime user. Validates inputs strictly, -then enters PID 1's mount namespace via nsenter to perform the actual -mount/umount syscall, so the resulting mount lives in the host namespace -and is visible to the systemd-managed gameserver units. +Invoked from the systemd unit's ExecStartPre / ExecStopPost via +`+/usr/bin/nsenter --mount=/proc/1/ns/mnt -- …`. The unit-level +nsenter is what makes this work: it runs the helper Python interpreter +inside PID 1's mount namespace. Without it, the `+` Exec prefix +removes the sandbox/credentials but does NOT detach from the unit's +per-service mount namespace, and the helper process itself would pin +that namespace alive — turning every umount into a multi-second EBUSY +race with the kernel's deferred namespace cleanup. With the unit-level +nsenter the helper has no such reference and umount succeeds first try. + +Validates inputs strictly, then performs `mount -t overlay` / +`umount` directly — no internal nsenter, since the helper is already +running where the syscalls need to take effect. Verbs: mount Reads ${LEFT4ME_ROOT}/instances//instance.env @@ -12,7 +21,8 @@ Verbs: under one of installation/overlays/workshop_cache/ global_overlay_cache, then mounts the kernel overlay at runtime//merged. - umount Unmounts runtime//merged. + umount Unmounts runtime//merged and cleans up the + kernel-overlayfs `work/work` orphan. Set LEFT4ME_OVERLAY_PRINT_ONLY=1 to print the would-be argv (one line, shell-quoted) and exit 0 instead of execv. Used by tests. @@ -21,6 +31,8 @@ shell-quoted) and exit 0 instead of execv. Used by tests. import os import re import shlex +import shutil +import subprocess import sys from pathlib import Path @@ -33,7 +45,6 @@ LOWERDIR_ALLOWLIST = ( "workshop_cache", ) MAX_LOWERDIRS = 500 -NSENTER = "/usr/bin/nsenter" MOUNT_BIN = "/bin/mount" UMOUNT_BIN = "/bin/umount" @@ -159,9 +170,6 @@ def cmd_mount(name: str) -> None: options = f"lowerdir={':'.join(canonical_lowerdirs)},upperdir={upper},workdir={work}" argv = [ - NSENTER, - "--mount=/proc/1/ns/mnt", - "--", MOUNT_BIN, "-t", "overlay", "overlay", @@ -175,29 +183,49 @@ def cmd_umount(name: str) -> None: name = validate_name(name) r = root() runtime_name_dir = (r / "runtime" / name).resolve(strict=True) - merged = (runtime_name_dir / "merged").resolve(strict=True) - if merged.parent != runtime_name_dir: - die(f"merged resolved outside runtime/{name}: {merged}") - - # Idempotency: if merged isn't a mount point right now, we have nothing - # to do. Mirrors cmd_mount's symmetric check. ExecStopPost on the unit - # is the one canonical caller, but a manual `systemctl reset-failed` - # cycle or a redundant cleanup pass should still be a no-op. PRINT_ONLY - # bypasses for the same reason as cmd_mount above. - if ( - os.environ.get("LEFT4ME_OVERLAY_PRINT_ONLY") != "1" - and not os.path.ismount(merged) - ): - return + merged_path = runtime_name_dir / "merged" + work_inner = runtime_name_dir / "work" / "work" argv = [ - NSENTER, - "--mount=/proc/1/ns/mnt", - "--", UMOUNT_BIN, - str(merged), + # Resolve only if it exists; PRINT_ONLY tests always pre-create it. + str(merged_path.resolve(strict=True) if merged_path.exists() else merged_path), ] - exec_or_print(argv) + + # PRINT_ONLY: emit the umount argv and exit. Tests assert exact shape + # of this dry-run; the post-umount cleanup of work_inner is a runtime + # behaviour exercised on the host, not in unit tests. + if os.environ.get("LEFT4ME_OVERLAY_PRINT_ONLY") == "1": + print(" ".join(shlex.quote(a) for a in argv)) + sys.exit(0) + + if merged_path.exists(): + merged = merged_path.resolve(strict=True) + if merged.parent != runtime_name_dir: + die(f"merged resolved outside runtime/{name}: {merged}") + # Idempotency: only umount if currently a mount point. Mirrors + # cmd_mount's symmetric check; a redundant cleanup pass — or a + # call after a partial _purge_instance — must be a no-op. + # + # No retry loop here: with the helper running in PID 1's mount + # namespace (via the unit-level `nsenter --mount=/proc/1/ns/mnt` + # in ExecStopPost), it holds no reference to the unit's + # per-service mount namespace, so the cgroup-empty → namespace + # reaped → umount-clears sequence happens without any race + # window for us to ride out. EBUSY here is a real error. + if os.path.ismount(merged): + subprocess.run(argv, check=True) + + # Kernel-overlayfs creates work_inner during mount with root:root mode + # 0/0. After unmount it's an orphan that the unit's User= (left4me) + # cannot traverse via shutil.rmtree, so reset/delete in instances.py + # blows up with EACCES on `runtime//work/work`. The helper is + # the only code path with root that knows about this directory, so + # the cleanup belongs here. Safe to nuke — the kernel re-creates it + # on the next mount. Run unconditionally — covers both "we just + # unmounted" and "previous teardown didn't finish" cases. + if work_inner.exists(): + shutil.rmtree(work_inner) def main(argv: list[str]) -> None: diff --git a/deploy/tests/test_deploy_artifacts.py b/deploy/tests/test_deploy_artifacts.py index 09fe5ce..c8c2928 100644 --- a/deploy/tests/test_deploy_artifacts.py +++ b/deploy/tests/test_deploy_artifacts.py @@ -90,13 +90,20 @@ def test_server_unit_mounts_overlay_via_exec_start_pre(): chance to run start_instance's pre-start mount. The unit itself must re-mount the overlay so reboots are transparent. Pairs with the helper's idempotency check (test_overlay_helper_mount_is_idempotent_when_mounted). + + The unit-level `nsenter --mount=/proc/1/ns/mnt --` is what makes + umount fast: without it, the helper Python process would inherit + the unit's per-service mount namespace and pin it alive, blocking + PID 1's umount until the helper exited. Wrapping with nsenter at + the Exec line puts the helper itself in PID 1's namespace. """ unit = SERVER_UNIT.read_text() - # `+` prefix: ExecStartPre runs as PID 1 (root, no sandbox). Required - # because the unit has NoNewPrivileges=true, which blocks sudo's setuid - # escalation — and the helper needs root for nsenter anyway. + # `+` prefix: runs as PID 1 (root, no sandbox). Required because + # the unit has NoNewPrivileges=true, which blocks sudo's setuid + # escalation — and the helper needs root for the mount syscall. assert ( - "ExecStartPre=+/usr/local/libexec/left4me/left4me-overlay mount %i" + "ExecStartPre=+/usr/bin/nsenter --mount=/proc/1/ns/mnt -- " + "/usr/local/libexec/left4me/left4me-overlay mount %i" in unit ) # Bound the restart loop; without these, a CHDIR-failure (or any other @@ -108,12 +115,17 @@ def test_server_unit_mounts_overlay_via_exec_start_pre(): def test_server_unit_unmounts_overlay_via_exec_stop_post(): """Single source of truth for unmount, mirroring the mount path. ExecStopPost (not ExecStop) so it runs after srcds has fully exited - and the cgroup is cleared — otherwise the open files in merged/ would - EBUSY the umount syscall. + and the cgroup is cleared. + + Same nsenter-at-Exec-line wrapping as ExecStartPre — without it, + the helper process would itself hold a reference to the unit's + per-service mount namespace, and umount in PID 1 would loop on + EBUSY until the helper gave up. With it, umount succeeds first try. """ unit = SERVER_UNIT.read_text() assert ( - "ExecStopPost=+/usr/local/libexec/left4me/left4me-overlay umount %i" + "ExecStopPost=+/usr/bin/nsenter --mount=/proc/1/ns/mnt -- " + "/usr/local/libexec/left4me/left4me-overlay umount %i" in unit ) @@ -157,7 +169,10 @@ def test_server_unit_contains_perf_baseline_directives(): # Plenty of fds for plugin-heavy setups. assert "LimitNOFILE=65536" in unit - # srcds clean shutdown via SIGINT, with time to flush. + # srcds clean shutdown via SIGINT, with time to flush. With the + # helper running in PID 1's mount namespace (via the unit-level + # nsenter on ExecStopPost), umount has no race window and the + # default 15 s is plenty for the whole stop transition. assert "KillSignal=SIGINT" in unit assert "TimeoutStopSec=15s" in unit diff --git a/l4d2host/tests/test_overlay_helper.py b/l4d2host/tests/test_overlay_helper.py index 0e3977e..a098daf 100644 --- a/l4d2host/tests/test_overlay_helper.py +++ b/l4d2host/tests/test_overlay_helper.py @@ -52,28 +52,33 @@ def _run(args: list[str], root: Path, extra_env: dict[str, str] | None = None) - ) -def test_mount_prints_expected_nsenter_command(tmp_path: Path) -> None: +def test_mount_prints_expected_command(tmp_path: Path) -> None: + """The helper invokes /bin/mount directly. nsenter into PID 1's + mount namespace happens at the systemd Exec line (see the unit + file), so by the time the helper runs, the syscall already lands + in the right namespace. + """ _setup_instance(tmp_path) result = _run(["mount", "alpha"], tmp_path) assert result.returncode == 0, result.stderr parts = shlex.split(result.stdout.strip()) - assert parts[0] == "/usr/bin/nsenter" - assert parts[1] == "--mount=/proc/1/ns/mnt" - assert parts[2] == "--" - assert parts[3] == "/bin/mount" - assert parts[4:6] == ["-t", "overlay"] - assert parts[6] == "overlay" - assert parts[7] == "-o" - options = parts[8] + assert parts[0] == "/bin/mount" + assert parts[1:3] == ["-t", "overlay"] + assert parts[3] == "overlay" + assert parts[4] == "-o" + options = parts[5] assert f"upperdir={tmp_path}/runtime/alpha/upper" in options assert f"workdir={tmp_path}/runtime/alpha/work" in options assert f"lowerdir={tmp_path}/overlays/workshop:{tmp_path}/installation" in options - assert parts[9] == str(tmp_path / "runtime" / "alpha" / "merged") + assert parts[6] == str(tmp_path / "runtime" / "alpha" / "merged") -def test_umount_prints_expected_nsenter_command(tmp_path: Path) -> None: +def test_umount_prints_expected_command(tmp_path: Path) -> None: + """Same as the mount path: helper invokes /bin/umount directly, + relying on the unit-level nsenter to put it in PID 1's namespace. + """ _setup_instance(tmp_path) result = _run(["umount", "alpha"], tmp_path) @@ -81,9 +86,6 @@ def test_umount_prints_expected_nsenter_command(tmp_path: Path) -> None: assert result.returncode == 0, result.stderr parts = shlex.split(result.stdout.strip()) assert parts == [ - "/usr/bin/nsenter", - "--mount=/proc/1/ns/mnt", - "--", "/bin/umount", str(tmp_path / "runtime" / "alpha" / "merged"), ]