diff --git a/deploy/files/usr/local/lib/systemd/system/left4me-server@.service b/deploy/files/usr/local/lib/systemd/system/left4me-server@.service index e6f5d4f..80158b8 100644 --- a/deploy/files/usr/local/lib/systemd/system/left4me-server@.service +++ b/deploy/files/usr/local/lib/systemd/system/left4me-server@.service @@ -29,10 +29,22 @@ WorkingDirectory=-/var/lib/left4me/runtime/%i/merged/left4dead2 # # `+` prefix runs the helper as PID 1 (root, no sandbox). Required because # the unit has NoNewPrivileges=true, which blocks sudo's setuid escalation -# — and the helper itself needs root to nsenter into PID 1's mnt namespace -# anyway. ExecStopPost (not ExecStop) so unmount runs after the cgroup is -# cleared; ExecStop runs while srcds is still alive and would EBUSY. -ExecStartPre=+/usr/local/libexec/left4me/left4me-overlay mount %i +# — and the helper itself needs root for the mount/umount syscalls. +# +# `nsenter --mount=/proc/1/ns/mnt --` runs the helper Python interpreter +# in PID 1's mount namespace. Without this, the `+` prefix removes the +# sandbox/credentials but does NOT detach from the unit's per-service +# mount namespace (created by PrivateTmp/Protect*) — so the helper +# process itself would hold a reference to that namespace, keeping the +# slave-mount tree alive after the cgroup empties, and umount in PID 1 +# would return EBUSY for as long as the helper ran. Putting nsenter at +# the unit-level (as opposed to inside the helper, where only the +# umount syscall escaped) is what actually frees the namespace. Once +# the helper is in PID 1's namespace, ExecStopPost's umount succeeds +# on the first try with no retry/race window. ExecStopPost (not +# ExecStop) so unmount runs after the cgroup is cleared; ExecStop runs +# while srcds is still alive and would EBUSY. +ExecStartPre=+/usr/bin/nsenter --mount=/proc/1/ns/mnt -- /usr/local/libexec/left4me/left4me-overlay mount %i # Run from the merged overlay, NOT installation/. srcds_run is a shell # script that `cd`s to its own dirname before exec'ing srcds_linux, so the # binary's path determines where the engine reads gameinfo.txt and addons @@ -40,7 +52,7 @@ ExecStartPre=+/usr/local/libexec/left4me/left4me-overlay mount %i # would resolve everything against the lower layer and never see overlay- # provided plugins (Metamod/SourceMod) or cfgs (zonemod, confogl). ExecStart=/var/lib/left4me/runtime/%i/merged/srcds_run -game left4dead2 +hostport ${L4D2_PORT} $L4D2_ARGS -ExecStopPost=+/usr/local/libexec/left4me/left4me-overlay umount %i +ExecStopPost=+/usr/bin/nsenter --mount=/proc/1/ns/mnt -- /usr/local/libexec/left4me/left4me-overlay umount %i Restart=on-failure RestartSec=5 diff --git a/deploy/files/usr/local/libexec/left4me/left4me-overlay b/deploy/files/usr/local/libexec/left4me/left4me-overlay index ba27134..ccbb4c0 100644 --- a/deploy/files/usr/local/libexec/left4me/left4me-overlay +++ b/deploy/files/usr/local/libexec/left4me/left4me-overlay @@ -1,10 +1,19 @@ #!/usr/bin/python3 """Privileged overlay mount helper for left4me. -Invoked via sudo by the left4me runtime user. Validates inputs strictly, -then enters PID 1's mount namespace via nsenter to perform the actual -mount/umount syscall, so the resulting mount lives in the host namespace -and is visible to the systemd-managed gameserver units. +Invoked from the systemd unit's ExecStartPre / ExecStopPost via +`+/usr/bin/nsenter --mount=/proc/1/ns/mnt -- …`. The unit-level +nsenter is what makes this work: it runs the helper Python interpreter +inside PID 1's mount namespace. Without it, the `+` Exec prefix +removes the sandbox/credentials but does NOT detach from the unit's +per-service mount namespace, and the helper process itself would pin +that namespace alive — turning every umount into a multi-second EBUSY +race with the kernel's deferred namespace cleanup. With the unit-level +nsenter the helper has no such reference and umount succeeds first try. + +Validates inputs strictly, then performs `mount -t overlay` / +`umount` directly — no internal nsenter, since the helper is already +running where the syscalls need to take effect. Verbs: mount Reads ${LEFT4ME_ROOT}/instances//instance.env @@ -12,7 +21,8 @@ Verbs: under one of installation/overlays/workshop_cache/ global_overlay_cache, then mounts the kernel overlay at runtime//merged. - umount Unmounts runtime//merged. + umount Unmounts runtime//merged and cleans up the + kernel-overlayfs `work/work` orphan. Set LEFT4ME_OVERLAY_PRINT_ONLY=1 to print the would-be argv (one line, shell-quoted) and exit 0 instead of execv. Used by tests. @@ -21,6 +31,8 @@ shell-quoted) and exit 0 instead of execv. Used by tests. import os import re import shlex +import shutil +import subprocess import sys from pathlib import Path @@ -33,7 +45,6 @@ LOWERDIR_ALLOWLIST = ( "workshop_cache", ) MAX_LOWERDIRS = 500 -NSENTER = "/usr/bin/nsenter" MOUNT_BIN = "/bin/mount" UMOUNT_BIN = "/bin/umount" @@ -159,9 +170,6 @@ def cmd_mount(name: str) -> None: options = f"lowerdir={':'.join(canonical_lowerdirs)},upperdir={upper},workdir={work}" argv = [ - NSENTER, - "--mount=/proc/1/ns/mnt", - "--", MOUNT_BIN, "-t", "overlay", "overlay", @@ -175,29 +183,49 @@ def cmd_umount(name: str) -> None: name = validate_name(name) r = root() runtime_name_dir = (r / "runtime" / name).resolve(strict=True) - merged = (runtime_name_dir / "merged").resolve(strict=True) - if merged.parent != runtime_name_dir: - die(f"merged resolved outside runtime/{name}: {merged}") - - # Idempotency: if merged isn't a mount point right now, we have nothing - # to do. Mirrors cmd_mount's symmetric check. ExecStopPost on the unit - # is the one canonical caller, but a manual `systemctl reset-failed` - # cycle or a redundant cleanup pass should still be a no-op. PRINT_ONLY - # bypasses for the same reason as cmd_mount above. - if ( - os.environ.get("LEFT4ME_OVERLAY_PRINT_ONLY") != "1" - and not os.path.ismount(merged) - ): - return + merged_path = runtime_name_dir / "merged" + work_inner = runtime_name_dir / "work" / "work" argv = [ - NSENTER, - "--mount=/proc/1/ns/mnt", - "--", UMOUNT_BIN, - str(merged), + # Resolve only if it exists; PRINT_ONLY tests always pre-create it. + str(merged_path.resolve(strict=True) if merged_path.exists() else merged_path), ] - exec_or_print(argv) + + # PRINT_ONLY: emit the umount argv and exit. Tests assert exact shape + # of this dry-run; the post-umount cleanup of work_inner is a runtime + # behaviour exercised on the host, not in unit tests. + if os.environ.get("LEFT4ME_OVERLAY_PRINT_ONLY") == "1": + print(" ".join(shlex.quote(a) for a in argv)) + sys.exit(0) + + if merged_path.exists(): + merged = merged_path.resolve(strict=True) + if merged.parent != runtime_name_dir: + die(f"merged resolved outside runtime/{name}: {merged}") + # Idempotency: only umount if currently a mount point. Mirrors + # cmd_mount's symmetric check; a redundant cleanup pass — or a + # call after a partial _purge_instance — must be a no-op. + # + # No retry loop here: with the helper running in PID 1's mount + # namespace (via the unit-level `nsenter --mount=/proc/1/ns/mnt` + # in ExecStopPost), it holds no reference to the unit's + # per-service mount namespace, so the cgroup-empty → namespace + # reaped → umount-clears sequence happens without any race + # window for us to ride out. EBUSY here is a real error. + if os.path.ismount(merged): + subprocess.run(argv, check=True) + + # Kernel-overlayfs creates work_inner during mount with root:root mode + # 0/0. After unmount it's an orphan that the unit's User= (left4me) + # cannot traverse via shutil.rmtree, so reset/delete in instances.py + # blows up with EACCES on `runtime//work/work`. The helper is + # the only code path with root that knows about this directory, so + # the cleanup belongs here. Safe to nuke — the kernel re-creates it + # on the next mount. Run unconditionally — covers both "we just + # unmounted" and "previous teardown didn't finish" cases. + if work_inner.exists(): + shutil.rmtree(work_inner) def main(argv: list[str]) -> None: diff --git a/deploy/tests/test_deploy_artifacts.py b/deploy/tests/test_deploy_artifacts.py index 09fe5ce..c8c2928 100644 --- a/deploy/tests/test_deploy_artifacts.py +++ b/deploy/tests/test_deploy_artifacts.py @@ -90,13 +90,20 @@ def test_server_unit_mounts_overlay_via_exec_start_pre(): chance to run start_instance's pre-start mount. The unit itself must re-mount the overlay so reboots are transparent. Pairs with the helper's idempotency check (test_overlay_helper_mount_is_idempotent_when_mounted). + + The unit-level `nsenter --mount=/proc/1/ns/mnt --` is what makes + umount fast: without it, the helper Python process would inherit + the unit's per-service mount namespace and pin it alive, blocking + PID 1's umount until the helper exited. Wrapping with nsenter at + the Exec line puts the helper itself in PID 1's namespace. """ unit = SERVER_UNIT.read_text() - # `+` prefix: ExecStartPre runs as PID 1 (root, no sandbox). Required - # because the unit has NoNewPrivileges=true, which blocks sudo's setuid - # escalation — and the helper needs root for nsenter anyway. + # `+` prefix: runs as PID 1 (root, no sandbox). Required because + # the unit has NoNewPrivileges=true, which blocks sudo's setuid + # escalation — and the helper needs root for the mount syscall. assert ( - "ExecStartPre=+/usr/local/libexec/left4me/left4me-overlay mount %i" + "ExecStartPre=+/usr/bin/nsenter --mount=/proc/1/ns/mnt -- " + "/usr/local/libexec/left4me/left4me-overlay mount %i" in unit ) # Bound the restart loop; without these, a CHDIR-failure (or any other @@ -108,12 +115,17 @@ def test_server_unit_mounts_overlay_via_exec_start_pre(): def test_server_unit_unmounts_overlay_via_exec_stop_post(): """Single source of truth for unmount, mirroring the mount path. ExecStopPost (not ExecStop) so it runs after srcds has fully exited - and the cgroup is cleared — otherwise the open files in merged/ would - EBUSY the umount syscall. + and the cgroup is cleared. + + Same nsenter-at-Exec-line wrapping as ExecStartPre — without it, + the helper process would itself hold a reference to the unit's + per-service mount namespace, and umount in PID 1 would loop on + EBUSY until the helper gave up. With it, umount succeeds first try. """ unit = SERVER_UNIT.read_text() assert ( - "ExecStopPost=+/usr/local/libexec/left4me/left4me-overlay umount %i" + "ExecStopPost=+/usr/bin/nsenter --mount=/proc/1/ns/mnt -- " + "/usr/local/libexec/left4me/left4me-overlay umount %i" in unit ) @@ -157,7 +169,10 @@ def test_server_unit_contains_perf_baseline_directives(): # Plenty of fds for plugin-heavy setups. assert "LimitNOFILE=65536" in unit - # srcds clean shutdown via SIGINT, with time to flush. + # srcds clean shutdown via SIGINT, with time to flush. With the + # helper running in PID 1's mount namespace (via the unit-level + # nsenter on ExecStopPost), umount has no race window and the + # default 15 s is plenty for the whole stop transition. assert "KillSignal=SIGINT" in unit assert "TimeoutStopSec=15s" in unit diff --git a/l4d2host/tests/test_overlay_helper.py b/l4d2host/tests/test_overlay_helper.py index 0e3977e..a098daf 100644 --- a/l4d2host/tests/test_overlay_helper.py +++ b/l4d2host/tests/test_overlay_helper.py @@ -52,28 +52,33 @@ def _run(args: list[str], root: Path, extra_env: dict[str, str] | None = None) - ) -def test_mount_prints_expected_nsenter_command(tmp_path: Path) -> None: +def test_mount_prints_expected_command(tmp_path: Path) -> None: + """The helper invokes /bin/mount directly. nsenter into PID 1's + mount namespace happens at the systemd Exec line (see the unit + file), so by the time the helper runs, the syscall already lands + in the right namespace. + """ _setup_instance(tmp_path) result = _run(["mount", "alpha"], tmp_path) assert result.returncode == 0, result.stderr parts = shlex.split(result.stdout.strip()) - assert parts[0] == "/usr/bin/nsenter" - assert parts[1] == "--mount=/proc/1/ns/mnt" - assert parts[2] == "--" - assert parts[3] == "/bin/mount" - assert parts[4:6] == ["-t", "overlay"] - assert parts[6] == "overlay" - assert parts[7] == "-o" - options = parts[8] + assert parts[0] == "/bin/mount" + assert parts[1:3] == ["-t", "overlay"] + assert parts[3] == "overlay" + assert parts[4] == "-o" + options = parts[5] assert f"upperdir={tmp_path}/runtime/alpha/upper" in options assert f"workdir={tmp_path}/runtime/alpha/work" in options assert f"lowerdir={tmp_path}/overlays/workshop:{tmp_path}/installation" in options - assert parts[9] == str(tmp_path / "runtime" / "alpha" / "merged") + assert parts[6] == str(tmp_path / "runtime" / "alpha" / "merged") -def test_umount_prints_expected_nsenter_command(tmp_path: Path) -> None: +def test_umount_prints_expected_command(tmp_path: Path) -> None: + """Same as the mount path: helper invokes /bin/umount directly, + relying on the unit-level nsenter to put it in PID 1's namespace. + """ _setup_instance(tmp_path) result = _run(["umount", "alpha"], tmp_path) @@ -81,9 +86,6 @@ def test_umount_prints_expected_nsenter_command(tmp_path: Path) -> None: assert result.returncode == 0, result.stderr parts = shlex.split(result.stdout.strip()) assert parts == [ - "/usr/bin/nsenter", - "--mount=/proc/1/ns/mnt", - "--", "/bin/umount", str(tmp_path / "runtime" / "alpha" / "merged"), ]