fix(l4d2-host): mount overlay via ExecStartPre so enabled units boot cleanly

The lifecycle change to systemctl enable --now (commit 8552c55) made
units auto-start at boot. But the kernel-overlayfs mount is volatile
(reboot kills it), and the web app's start_instance only re-mounts in
response to a UI click. Result: at boot, systemd starts the unit, finds
empty merged/, CHDIR fails, Restart=on-failure spins forever (counter
hit 65 on ckn before this fix landed).

Fix:
- Unit gets `ExecStartPre=/usr/bin/sudo -n .../left4me-overlay mount %i`
  so the overlay is established before the main process starts.
- Helper is now idempotent: if merged is already a mount point, exit 0.
  Required because Restart=on-failure re-runs ExecStartPre on each
  cycle, and the web app's start_instance also calls the helper, so
  both paths would otherwise collide on "already mounted".
- StartLimitBurst=5 + StartLimitIntervalSec=60s caps the restart loop
  instead of letting it spin indefinitely on a fundamental failure.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
mwiegand 2026-05-09 12:47:20 +02:00
parent b62fc08127
commit 519567e156
No known key found for this signature in database
3 changed files with 52 additions and 2 deletions

View file

@ -10,9 +10,18 @@ Group=left4me
EnvironmentFile=/etc/left4me/host.env EnvironmentFile=/etc/left4me/host.env
EnvironmentFile=/var/lib/left4me/instances/%i/instance.env EnvironmentFile=/var/lib/left4me/instances/%i/instance.env
WorkingDirectory=/var/lib/left4me/runtime/%i/merged/left4dead2 WorkingDirectory=/var/lib/left4me/runtime/%i/merged/left4dead2
# At boot the kernel-overlayfs mount is gone (mounts are volatile); the
# web app's start_instance also pre-mounts but doesn't run on auto-start.
# The helper is idempotent — a no-op if already mounted by the web app.
ExecStartPre=/usr/bin/sudo -n /usr/local/libexec/left4me/left4me-overlay mount %i
ExecStart=/var/lib/left4me/installation/srcds_run -game left4dead2 +hostport ${L4D2_PORT} $L4D2_ARGS ExecStart=/var/lib/left4me/installation/srcds_run -game left4dead2 +hostport ${L4D2_PORT} $L4D2_ARGS
Restart=on-failure Restart=on-failure
RestartSec=5 RestartSec=5
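# Bound the restart loop. systemd's default start limit (5 starts within
# 10s) is never reached with RestartSec=5, so without a wider window a
# persistent ExecStartPre or ExecStart failure spins indefinitely.
# NOTE(review): StartLimitIntervalSec= is documented as a [Unit]-section
# setting — confirm these two lines are not ignored where they are placed.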
StartLimitBurst=5
StartLimitIntervalSec=60s
# Resource control baseline — see docs/superpowers/specs/2026-05-09-l4d2-server-host-perf-baseline-design.md # Resource control baseline — see docs/superpowers/specs/2026-05-09-l4d2-server-host-perf-baseline-design.md
Slice=l4d2-game.slice Slice=l4d2-game.slice

View file

@ -127,16 +127,30 @@ def exec_or_print(argv: list[str]) -> None:
def cmd_mount(name: str) -> None: def cmd_mount(name: str) -> None:
name = validate_name(name) name = validate_name(name)
r = root() r = root()
runtime_name_dir = (r / "runtime" / name).resolve(strict=True)
merged_for_check = (runtime_name_dir / "merged").resolve(strict=True)
# Idempotency for unit restart cycles: if a previous start mounted
# successfully but ExecStart failed afterwards (and Restart=on-failure
# fires another cycle), the second ExecStartPre would otherwise refuse
# to mount-on-top. Short-circuit here so the second cycle just gets
# straight to ExecStart. This also handles the dual-path case where
# both the web app's start_instance and the unit's ExecStartPre call
# the helper.
if os.path.ismount(merged_for_check):
if os.environ.get("LEFT4ME_OVERLAY_PRINT_ONLY") == "1":
print("ALREADY_MOUNTED")
return
instance_env = r / "instances" / name / "instance.env" instance_env = r / "instances" / name / "instance.env"
raw_lowerdirs = parse_lowerdirs(instance_env) raw_lowerdirs = parse_lowerdirs(instance_env)
allowed_roots = [(r / sub).resolve() for sub in LOWERDIR_ALLOWLIST] allowed_roots = [(r / sub).resolve() for sub in LOWERDIR_ALLOWLIST]
canonical_lowerdirs = [str(canonical_under(allowed_roots, Path(p))) for p in raw_lowerdirs] canonical_lowerdirs = [str(canonical_under(allowed_roots, Path(p))) for p in raw_lowerdirs]
runtime_name_dir = (r / "runtime" / name).resolve(strict=True)
upper = (runtime_name_dir / "upper").resolve(strict=True) upper = (runtime_name_dir / "upper").resolve(strict=True)
work = (runtime_name_dir / "work").resolve(strict=True) work = (runtime_name_dir / "work").resolve(strict=True)
merged = (runtime_name_dir / "merged").resolve(strict=True) merged = merged_for_check
for label, path in (("upper", upper), ("work", work), ("merged", merged)): for label, path in (("upper", upper), ("work", work), ("merged", merged)):
if path.parent != runtime_name_dir: if path.parent != runtime_name_dir:
die(f"{label} resolved outside runtime/{name}: {path}") die(f"{label} resolved outside runtime/{name}: {path}")

View file

@ -78,6 +78,33 @@ def test_server_unit_contains_required_runtime_contract():
assert "LockPersonality=true" in unit assert "LockPersonality=true" in unit
def test_server_unit_mounts_overlay_via_exec_start_pre():
    """Check that the unit mounts the overlay itself and bounds restarts.

    At boot, systemd auto-starts enabled units before the web app gets a
    chance to run start_instance's pre-start mount, so the unit itself must
    re-mount the overlay for reboots to be transparent. Pairs with the
    helper's idempotency check
    (test_overlay_helper_mount_is_idempotent_when_already_mounted).
    """
    unit = SERVER_UNIT.read_text()
    assert (
        "ExecStartPre=/usr/bin/sudo -n /usr/local/libexec/left4me/left4me-overlay mount %i"
        in unit
    )
    # Bound the restart loop; without these, a CHDIR failure (or any other
    # pre-start error) spins indefinitely.
    assert "StartLimitBurst=5" in unit
    assert "StartLimitIntervalSec=60s" in unit
def test_overlay_helper_mount_is_idempotent_when_already_mounted():
    """Check that the overlay helper guards against double-mounting.

    ExecStartPre runs again on every Restart=on-failure cycle, so a start
    whose mount succeeded but whose ExecStart failed is followed by a second
    mount attempt on an already-mounted merged directory -- which fails.
    The helper must therefore short-circuit when merged is already a mount
    point; this test only verifies that the helper's source contains an
    os.path.ismount check.
    """
    helper_source = OVERLAY_HELPER.read_text()
    assert "os.path.ismount" in helper_source
def test_server_unit_contains_perf_baseline_directives(): def test_server_unit_contains_perf_baseline_directives():
unit = SERVER_UNIT.read_text() unit = SERVER_UNIT.read_text()