fix(l4d2-host): mount overlay via ExecStartPre so enabled units boot cleanly
The lifecycle change to systemctl enable --now (commit 8552c55) made
units auto-start at boot. But the kernel-overlayfs mount is volatile
(reboot kills it), and the web app's start_instance only re-mounts in
response to a UI click. Result: at boot, systemd starts the unit, finds an
empty merged/, the CHDIR into WorkingDirectory fails, and Restart=on-failure
spins forever (the restart counter hit 65 on ckn before this fix landed).
Fix:
- Unit gets `ExecStartPre=/usr/bin/sudo -n .../left4me-overlay mount %i`
so the overlay is established before the main process starts.
- Helper is now idempotent: if merged is already a mount point, exit 0.
Required because Restart=on-failure re-runs ExecStartPre on each
cycle, and the web app's start_instance also calls the helper, so
both paths would otherwise collide on "already mounted".
- StartLimitBurst=5 + StartLimitIntervalSec=60s caps the restart loop
instead of letting it spin indefinitely on a fundamental failure.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
b62fc08127
commit
519567e156
3 changed files with 52 additions and 2 deletions
|
|
@ -10,9 +10,18 @@ Group=left4me
|
||||||
EnvironmentFile=/etc/left4me/host.env
|
EnvironmentFile=/etc/left4me/host.env
|
||||||
EnvironmentFile=/var/lib/left4me/instances/%i/instance.env
|
EnvironmentFile=/var/lib/left4me/instances/%i/instance.env
|
||||||
WorkingDirectory=/var/lib/left4me/runtime/%i/merged/left4dead2
|
WorkingDirectory=/var/lib/left4me/runtime/%i/merged/left4dead2
|
||||||
|
# At boot the kernel-overlayfs mount is gone (mounts are volatile); the
|
||||||
|
# web app's start_instance also pre-mounts but doesn't run on auto-start.
|
||||||
|
# The helper is idempotent — a no-op if already mounted by the web app.
|
||||||
|
ExecStartPre=/usr/bin/sudo -n /usr/local/libexec/left4me/left4me-overlay mount %i
|
||||||
ExecStart=/var/lib/left4me/installation/srcds_run -game left4dead2 +hostport ${L4D2_PORT} $L4D2_ARGS
|
ExecStart=/var/lib/left4me/installation/srcds_run -game left4dead2 +hostport ${L4D2_PORT} $L4D2_ARGS
|
||||||
Restart=on-failure
|
Restart=on-failure
|
||||||
RestartSec=5
|
RestartSec=5
|
||||||
|
# Bound the restart loop. Without these, a persistent ExecStartPre or
|
||||||
|
# ExecStart failure spins indefinitely: with RestartSec=5 the attempts are
|
||||||
|
# too far apart to ever trip systemd's default limit of 5 starts per 10s.
|
||||||
|
StartLimitBurst=5
|
||||||
|
StartLimitIntervalSec=60s
|
||||||
|
|
||||||
# Resource control baseline — see docs/superpowers/specs/2026-05-09-l4d2-server-host-perf-baseline-design.md
|
# Resource control baseline — see docs/superpowers/specs/2026-05-09-l4d2-server-host-perf-baseline-design.md
|
||||||
Slice=l4d2-game.slice
|
Slice=l4d2-game.slice
|
||||||
|
|
|
||||||
|
|
@ -127,16 +127,30 @@ def exec_or_print(argv: list[str]) -> None:
|
||||||
def cmd_mount(name: str) -> None:
|
def cmd_mount(name: str) -> None:
|
||||||
name = validate_name(name)
|
name = validate_name(name)
|
||||||
r = root()
|
r = root()
|
||||||
|
runtime_name_dir = (r / "runtime" / name).resolve(strict=True)
|
||||||
|
merged_for_check = (runtime_name_dir / "merged").resolve(strict=True)
|
||||||
|
|
||||||
|
# Idempotency for unit restart cycles: if a previous start mounted
|
||||||
|
# successfully but ExecStart failed afterwards (and Restart=on-failure
|
||||||
|
# fires another cycle), the second ExecStartPre would otherwise refuse
|
||||||
|
# to mount-on-top. Short-circuit here so the second cycle just gets
|
||||||
|
# straight to ExecStart. This also handles the dual-path case where
|
||||||
|
# both the web app's start_instance and the unit's ExecStartPre call
|
||||||
|
# the helper.
|
||||||
|
if os.path.ismount(merged_for_check):
|
||||||
|
if os.environ.get("LEFT4ME_OVERLAY_PRINT_ONLY") == "1":
|
||||||
|
print("ALREADY_MOUNTED")
|
||||||
|
return
|
||||||
|
|
||||||
instance_env = r / "instances" / name / "instance.env"
|
instance_env = r / "instances" / name / "instance.env"
|
||||||
raw_lowerdirs = parse_lowerdirs(instance_env)
|
raw_lowerdirs = parse_lowerdirs(instance_env)
|
||||||
|
|
||||||
allowed_roots = [(r / sub).resolve() for sub in LOWERDIR_ALLOWLIST]
|
allowed_roots = [(r / sub).resolve() for sub in LOWERDIR_ALLOWLIST]
|
||||||
canonical_lowerdirs = [str(canonical_under(allowed_roots, Path(p))) for p in raw_lowerdirs]
|
canonical_lowerdirs = [str(canonical_under(allowed_roots, Path(p))) for p in raw_lowerdirs]
|
||||||
|
|
||||||
runtime_name_dir = (r / "runtime" / name).resolve(strict=True)
|
|
||||||
upper = (runtime_name_dir / "upper").resolve(strict=True)
|
upper = (runtime_name_dir / "upper").resolve(strict=True)
|
||||||
work = (runtime_name_dir / "work").resolve(strict=True)
|
work = (runtime_name_dir / "work").resolve(strict=True)
|
||||||
merged = (runtime_name_dir / "merged").resolve(strict=True)
|
merged = merged_for_check
|
||||||
for label, path in (("upper", upper), ("work", work), ("merged", merged)):
|
for label, path in (("upper", upper), ("work", work), ("merged", merged)):
|
||||||
if path.parent != runtime_name_dir:
|
if path.parent != runtime_name_dir:
|
||||||
die(f"{label} resolved outside runtime/{name}: {path}")
|
die(f"{label} resolved outside runtime/{name}: {path}")
|
||||||
|
|
|
||||||
|
|
@ -78,6 +78,33 @@ def test_server_unit_contains_required_runtime_contract():
|
||||||
assert "LockPersonality=true" in unit
|
assert "LockPersonality=true" in unit
|
||||||
|
|
||||||
|
|
||||||
|
def test_server_unit_mounts_overlay_via_exec_start_pre():
|
||||||
|
"""At boot, systemd auto-starts enabled units before the web app gets a
|
||||||
|
chance to run start_instance's pre-start mount. The unit itself must
|
||||||
|
re-mount the overlay so reboots are transparent. Pairs with the helper's
|
||||||
|
idempotency check (test_overlay_helper_mount_is_idempotent_when_already_mounted).
|
||||||
|
"""
|
||||||
|
unit = SERVER_UNIT.read_text()
|
||||||
|
assert (
|
||||||
|
"ExecStartPre=/usr/bin/sudo -n /usr/local/libexec/left4me/left4me-overlay mount %i"
|
||||||
|
in unit
|
||||||
|
)
|
||||||
|
# Bound the restart loop; without these, a CHDIR-failure (or any other
|
||||||
|
# pre-start error) spins indefinitely.
|
||||||
|
assert "StartLimitBurst=5" in unit
|
||||||
|
assert "StartLimitIntervalSec=60s" in unit
|
||||||
|
|
||||||
|
|
||||||
|
def test_overlay_helper_mount_is_idempotent_when_already_mounted():
|
||||||
|
"""ExecStartPre runs on every Restart=on-failure cycle. If a previous
|
||||||
|
start mounted successfully but ExecStart failed afterwards, the next
|
||||||
|
ExecStartPre would re-mount on top -- which fails. The helper must
|
||||||
|
short-circuit when merged is already a mount point.
|
||||||
|
"""
|
||||||
|
text = OVERLAY_HELPER.read_text()
|
||||||
|
assert "os.path.ismount" in text
|
||||||
|
|
||||||
|
|
||||||
def test_server_unit_contains_perf_baseline_directives():
|
def test_server_unit_contains_perf_baseline_directives():
|
||||||
unit = SERVER_UNIT.read_text()
|
unit = SERVER_UNIT.read_text()
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue