Compare commits

...

2 commits

Author SHA1 Message Date
mwiegand
dd918aca4b
fix(left4me-overlay): use /proc/self/mountinfo to detect bind mounts
os.path.ismount() compares st_dev against the parent dir, which silently
returns False for same-fs bind mounts. The idmap binds at runtime/<n>/
idmap/<basename> are exactly that case, so:

- cmd_umount skipped the bind-umount step every stop, leaving orphan
  binds in PID 1's mount namespace.
- cmd_mount's idempotency check then "didn't see" the orphan and
  re-bound on top, accumulating one mount per start/stop cycle.

Findmnt nesting like
    /var/lib/left4me/runtime/2/idmap/overlays_9
    └─/var/lib/left4me/runtime/2/idmap/overlays_9
is the visible symptom. Reboot wipes everything so the bug is invisible
on a fresh boot — only stop/start cycles accumulate.

Replace both ismount sites with a _is_mountpoint() helper that reads
/proc/self/mountinfo (column 5 is the mount point). Keep os.path.ismount
for the overlay merged check, where it's reliable (distinct fs type).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-15 01:02:18 +02:00
mwiegand
2b20bffeb8
spec: handoff doc for rethinking deploy/ dir architecture
The 2026-05-15 script-consolidation pass landed a working but
half-finished mental model: deploy/files/ was retroactively promoted
from "historical reference" to "canonical source," but only for the
script files. Several adjacent things (sudoers/sysctl duplication
across both repos, the systemd unit files that ckn-bw's reactor
ignores, deploy-test-server.sh's role, dead-code apply-cake) didn't
get resolved. Capture the open questions and pointers so a future
session can pick this up and commit to a coherent shape.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-15 00:53:55 +02:00
3 changed files with 315 additions and 3 deletions

View file

@ -55,6 +55,31 @@ def die(msg: str) -> None:
sys.exit(1) sys.exit(1)
def _is_mountpoint(
path: str | Path,
mountinfo_path: str = "/proc/self/mountinfo",
) -> bool:
"""Reliable mount-point check that handles same-fs bind mounts.
`os.path.ismount()` compares `st_dev` of the path against its parent;
bind mounts on the same underlying filesystem share `st_dev` with their
parent, so `os.path.ismount()` returns False for them. The idmap binds
we install on `runtime/<n>/idmap/<basename>` are exactly that case.
Read /proc/self/mountinfo (field 5 is the mount point) for a check
that works regardless of mount type. The override is for tests only.
"""
abs_path = os.fspath(Path(path).resolve())
try:
with open(mountinfo_path, "r", encoding="utf-8") as f:
for line in f:
fields = line.split()
if len(fields) >= 5 and fields[4] == abs_path:
return True
except OSError:
pass
return False
def _lookup_uid(username: str) -> tuple[int, int]: def _lookup_uid(username: str) -> tuple[int, int]:
"""Return (uid, gid) for *username*, dying with a clear message if missing.""" """Return (uid, gid) for *username*, dying with a clear message if missing."""
try: try:
@ -256,7 +281,7 @@ def cmd_mount(name: str) -> None:
idmap_dir.mkdir(mode=0o700, exist_ok=True) idmap_dir.mkdir(mode=0o700, exist_ok=True)
idmap_target.mkdir(mode=0o700, exist_ok=True) idmap_target.mkdir(mode=0o700, exist_ok=True)
if not os.path.ismount(idmap_target) or \ if not _is_mountpoint(idmap_target) or \
os.environ.get("LEFT4ME_OVERLAY_PRINT_ONLY") == "1": os.environ.get("LEFT4ME_OVERLAY_PRINT_ONLY") == "1":
# --map-users / --map-groups argument format: # --map-users / --map-groups argument format:
# <on-disk-uid>:<in-mount-uid>:<count> # <on-disk-uid>:<in-mount-uid>:<count>
@ -359,10 +384,11 @@ def cmd_umount(name: str) -> None:
# Unwind idmap bind mounts, then remove the idmap directory. Each bind # Unwind idmap bind mounts, then remove the idmap directory. Each bind
# is only umounted if it is still a mountpoint (idempotent across partial # is only umounted if it is still a mountpoint (idempotent across partial
# teardowns). # teardowns). _is_mountpoint reads /proc/self/mountinfo because
# os.path.ismount misses same-fs bind mounts.
for bind_umount_argv in bind_umount_argvs: for bind_umount_argv in bind_umount_argvs:
target = Path(bind_umount_argv[-1]) target = Path(bind_umount_argv[-1])
if os.path.ismount(target): if _is_mountpoint(target):
subprocess.run(bind_umount_argv, check=True) subprocess.run(bind_umount_argv, check=True)
shutil.rmtree(idmap_dir, ignore_errors=True) shutil.rmtree(idmap_dir, ignore_errors=True)

View file

@ -0,0 +1,242 @@
# Deploy directory architecture — open questions
**Status: open questions, not a settled design.** This is a thinking-aloud
handoff prompted by the script-consolidation change on 2026-05-15. Decisions
deferred; a future session should pick this up, talk through the options,
and commit to one shape.
## What happened on 2026-05-15 that prompted this
Two changes landed in quick succession:
1. `left4me-overlay` grew idmap bind-mount support so kernel-overlayfs copy-up
from `l4d2-sandbox`-owned lowerdirs produces `left4me`-owned upperdir
entries (commits `2f6a9cf` + `9053186`).
2. Consolidated all five privileged scripts (4 libexec helpers + 1 sbin
admin CLI) so left4me owns the source of truth and ckn-bw `install`s
them from `/opt/left4me/src/deploy/files/usr/local/{libexec,sbin}/`
after `git_deploy` (left4me `f5e36ee`, ckn-bw `3ccaa91`).
During (2), several architectural assumptions got revised mid-flight rather
than thought through fully:
- `deploy/README.md` flipped from "Status: superseded, historical reference"
to "deploy/files/ is canonical, only `deploy-test-server.sh` is historical."
- The scripts kept their existing deeply-nested paths under
`deploy/files/usr/local/libexec/left4me/*` rather than moving to a
cleaner top-level layout (an earlier draft of the plan proposed `bin/`,
but the user pushed back on mixing the admin CLI with the helpers).
- The resulting state works but several things feel half-finished. This
document enumerates them so they don't rot.
## Current state to look at before deciding anything
- `deploy/files/usr/local/libexec/left4me/{left4me-systemctl,journalctl,overlay,script-sandbox,apply-cake}`
- `deploy/files/usr/local/sbin/left4me`
- `deploy/files/usr/local/lib/systemd/system/{left4me-server@.service,left4me-web.service,...}`**NOT** deployed; ckn-bw emits via reactor. Currently dead-but-kept-for-reference.
- `deploy/files/etc/{sudoers.d/left4me,sysctl.d/99-left4me.conf,left4me/sandbox-resolv.conf,left4me/cake.env}``sudoers.d/left4me` and `sysctl.d/99-left4me.conf` and `left4me/sandbox-resolv.conf` are shipped (verbatim, from ckn-bw's own copies — **still duplicated!**). `cake.env` is dead code.
- `deploy/templates/etc/left4me/{host.env,web.env.template}` — Mako-rendered by ckn-bw's `bundles/left4me/files/etc/left4me/{host.env.mako,web.env.mako}` (its own copies, **also duplicated**).
- `deploy/deploy-test-server.sh` — superseded one-shot bash installer.
- `deploy/tests/test_deploy_artifacts.py` — pytest assertions over the
files above. Currently canonical / load-bearing.
The script consolidation only handled `usr/local/libexec/left4me/*` and
`usr/local/sbin/left4me`. The other duplicated items above were not in
scope.
## Open question 1: what does `deploy/` mean?
Four framings, not mutually exclusive but each implies different next moves:
- **A. "Files to install onto the target"** — single source of truth for
every deployable artifact (scripts, configs, sudoers, sysctl, units,
env templates). ckn-bw becomes pure orchestration: users, groups,
dirs, apt, venv, install actions reading from deploy/.
- **B. "Deploy-mechanism artifacts only"** — installer scripts, runbook
docs, env-template *examples*. Real project executables live elsewhere
in the repo.
- **C. "Reference documentation of deploy decisions"** — historical-flavored.
Real source-of-truth lives in ckn-bw. This was the framing before
2026-05-15.
- **D. "Configuration for the deploy target"** — sudoers, sysctl,
sandbox-resolv.conf, env. Executables live elsewhere.
Today we drifted into **A** for the scripts, **C** lingering for the
systemd units, partial-A-partial-C for /etc/ stuff, and we promoted the
templates section without changing its actual role. Inconsistent.
Pick one and lean in.
## Open question 2: should the scripts live in deploy/ at all?
Argument for keeping them where they are:
- Source path = deploy target. Self-documenting.
- Zero churn from the just-landed consolidation.
Argument for moving them out (top-level `libexec/`, `sbin/`, or `bin/`):
- `deploy/` has historically meant "deploy mechanism." Putting 381-line
Python code (`left4me-overlay`) there mixes "deploy artifacts" with
"core project logic." `left4me-overlay` is real software; it has
tests, it gets edited like any other code.
- Nesting is deep: `deploy/files/usr/local/libexec/left4me/left4me-overlay`
is 5 levels of dir before the actual file.
- Shorter paths make Python constants more readable (the test file uses
`OVERLAY_HELPER = DEPLOY / "files/usr/local/libexec/left4me/left4me-overlay"`).
Counter to the move:
- The user pushed back on a flat `bin/` because it mixes admin CLI
(`left4me`, sbin role) with internal helpers (`left4me-overlay` et al.,
libexec role). A two-dir top-level layout (`libexec/` + `sbin/`) avoids
that mix at the cost of two top-level dirs.
Open variants:
- Flat top-level `bin/` (mixed roles, simplest)
- Top-level `libexec/` + `sbin/` (role-separated, two top-level dirs)
- Top-level `scripts/` with `libexec/` and `sbin/` subdirs (one umbrella)
- Stay in `deploy/files/usr/local/{libexec,sbin}/` (current)
## Open question 3: what to do with `deploy-test-server.sh`
The script duplicates ckn-bw's install logic in bash form. ckn-bw is
authoritative now; the script is at best stale documentation, at worst
actively misleading (the user almost-but-didn't run it against an ovh.left4me
node during one of the recent debugging passes).
Options:
- **Delete entirely.** ckn-bw is the deploy. Script's content survives
in git history if anyone wants to reference it.
- **Relocate to `docs/`** as a readable "what does deploy do?" walkthrough.
Drop the executable bit, mark it explicitly as docs-only.
- **Keep as-is.** README already says superseded; one extra warning in
the script header would suffice. Lowest churn, ongoing rot risk.
If we go with the consolidation direction (everything canonical in
left4me), keeping a `deploy-test-server.sh` that doesn't match the
canonical paths becomes a documentation bug. Maintaining it in sync
with ckn-bw's items.py is overhead nobody wants.
## Open question 4: bw responsibilities vs. file installs
Today's split:
- **bw owns:** users, groups, dirs, env files (Mako-templated with node
metadata), sudoers + sysctl + sandbox-resolv.conf (verbatim, **its own
copies**), systemd units (reactor-emitted from `metadata.py`), apt
packages, venv creation, pip install, alembic, seed-overlays, the
install action for privileged scripts.
- **left4me owns:** privileged scripts (via the install action reading
from `/opt/left4me/src/deploy/files/usr/local/{libexec,sbin}/`).
The split is inconsistent. ckn-bw ships its own copies of:
- `bundles/left4me/files/etc/sudoers.d/left4me`
- `bundles/left4me/files/etc/sysctl.d/99-left4me.conf`
- `bundles/left4me/files/etc/left4me/sandbox-resolv.conf`
- `bundles/left4me/files/etc/left4me/{host.env.mako,web.env.mako}`
And **left4me also has copies** of the first three at
`deploy/files/etc/{sudoers.d/left4me,sysctl.d/99-left4me.conf,left4me/sandbox-resolv.conf}`.
Either ckn-bw's are the source of truth (in which case left4me's are
stale/historical), or left4me's are (in which case we should extend the
install-from-checkout pattern to these too).
Mako-templated env files genuinely need bw's metadata access — those
probably stay in ckn-bw as the authoritative renderer. But the
templates themselves could live in left4me with placeholders that bw
substitutes. We're not far from that today.
The clean version of "left4me canonical" would have:
- Verbatim files (sudoers, sysctl, sandbox-resolv.conf, scripts) all in
`deploy/files/...` in left4me. ckn-bw's bundle files/ directory holds
nothing but the Mako env templates (which need bw's metadata).
- Sudoers gets `test_with: visudo -cf {}` — currently a property of
ckn-bw's files item. To preserve this when the file moves to install-
via-action, the action itself would need to run `visudo -cf
/opt/left4me/src/deploy/files/etc/sudoers.d/left4me` before the install
step. Doable but adds complexity.
The clean version of "split-by-purpose" would have:
- Verbatim files stay in ckn-bw (config bundles are bundles' jobs).
- Scripts in left4me, exactly as today.
- left4me's `deploy/files/etc/` becomes pure reference — and we should
either keep it explicitly labeled as such, or delete it to avoid
duplication drift.
Both are coherent. Today we have neither — half-and-half.
## Open question 5: dead-code cleanup
These files exist in `deploy/files/` but serve no live purpose:
- `usr/local/lib/systemd/system/{left4me-cake.service,left4me-nft-mark.service}` — units replaced by ckn-bw's reactor / nftables bundle.
- `usr/local/lib/systemd/system/{left4me-server@.service,left4me-web.service,left4me-workshop-refresh.{service,timer},l4d2-game.slice,l4d2-build.slice}` — also reactor-emitted, not installed from these files.
- `usr/local/libexec/left4me/left4me-apply-cake` — dead since CAKE moved to networkd. Currently ships via the new install glob (harmless extra file on `/usr/local/libexec/left4me/`).
- `usr/local/lib/left4me/nft/left4me-mark.nft` — central nftables bundle replaced this.
- `etc/left4me/cake.env` — replaced by node metadata.
Each one of these is a self-contained delete-when-someone-feels-like-it
job. Cumulatively they add up to enough noise that future readers will
get confused about what's load-bearing.
Probably worth a "deploy/ janitorial pass" PR that just deletes the
documented-as-obsolete files. Out of scope for whatever architectural
shift you commit to, but mention it as adjacent cleanup.
## Adjacent thing the script consolidation introduced
The `install_left4me_scripts` action in ckn-bw ships *everything* in
`deploy/files/usr/local/libexec/left4me/` to `/usr/local/libexec/left4me/`
via `install -t DEST .../left4me/*`. This is what makes the action
filename-agnostic. Side effect: `left4me-apply-cake` (dead code) gets
installed too. It does nothing on disk because no unit references it.
Three escape hatches:
- Delete the file from `deploy/files/...` (clean — kills dead code).
- Move the file out of the install path (e.g. to `docs/historical/`).
- Filter the glob (introduces a named exclusion; user explicitly didn't
want filename-naming in the action).
If the broader "open question 5" cleanup happens, this resolves itself.
## Recommended structure for the followup session
When picking this up:
1. Read `deploy/README.md` (current shape) and this doc.
2. Pick a position on **open question 1**: what does `deploy/` mean?
The answer constrains everything else.
3. Once 1 is settled, **open questions 2 and 4 fall out**: where do
scripts live, where do config files live.
4. **Open question 3** (`deploy-test-server.sh` fate) is independent of
the others and can be decided in isolation.
5. **Open question 5** (dead-code cleanup) is independent too;
probably worth doing alongside whatever else lands.
6. End state should be: the rules for "what goes in deploy/" can be
written in two sentences. Today they take a paragraph plus
exceptions.
## Pointers
- Current `deploy/README.md` has the current canonical/historical split.
- ckn-bw's bundle: `git.sublimity.de/cronekorkn/ckn-bw`,
`bundles/left4me/items.py`. The `install_left4me_scripts` action and
the files dict are the relevant entry points.
- Plan that landed the recent change:
`docs/superpowers/plans/2026-05-14-overlay-idmap.md` (idmap helper) and
the ~/.claude/plans scratch file for the script consolidation.
- Recent commit history that touched this surface:
- `f5e36ee` deploy: claim /usr/local/sbin/left4me admin CLI in deploy/files
- `2f6a9cf` + `9053186` left4me-overlay idmap support
- ckn-bw `3ccaa91` left4me: install privileged scripts from git_deploy artifact
## What I don't think is in scope here
- Rewriting the shell helpers in Python / packaging them as
console_scripts. Considered and rejected in the script-consolidation
plan because of the egg-info / TOCTOU privilege concern around
left4me-uid-writable bin dirs.
- Switching to a kernel-overlayfs alternative.
- Splitting the gameserver uid from the web app uid. Separate planned
change.

View file

@ -303,3 +303,47 @@ def test_rejects_upperdir_with_fuseoverlayfs_xattr(tmp_path: Path) -> None:
result = _run(["mount", "alpha"], tmp_path) result = _run(["mount", "alpha"], tmp_path)
assert result.returncode != 0 assert result.returncode != 0
assert "fuse-overlayfs xattr" in result.stderr assert "fuse-overlayfs xattr" in result.stderr
def _load_helper_module():
"""Import the helper script as a Python module for unit testing internals.
The helper file has no .py extension, so importlib needs an explicit
SourceFileLoader rather than auto-detection.
"""
import importlib.util
from importlib.machinery import SourceFileLoader
loader = SourceFileLoader("left4me_overlay", str(HELPER_SOURCE))
spec = importlib.util.spec_from_loader("left4me_overlay", loader)
assert spec is not None
module = importlib.util.module_from_spec(spec)
loader.exec_module(module)
return module
def test_is_mountpoint_detects_same_fs_bind_mount(tmp_path: Path) -> None:
"""_is_mountpoint reads /proc/self/mountinfo so it works for same-fs bind mounts.
Regression: os.path.ismount() compares st_dev against the parent, which
silently returns False for same-fs bind mounts. The idmap binds we install
on runtime/<n>/idmap/<basename> are exactly that case, so an ismount-based
check skipped umount on stop and re-bound on top on start accumulating
mount-table entries across stop/start cycles.
"""
helper = _load_helper_module()
target = tmp_path / "some-bind"
target.mkdir()
abs_target = str(target.resolve())
mountinfo = tmp_path / "fake-mountinfo"
# mountinfo column 5 is the mountpoint; build a minimal line that exercises
# the parse without depending on the rest of the format.
mountinfo.write_text(
f"42 1 0:30 / {abs_target} rw,relatime - tmpfs tmpfs rw\n"
f"43 1 0:31 / /some/other/path rw,relatime - tmpfs tmpfs rw\n"
)
assert helper._is_mountpoint(target, str(mountinfo)) is True
assert helper._is_mountpoint(tmp_path / "not-a-mount", str(mountinfo)) is False
assert helper._is_mountpoint(target, str(tmp_path / "no-such-file")) is False