CopilotKit/showcase/scripts/cli/_common.sh at main · codingwatching/CopilotKit · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env bash
# Shared variables and helper functions for the showcase CLI.
# Sourced by bin/showcase — not meant to be executed directly.

# ── Paths ────────────────────────────────────────────────────────────────────

# Absolute path of the directory two levels above the directory that holds
# this file (resolved with cd + pwd, so it does not depend on the caller's cwd).
SHOWCASE_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
# Compose file passed to `docker compose -f` via COMPOSE_CMD below.
COMPOSE_FILE="$SHOWCASE_ROOT/docker-compose.local.yml"
# NOTE(review): this is a flat string, not an array, so it only works when
# expanded UNQUOTED by its consumers — a SHOWCASE_ROOT containing whitespace
# would then split the -f argument. Consumers are not visible here; confirm.
COMPOSE_CMD="docker compose -f $COMPOSE_FILE"
# Env file whose presence require_env checks.
ENV_FILE="$SHOWCASE_ROOT/.env"
# JSON file slug_to_port looks slugs up in.
PORTS_FILE="$SHOWCASE_ROOT/shared/local-ports.json"
# Second compose file, under tests/. Not referenced in this part of the file;
# presumably the aimock integration stack — confirm against its consumers.
AIMOCK_COMPOSE="$SHOWCASE_ROOT/tests/docker-compose.integrations.yml"

# ── Output helpers ───────────────────────────────────────────────────────────

die() {
  # Print $1 as a bold-red "✗" line on stderr, then terminate with status 1.
  local message="$1"
  >&2 printf '\033[1;31m✗ %s\033[0m\n' "$message"
  exit 1
}

info() {
  # Print $1 as a cyan "▸" progress line on stdout.
  local message="$1"
  printf '\033[0;36m▸ %s\033[0m\n' "$message"
}

warn() {
  # Print $1 as a bold-yellow "⚠" line on stderr; does not exit.
  local message="$1"
  >&2 printf '\033[1;33m⚠ %s\033[0m\n' "$message"
}

success() {
  # Print $1 as a green "✓" line on stdout.
  local message="$1"
  printf '\033[0;32m✓ %s\033[0m\n' "$message"
}

# ── Validation helpers ───────────────────────────────────────────────────────

need_slug() {
  # Guard for commands that take a slug: abort via die when the first
  # argument is missing or empty, otherwise succeed silently.
  if [[ -z "${1:-}" ]]; then
    die "slug required"
  fi
}

require_env() {
  # Abort via die unless ENV_FILE exists as a regular file.
  if [[ ! -f "$ENV_FILE" ]]; then
    die "Missing $ENV_FILE. Copy showcase/.env.example to showcase/.env and fill in keys."
  fi
}

# ── Docker / Compose helpers ─────────────────────────────────────────────────

stage_shared() {
  # Dereference tools/, shared-tools/, and _shared/ symlinks into real copies
  # so Docker COPY can follow them (Docker build contexts can't traverse
  # symlinks that point outside the context). `_shared` carries the
  # single-source CVDIAG bootstrap module into each Python integration context.
  #
  # Globals:   SHOWCASE_ROOT (read)
  # Arguments: none
  # Effects:   for every integrations/<pkg>/{tools,shared-tools,_shared} that
  #            is a symlink to an existing directory, removes the link and
  #            rsyncs the directory's contents into a real directory of the
  #            same name. Links that are dangling, unresolvable, or point at a
  #            non-directory are left untouched.
  #
  # Loop variables are declared local so sourcing scripts don't inherit them.
  local pkg_dir link_name link_path target target_parent
  for pkg_dir in "$SHOWCASE_ROOT"/integrations/*/; do
    for link_name in tools shared-tools _shared; do
      # The glob leaves a trailing slash on pkg_dir; strip it so the path has
      # no doubled separator.
      link_path="${pkg_dir%/}/$link_name"
      [ -L "$link_path" ] || continue
      target="$(readlink "$link_path")"
      # Resolve relative symlink targets against the link's directory.
      if [[ "$target" != /* ]]; then
        # Resolve the parent directory on its own so a failed cd is detected.
        # Folding it into one string with basename would turn a failed cd into
        # "" + "/<basename>", i.e. an unrelated ROOT-level path such as /tmp or
        # /tools, which would then be copied over the link.
        target_parent="$(cd "$(dirname "$link_path")" 2>/dev/null \
          && cd "$(dirname "$target")" 2>/dev/null \
          && pwd)" || continue
        target="$target_parent/$(basename "$target")"
      fi
      if [ -d "$target" ]; then
        rm -- "$link_path"
        rsync -a "$target/" "$link_path/"
      fi
    done
  done
}

restore_symlinks() {
  # Restore tools/, shared-tools/, and _shared/ symlinks replaced by
  # stage_shared. The integrations/*/_shared glob also matches the canonical
  # source dir integrations/_shared (a real tracked dir) — harmless no-op there.
  #
  # Runs in a subshell so the caller's working directory is untouched, and is
  # best-effort: git's stderr is discarded and any failure (including a failed
  # cd) is swallowed, so this always returns 0.
  (
    cd "$SHOWCASE_ROOT" \
      && git checkout -- \
        integrations/*/tools \
        integrations/*/shared-tools \
        integrations/*/_shared \
        2>/dev/null \
      || true
  )
}

slug_to_container() {
  # Map a slug ($1) to its container name by prefixing "showcase-".
  printf '%s\n' "showcase-${1}"
}

slug_to_port() {
  # Print the port recorded for a slug in PORTS_FILE, or nothing when the slug
  # has no entry.
  #
  # Globals:   PORTS_FILE (read)
  # Arguments: $1 - slug (required)
  # Outputs:   the port on stdout, or no output for an unknown slug
  # Returns:   the status of jq (or of awk in the fallback); both exit non-zero
  #            when PORTS_FILE cannot be read.
  local slug="${1:?slug required}"
  if command -v jq >/dev/null 2>&1; then
    jq -r --arg s "$slug" '.[$s] // empty' "$PORTS_FILE"
  else
    # Fallback when jq is not available: find the literal key "<slug>" and
    # print only the digits of the value that follows its colon. Taking the
    # digits of the VALUE (not of the whole line) matters for slugs that
    # contain digits themselves, e.g. "ag2": 3005 must yield 3005, not 23005.
    # index() does a literal search, so regex metacharacters in a slug are
    # harmless. Only the first matching key is reported.
    awk -v key="\"$slug\"" '
      {
        rest = $0
        while ((pos = index(rest, key)) > 0) {
          rest = substr(rest, pos + length(key))
          if (match(rest, /^[ \t]*:[ \t]*"?[0-9]+/)) {
            port = substr(rest, 1, RLENGTH)
            gsub(/[^0-9]/, "", port)
            print port
            exit
          }
        }
      }
    ' "$PORTS_FILE"
  fi
}

is_service_healthy() {
  # Succeed only when the slug's container reports Docker health status
  # "healthy". A failing `docker inspect` (stderr discarded) is read as
  # "missing", i.e. not healthy.
  local slug="${1:?slug required}"
  local container status
  container="$(slug_to_container "$slug")"
  if ! status="$(docker inspect --format='{{.State.Health.Status}}' "$container" 2>/dev/null)"; then
    status="missing"
  fi
  [[ "$status" == "healthy" ]]
}

wait_healthy() {
  # Poll is_service_healthy every 2 seconds until the slug's container is
  # healthy, or abort via die once the timeout has elapsed.
  #
  # Arguments: $1 - slug (required)
  #            $2 - timeout in whole seconds (default 30)
  # Outputs:   progress via info, the final result via success or die
  local slug="${1:?slug required}"
  local timeout="${2:-30}"
  local elapsed=0
  # Reject a non-numeric timeout up front: `[ -ge ]` below would error out and
  # evaluate as false on every pass, so the loop would never time out.
  [[ "$timeout" =~ ^[0-9]+$ ]] \
    || die "wait_healthy: timeout must be a whole number of seconds, got: $timeout"
  info "Waiting for $slug to become healthy (timeout ${timeout}s)..."
  while ! is_service_healthy "$slug"; do
    if [ "$elapsed" -ge "$timeout" ]; then
      die "$slug did not become healthy within ${timeout}s"
    fi
    sleep 2
    elapsed=$((elapsed + 2))
  done
  success "$slug is healthy (${elapsed}s)"
}

# ── Isolation helpers ───────────────────────────────────────────────────────

# Defaults for the per-process isolation state. The code that sets most of
# these is outside this part of the file, so the per-variable notes below are
# hedged where the writer/reader is not visible here.

# Presumably the name of the isolated run — TODO confirm against its writer.
ISOLATE_NAME=""
# Host-port offset of the claimed slot; _claim_isolate_slot sets it to
# (ISOLATE_SLOT + 1) * 200.
ISOLATE_PORT_OFFSET=0
# Index of the slot claimed by _claim_isolate_slot; empty until a claim succeeds.
ISOLATE_SLOT=""
# Presumably flags that isolation has been applied — TODO confirm.
ISOLATE_ACTIVE=false
# Presumably the run's scratch directory — TODO confirm.
ISOLATE_TMPDIR=""
# Set true by cmd-test.sh when --keep is parsed; read by restore_isolation.
# Deliberately a namespaced GLOBAL (not a `local` in cmd_test): the EXIT trap
# fires at top-level script exit, after cmd_test has returned and its locals
# have unwound. Initializing it here also shields against a stray `keep`-like
# env var exported by the user flipping teardown behavior.
ISOLATE_KEEP=false

# Runtime state (slot registry + per-run scratch dirs) lives under
# XDG_STATE_HOME, NOT /tmp — /tmp is world-writable (which made stale-slot
# reaping racy) and gets wiped on reboot (which destroyed the registry/run-dir
# state out from under any surviving docker resources). NB this does NOT make
# --keep reboot-proof: container-liveness protection counts only RUNNING
# containers, so after a reboot (or daemon restart / manual docker stop) the
# kept stack's stopped containers no longer protect its slot — the next
# claim's sweep reclaims it, composing the remnants down (see
# _reap_isolate_slot).
_showcase_state_base() {
  # Print (without a trailing newline) the showcase state directory:
  # $XDG_STATE_HOME/copilotkit/showcase, with $HOME/.local/state standing in
  # when XDG_STATE_HOME is unset or empty.
  local state_home="${XDG_STATE_HOME:-$HOME/.local/state}"
  printf '%s/copilotkit/showcase' "$state_home"
}

# Single-user assumption: the slot registry is PER-USER (XDG state), while
# docker compose project names and host ports are HOST-global. Two different
# UNIX users running --isolate concurrently on one host each get their own
# registry, so neither the slot claim nor the duplicate-name guard can see
# the other user's claims — identical port offsets or same-name projects can
# collide across users. Accepted: dev hosts are effectively single-user.
# Note the pid-liveness checks share this assumption: `kill -0` on another
# user's live pid returns EPERM (read here as "dead"), so cross-user slot
# protection via pid is also unreliable.
# Slot registry directory: one numerically named sub-directory per claimed
# slot, plus the dot-named sweep lock and its tombstones (see
# _claim_isolate_slot).
ISOLATE_SLOT_DIR="$(_showcase_state_base)/slots"
ISOLATE_STALE_THRESHOLD=7200  # 2 hours in seconds — slot-age fallback
# TTL on a `kept` stack (running containers whose owning process is gone or
# unverifiable — a forgotten `--keep` leak). Once a kept slot's age exceeds this
# TTL it is reclassified `stale` and reaped by the sweep, so a --keep'd stack
# left running with no owner cannot accumulate indefinitely. Default 4 hours.
# Overridable via SHOWCASE_ISOLATE_KEEP_TTL (e.g. for tests / longer sessions).
# NOTE(review): the override is taken verbatim — nothing here checks that it is
# a whole number of seconds. The comparison that consumes it is not visible in
# this part of the file; confirm it tolerates a non-numeric value.
ISOLATE_KEEP_TTL="${SHOWCASE_ISOLATE_KEEP_TTL:-14400}"  # 4 hours in seconds
# The sweep lock is held only for the duration of one sweep pass (seconds, even
# with all 46 slots populated). A crashed sweeper's leftover lock must not
# disable stale reaping for the full 2-hour SLOT threshold — give the lock its
# own, much shorter staleness threshold.
ISOLATE_SWEEP_LOCK_STALE_THRESHOLD=60  # seconds
# Maximum slot index for --isolate (0 reserved for base stack; 1..N for isolated runs).
ISOLATE_MAX_SLOT=45

# _file_mtime <path> — epoch mtime of a path, or empty when it cannot be
# stat'ed (vanished concurrently, permissions). Callers must treat a
# non-numeric result as "unknown", never as zero.
_file_mtime() {
  # Print the epoch mtime of path $1, or nothing when it cannot be determined.
  # Always returns 0.
  #
  # The stat flavour is probed rather than inferred from $OSTYPE: the OS name
  # does not say which stat is first on PATH (GNU coreutils on macOS, BSD stat
  # on non-darwin BSDs). The GNU form goes first because `-c` is simply
  # rejected by BSD stat, whereas GNU stat reads `-f` as "file-system mode" and
  # prints multi-line file-system info. Output is emitted only when it is
  # purely numeric, so callers never see such text.
  local path="${1:-}" mtime
  [ -n "$path" ] || return 0
  # GNU coreutils / BusyBox: -c FORMAT, %Y = mtime in seconds since the epoch.
  if mtime="$(stat -c %Y -- "$path" 2>/dev/null)" && [[ "$mtime" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$mtime"
    return 0
  fi
  # BSD / macOS: -f FORMAT, %m = mtime in seconds since the epoch.
  if mtime="$(stat -f %m -- "$path" 2>/dev/null)" && [[ "$mtime" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$mtime"
  fi
  return 0
}

# _kept_slot_age <slot> — age in seconds of a slot for the ISOLATE_KEEP_TTL
# comparison, or empty when no anchor can be stat'ed. The TTL anchor is the
# `pid` file's mtime: it is written ONCE at claim (in _claim_isolate_slot) and never
# rewritten, so it is a stable claim-time stamp (and `pid.start` is a SIBLING
# file, so writing it never disturbs `pid`'s mtime). Mandatory fallback chain so
# a kept slot is never immortal even if `pid` is gone: pid-file mtime →
# `project`-file mtime → slot-dir mtime. If NONE of these can be stat'ed the
# caller falls back to the existing ISOLATE_STALE_THRESHOLD age path; without
# that fallback an unstattable anchor would skip the age comparison and the
# kept→stale transition would silently never fire.
_kept_slot_age() {
  # Print the age in seconds of slot $1, measured from the first anchor whose
  # mtime is readable: the pid file, then the project file, then the slot dir.
  # Prints nothing when no anchor can be stat'ed. Always returns 0.
  local slot_root="$ISOLATE_SLOT_DIR/${1:?slot required}"
  local candidate stamp now
  for candidate in "$slot_root/pid" "$slot_root/project" "$slot_root"; do
    stamp="$(_file_mtime "$candidate")"
    # A non-numeric stamp means "unknown" — move on to the next anchor.
    [[ "$stamp" =~ ^[0-9]+$ ]] || continue
    now="$(date +%s)"
    printf '%d\n' "$(( now - stamp ))"
    return 0
  done
  return 0
}

# _pid_start_time <pid> — the process start time of <pid> as an opaque,
# platform-native string, or empty when it cannot be read (no such pid, EPERM
# on a cross-user pid, or an unsupported platform). This is the anti-PID-reuse
# fingerprint: a recycled pid lands on a DIFFERENT process with a DIFFERENT
# start time, so a recorded-vs-current mismatch means the original owner is
# gone. The exact textual format is never interpreted — it only has to be
# stable for one process's lifetime and to DIFFER across a pid recycle, which
# both forms below satisfy. Written to a `pid.start` sibling of the slot's
# `pid` file at claim and re-read at verify; the SAME function produces both
# sides so the comparison can never drift across a format change.
#   macOS:  `ps -o lstart=` — the full "Wed Jun 26 11:33:20 2026" start stamp.
#   Linux:  field 22 of /proc/<pid>/stat — starttime in clock ticks since boot.
_pid_start_time() {
  # Print an opaque start-time fingerprint for process $1, or nothing when it
  # cannot be read (non-numeric pid, no such process, unreadable /proc entry).
  local pid="${1:?pid required}"
  if ! [[ "$pid" =~ ^[0-9]+$ ]]; then
    return 0
  fi
  case "$OSTYPE" in
    darwin*)
      # ps prints a fixed-format date; awk's field rebuild normalises the
      # whitespace so stray padding can never produce a spurious mismatch.
      local lstart
      lstart="$(ps -o lstart= -p "$pid" 2>/dev/null || true)"
      printf '%s' "$lstart" | awk '{$1=$1; print}'
      ;;
    *)
      local proc_stat="/proc/$pid/stat"
      local raw tail_fields
      [ -r "$proc_stat" ] || return 0
      raw="$(cat "$proc_stat" 2>/dev/null || true)"
      [ -n "$raw" ] || return 0
      # comm (field 2) is parenthesised and may itself contain spaces or ')',
      # so cut at the LAST ") ". What remains starts at field 3 (state), which
      # puts starttime (field 22 of the full line) at position 20.
      tail_fields="${raw##*) }"
      printf '%s' "$tail_fields" | awk '{print $20}'
      ;;
  esac
}

# Reap one stale slot: compose any docker remnants of the recorded project
# down (best-effort), then remove the slot's runs/<project> scratch dir AND
# the slot dir itself. Without the runs-dir removal, crashed runs leak orphan
# run dirs under XDG state forever (nothing else cleans them —
# restore_isolation only removes the CURRENT run's dir).
#
# Kept stacks: container-liveness protection applies only while containers
# are RUNNING (the sweep's probe is `docker ps -q`, deliberately — `-aq`
# would let crashed runs' exited containers protect dead slots forever). A
# --keep'd stack whose containers are stopped-but-present (manual `docker
# stop`, daemon restart, host reboot) therefore DOES reach this function:
# its owner pid is dead by design, so the slot is reclaimed. The compose-down
# below keeps that safe — stopped containers and named volumes are removed
# along with the state dirs instead of being stranded with no compose state.
#
# Order matters: runs/<project> FIRST, slot dir LAST. The slot's project
# record is the ONLY pointer to the runs dir — a crash between the two
# removals with the old order (slot first) orphaned the runs dir forever,
# while with this order a surviving slot record simply makes the next sweep
# retry the reap.
_reap_isolate_slot() {
  # Arguments: $1 - path of the slot directory to reap (required)
  #            $2 - project name recorded for that slot (optional; when empty
  #                 only the slot directory itself is removed)
  # Returns:   0 on every path. Both refusals (reserved or suspicious project
  #            name) warn and return 0 with the slot left in place.
  local slot_entry="${1:?slot entry required}"
  local slot_proj="${2:-}"
  if [ -n "$slot_proj" ]; then
    # The record comes from a user-writable state file — never interpolate it
    # into rm -rf unvalidated (a corrupted record like "../.." would traverse
    # out of the runs dir). Compose project names are [a-z0-9][a-z0-9_-]*; on
    # mismatch, warn and leave the SLOT intact too: the record is the ONLY
    # pointer to the runs dir (see the header above), so reaping the slot
    # anyway would orphan whatever runs dir the record actually points at.
    # A corrupted record is a bug or tampering — leave the evidence in place
    # for manual inspection rather than half-destroy it.
    #
    # Reserved name, same treatment: 'showcase' IS the default stack's compose
    # project name and PASSES the charset check below, so a record reading
    # 'showcase' (a corrupt record, or one written by an older CLI version
    # before apply_isolation reserved the name) would aim the compose-down at
    # the user's LIVE DEFAULT stack — and --volumes would destroy its
    # PocketBase data. apply_isolation refuses the name at claim time, but the
    # reaper must not trust records: warn and leave the whole slot intact for
    # manual inspection (no compose-down, no state removal).
    if [ "$slot_proj" = "showcase" ]; then
      warn "Slot record at $slot_entry names the RESERVED project 'showcase' — that is the LIVE default stack's compose project, so reaping it would compose the default stack down (--volumes included: PocketBase data destroyed). Leaving the slot intact for manual inspection; its runs dir would be $(_showcase_state_base)/runs/$slot_proj"
      return 0
    fi
    if [[ "$slot_proj" =~ ^[a-z0-9][a-z0-9_-]*$ ]]; then
      # Best-effort remnant cleanup BEFORE deleting any state: a stopped kept
      # stack (see the header) still has containers + named volumes; deleting
      # the run dir + slot first would strand them with no compose state
      # (split-brain). `compose -p` resolves resources via project labels, so
      # no -f compose file is needed; failure (daemon down, nothing to remove)
      # is non-fatal — the rm below still reclaims the state dirs.
      docker compose -p "$slot_proj" down --remove-orphans --volumes >/dev/null 2>&1 || true
      # State-removal rms are guarded throughout this file: a concurrent
      # claimant/release can race the same path, and the loser's mid-traversal
      # ENOENT makes rm exit nonzero — which must not kill the CLI under
      # bin/showcase's `set -e` (the state is gone either way).
      rm -rf "$(_showcase_state_base)/runs/$slot_proj" 2>/dev/null || true
    else
      warn "Slot record at $slot_entry names suspicious project '$slot_proj' (path-traversal guard) — leaving the slot intact for manual inspection; its runs dir would be $(_showcase_state_base)/runs/$slot_proj"
      return 0
    fi
  fi
  # Reached with no recorded project, or after the runs dir has been removed:
  # the slot directory goes last.
  rm -rf "$slot_entry" 2>/dev/null || true
}

# Release the sweep lock — but ONLY if it is still ours. The takeover path
# below can legitimately move an over-age lock out from under a slow-but-live
# holder and install a fresh lock of its own; if the original holder then
# blindly removed "$sweep_lock" on its way out, it would destroy the
# TAKEOVER's lock and open the door to a THIRD concurrent sweeper. Ownership
# is the pid file written into the lock dir at acquisition.
_release_sweep_lock() {
  # Remove the sweep lock directory, but only when its pid marker names this
  # process.
  #
  # Arguments: $1 - path of the sweep lock directory (required)
  # Outputs:   a warning on stderr (via warn) when the lock has vanished or
  #            now belongs to another pid
  # Returns:   0 on every path
  local sweep_lock="${1:?sweep lock path required}"
  # Lock (or its pid ownership marker) gone entirely: nothing to release and
  # no holder to report — a takeover mv'd it away, or something external
  # cleaned it up. Distinct from the takeover case below, which has an actual
  # current holder's lock that must be left in place.
  if [ ! -d "$sweep_lock" ] || [ ! -f "$sweep_lock/pid" ]; then
    warn "Sweep lock $sweep_lock vanished while we held it (takeover or external cleanup) — leaving as-is"
    return 0
  fi
  local lock_pid
  lock_pid="$(cat "$sweep_lock/pid" 2>/dev/null || true)"
  if [ "$lock_pid" = "$$" ]; then
    # Guarded like every other state-removal rm in this file: a takeover can
    # mv the lock away between the pid check above and this removal, and a
    # mid-traversal ENOENT makes rm exit nonzero — which must not kill the CLI
    # under `set -e` (the lock is gone either way).
    rm -rf "$sweep_lock" 2>/dev/null || true
  else
    warn "Sweep lock $sweep_lock was taken over while we held it (current holder pid: ${lock_pid:-unknown}) — leaving it in place"
  fi
}

# Claim an isolation slot using atomic mkdir. Slots 1..ISOLATE_MAX_SLOT are
# usable for --isolate runs; slot 0 is reserved for the base (non-isolate)
# stack. Each slot dir contains a "pid" file for stale-detection. The port
# offset is (slot + 1) * 200, so slot 1 → +400, slot 2 → +600, etc. If
# SHOWCASE_ISO_SLOT is set, the picker pins to that slot; otherwise it
# auto-picks the first free slot in 1..ISOLATE_MAX_SLOT.
_claim_isolate_slot() {
  # Globals read:    ISOLATE_SLOT_DIR, ISOLATE_MAX_SLOT,
  #                  ISOLATE_SWEEP_LOCK_STALE_THRESHOLD, SHOWCASE_ISO_SLOT
  # Globals written: ISOLATE_SLOT (the claimed index, canonical decimal),
  #                  ISOLATE_PORT_OFFSET ((ISOLATE_SLOT + 1) * 200)
  # Files written:   <slot>/pid and <slot>/pid.start under ISOLATE_SLOT_DIR
  # Returns:         0 on success; every failure path exits via die.
  mkdir -p "$ISOLATE_SLOT_DIR"

  # Reclaim crashed-takeover tombstones: a sweeper that died between the
  # takeover mv and its rm -rf (below) leaves .sweep.lock.tomb.<pid> behind
  # forever — dot-named, so neither the sweep glob nor the claim loop ever
  # sees it, and nothing else cleans it. Age them by the LOCK threshold: a
  # fresh tombstone may belong to a takeover in flight (mv done, rm pending),
  # so only over-age ones are removed.
  local tomb
  for tomb in "$ISOLATE_SLOT_DIR"/.sweep.lock.tomb.*; do
    [ -e "$tomb" ] || continue
    local tomb_mtime
    tomb_mtime="$(_file_mtime "$tomb")"
    [[ "$tomb_mtime" =~ ^[0-9]+$ ]] || continue
    if [ $(( $(date +%s) - tomb_mtime )) -gt "$ISOLATE_SWEEP_LOCK_STALE_THRESHOLD" ]; then
      # This cleanup runs OUTSIDE the sweep lock by design: two claimants can
      # both observe the same over-age tombstone and race the removal, and the
      # loser's mid-traversal ENOENT makes rm exit nonzero — which must not
      # kill the CLI under `set -e` (losing the race is fine; the tombstone is
      # gone either way).
      rm -rf "$tomb" 2>/dev/null || true
    fi
  done

  # Serialize the stale sweep with a lock dir. Without it, two concurrent
  # claimants can both observe slot N stale: A reaps + re-claims it (writing a
  # live pid), then B reaps A's FRESH claim based on its stale observation and
  # claims the same slot — two owners, identical port offsets. The lock is
  # advisory and non-blocking: if another process holds it, we SKIP the sweep
  # entirely (that process is already sweeping) and go straight to the claim
  # loop. The dot-name keeps the lock out of the sweep's [0-9]* glob and the
  # claim loop's numeric slot names.
  local sweep_lock="$ISOLATE_SLOT_DIR/.sweep.lock"
  local have_sweep_lock=false
  if mkdir "$sweep_lock" 2>/dev/null; then
    echo "$$" > "$sweep_lock/pid"   # ownership marker for _release_sweep_lock
    have_sweep_lock=true
  else
    # Lock held — but a sweeper that crashed mid-sweep would leave it behind
    # forever, permanently disabling stale reaping. Take over an over-age lock
    # (dedicated short threshold: the lock is held for seconds, not hours);
    # otherwise (fresh lock, or lock vanished between our mkdir and the stat)
    # skip the sweep this round. A LIVE sweeper refreshes the lock mtime every
    # slot iteration (heartbeat in _sweep_isolate_slots), so an over-age lock
    # really does mean a crashed/wedged holder.
    local lock_mtime
    lock_mtime="$(_file_mtime "$sweep_lock")"
    if [[ "$lock_mtime" =~ ^[0-9]+$ ]] \
      && [ $(( $(date +%s) - lock_mtime )) -gt "$ISOLATE_SWEEP_LOCK_STALE_THRESHOLD" ]; then
      # Atomic takeover: rename the stale lock aside to a unique tombstone
      # first. Two claimants can BOTH observe the lock over-age; with a plain
      # rm+mkdir the slower one could rm the faster one's FRESH replacement
      # lock and retake it — two concurrent sweepers. rename(2) is atomic:
      # exactly one mv wins, the loser's mv fails and it simply skips the
      # sweep this round (it must NOT remove a lock the winner may already
      # have refreshed). The winner disposes of the tombstone and takes a
      # brand-new lock. A crash between mv and rm leaves only a dot-named
      # tombstone, invisible to both the sweep glob and the claim loop —
      # reclaimed once over-age by the tombstone cleanup at the top of this
      # function.
      local lock_tombstone="$ISOLATE_SLOT_DIR/.sweep.lock.tomb.$$"
      if mv "$sweep_lock" "$lock_tombstone" 2>/dev/null; then
        warn "Removing stale sweep lock (crashed sweeper?): $sweep_lock"
        # Guarded: mv preserves the lock's (already over-age) mtime, so this
        # fresh tombstone is immediately over-age too — concurrent claimants'
        # tombstone-reclamation loops (top of this function) legitimately race
        # this removal, and the loser's nonzero rm must not kill the CLI.
        rm -rf "$lock_tombstone" 2>/dev/null || true
        if mkdir "$sweep_lock" 2>/dev/null; then
          echo "$$" > "$sweep_lock/pid"   # ownership marker for _release_sweep_lock
          have_sweep_lock=true
        fi
      fi
    fi
  fi

  if [ "$have_sweep_lock" = true ]; then
    _sweep_isolate_slots
    _release_sweep_lock "$sweep_lock"
  fi

  if [ -n "${SHOWCASE_ISO_SLOT:-}" ]; then
    # Pinned path
    local pinned="$SHOWCASE_ISO_SLOT"
    [[ "$pinned" =~ ^[0-9]+$ ]] || die "SHOWCASE_ISO_SLOT must be a positive integer, got: $pinned"
    # Canonicalise: strip leading zeros so the slot DIRECTORY name and the
    # arithmetic below agree. Left as typed, "010" would claim directory "010"
    # (distinct from slot 10's directory) while $(( )) reads it as octal 8 and
    # hands out slot 8's port offset; "08" would abort in $(( )) after the
    # slot had already been claimed. Pure string surgery — no arithmetic, so
    # no overflow on absurdly long digit strings.
    pinned="${pinned#"${pinned%%[!0]*}"}"
    pinned="${pinned:-0}"
    [ "$pinned" -ge 1 ] || die "slot 0 is reserved for the base stack — use 1-$ISOLATE_MAX_SLOT"
    [ "$pinned" -le "$ISOLATE_MAX_SLOT" ] || die "SHOWCASE_ISO_SLOT=$pinned exceeds ISOLATE_MAX_SLOT=$ISOLATE_MAX_SLOT"

    local slot_dir="$ISOLATE_SLOT_DIR/$pinned"
    if mkdir "$slot_dir" 2>/dev/null; then
      :   # fresh claim, fall through to port probe
    else
      # EEXIST: consult liveness
      local liveness
      liveness=$(_slot_liveness "$pinned")
      if [ "$liveness" = "live" ]; then
        # Identify the live axis for the message
        local axis="containers/pid"
        die "Slot $pinned is already in use (liveness=$liveness, $axis) — pick a different SHOWCASE_ISO_SLOT or clear it first"
      fi
      # stale or inconclusive: reap and retry once
      # NOTE(review): _slot_liveness is documented as returning
      # live | kept | stale | inconclusive, and only `live` is refused above —
      # so a `kept` slot (running containers, owner gone) pinned here is
      # reaped as well. Confirm that is intended.
      local pinned_entry="$ISOLATE_SLOT_DIR/$pinned"
      local pinned_proj
      pinned_proj="$(cat "$pinned_entry/project" 2>/dev/null || true)"
      _reap_isolate_slot "$pinned_entry" "$pinned_proj" || true
      mkdir "$slot_dir" 2>/dev/null || die "Slot $pinned could not be reclaimed after reap — check $slot_dir manually"
    fi
    # Port-probe
    if ! _slot_ports_free "$pinned"; then
      rmdir "$slot_dir" 2>/dev/null || true
      die "Slot $pinned ports are held by a foreign process — see info messages above; clear conflicts or pick a different SHOWCASE_ISO_SLOT"
    fi
    ISOLATE_SLOT="$pinned"
  else
    # Auto-pick path: loop 1..ISOLATE_MAX_SLOT (slot 0 reserved)
    local n=1
    while [ "$n" -le "$ISOLATE_MAX_SLOT" ]; do
      local slot_dir="$ISOLATE_SLOT_DIR/$n"
      if mkdir "$slot_dir" 2>/dev/null; then
        if _slot_ports_free "$n"; then
          ISOLATE_SLOT="$n"
          break
        else
          rmdir "$slot_dir" 2>/dev/null || true
          info "Slot $n ports held, trying next"
          # Benign race: between our rmdir and the next iteration's mkdir attempt, a concurrent
          # claimant can mkdir this same slot dir. That's fine — mkdir is the
          # atomic synchronization point, so only one process can hold a given
          # slot at a time. The concurrent claimant wins; we advance to n+1 and
          # no double-claim occurs. Port-probe and ownership-write (pid file) are
          # also per-slot, so there is no cross-claimant corruption under load.
        fi
      fi
      n=$((n + 1))
    done
    [ -n "${ISOLATE_SLOT:-}" ] || die "No isolation slots available (1-$ISOLATE_MAX_SLOT exhausted)"
  fi

  # Common post-claim. Write order is load-bearing: `pid` FIRST (preserving the
  # "pid written before the project record" invariant the liveness classifier
  # relies on — a missing pid file with a recorded project means the owner is
  # genuinely gone), THEN `pid.start`. `pid.start` is the anti-reuse
  # fingerprint: the owning process's start time, re-read and compared at
  # liveness time so a recycled pid (same number, different process, different
  # start time) reads as "owner gone" rather than spuriously alive. It is a
  # SIBLING file, written AFTER pid, so it never perturbs the `pid` file's own
  # mtime (which the kept-slot TTL anchor depends on). A crash between the two
  # writes leaves `pid` but no `pid.start` → owner "unverifiable" → treated as
  # dead, which is the safe direction.
  echo "$$" > "$ISOLATE_SLOT_DIR/$ISOLATE_SLOT/pid"
  _pid_start_time "$$" > "$ISOLATE_SLOT_DIR/$ISOLATE_SLOT/pid.start"
  ISOLATE_PORT_OFFSET=$(( (ISOLATE_SLOT + 1) * 200 ))
  return 0
}

# _owner_liveness <slot> — classify the slot's OWNING PROCESS, independent of
# any container state. Prints exactly one word and exits 0:
#   alive        — pid file present + numeric + kill -0 succeeds AND the pid's
#                  current start time matches the recorded pid.start.
#   reused       — kill -0 succeeds but the current start time DIFFERS from the
#                  recorded pid.start: the pid was recycled to a NEW process,
#                  so the original owner is gone.
#   dead         — pid file present + numeric but kill -0 fails (ESRCH, or
#                  EPERM on a cross-user pid — both read as "not our owner",
#                  matching the single-user model documented at the top of
#                  this file; we do NOT parse kill -0 stderr, which is
#                  locale/platform fragile).
#   unverifiable — pid file present + numeric + alive, but no readable
#                  pid.start to verify against (legacy slot written before the
#                  pid.start invariant, a crash between the pid and pid.start
#                  writes, or a platform that cannot read process start times).
#                  Treated as "owner gone" by every caller — REMOVES the old
#                  bare-kill-0 reuse hole at the cost of demoting a legacy
#                  live-owner slot to kept (TTL-reaped) instead of protected.
#   absent       — no pid file, or its contents are empty/non-numeric
#                  (inconclusive: a truncated pid write, or a project-less
#                  legacy slot). Distinct from `dead`: callers route this to
#                  the age fallback, never to an immediate PID-driven reap.
#
# This is the SINGLE source of truth for owner liveness, shared by
# _slot_liveness (the live|kept|stale classifier) and _slot_state (the table's
# PID annotation) so the two can never diverge.
_owner_liveness() {
  # Classify the owning process of slot $1 and print exactly one of:
  # absent | dead | unverifiable | alive | reused. Always returns 0.
  local slot="${1:?slot required}"
  local entry="$ISOLATE_SLOT_DIR/$slot"
  local owner_pid="" recorded="" current verdict

  if [ -f "$entry/pid" ]; then
    owner_pid="$(cat "$entry/pid" 2>/dev/null || true)"
  fi

  if ! [[ "$owner_pid" =~ ^[0-9]+$ ]]; then
    # No pid file, or its contents are empty / non-numeric.
    verdict="absent"
  elif ! kill -0 "$owner_pid" 2>/dev/null; then
    # ESRCH or EPERM — either way not a process we own.
    verdict="dead"
  else
    # The pid exists; decide whether it is still the process that was recorded
    # by comparing start-time fingerprints.
    if [ -f "$entry/pid.start" ]; then
      recorded="$(cat "$entry/pid.start" 2>/dev/null || true)"
    fi
    if [ -z "$recorded" ]; then
      # Nothing recorded to compare against.
      verdict="unverifiable"
    else
      current="$(_pid_start_time "$owner_pid")"
      if [ -z "$current" ]; then
        # Signalable pid, but its start time cannot be read right now.
        verdict="unverifiable"
      elif [ "$current" = "$recorded" ]; then
        verdict="alive"
      else
        verdict="reused"
      fi
    fi
  fi

  printf '%s\n' "$verdict"
  return 0
}

# Classify a single isolation slot as live | kept | stale | inconclusive —
# pure classification, no reaping, no info logging. Shared between
# _sweep_isolate_slots (which reaps stale slots) and the picker (which avoids
# binding to live slots). Always prints exactly one word to stdout and exits 0.
#
# Governing rule: when a slot has RUNNING containers, the container check wins
# → the slot is `kept` or `live`, NEVER reaped solely on an owner-PID result.
# Owner liveness only UPGRADES a running-container slot from TTL-bounded `kept`
# to indefinitely-protected `live`; it can never by itself make a
# running-container slot eligible for immediate reaping.
#
# Signals (in order):
#   1. Compose-project containers first. Docker-ps failure → inconclusive
#      (warn and leave it alone, unchanged). If containers ARE running, branch
#      on owner liveness:
#        - owner alive (start-time-verified)            → live
#        - owner dead / reused / unverifiable / absent  → kept: owning
#          process gone (or unprovable) but the project still has running
#          containers. NOT live, NOT immediately stale. The kept-slot TTL
#          (below) governs the kept→stale transition: a `kept` slot is left
#          alone until it outlives ISOLATE_KEEP_TTL, then ages out to stale.
#   2. No running containers (or none recorded). The owner PID is authoritative
#      for "in active use":
#        - owner alive (start-time-verified)            → live (e.g. mid-build
#          before any container exists)
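#        - owner dead, reused, OR alive-but-unverifiable → stale (with no
#          containers to defer to, a numeric pid that cannot be proven to be
#          the original owner is treated as proof the owner is gone)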
#   3. Project recorded + no pid file (owner absent) + no running containers
#      → stale (claim writes the pid file BEFORE the project record, so a
#      missing pid means the owner state is genuinely gone). Unchanged.
#   4. Age fallback — owner absent (pid file missing on a project-less slot,
#      or present but empty/non-numeric on any slot) AND slot-dir age
#      > ISOLATE_STALE_THRESHOLD → stale.
#   5. Otherwise → inconclusive.
_slot_liveness() {
  # Usage: _slot_liveness <slot>
  # Prints exactly one of: live | kept | stale | inconclusive. Always returns 0.
  # Pure classification: nothing is reaped here.
  local slot="${1:?slot required}"
  local entry="$ISOLATE_SLOT_DIR/$slot"
  if [[ ! -d "$entry" ]]; then
    printf 'inconclusive\n'
    return 0
  fi

  local owner_state project
  owner_state="$(_owner_liveness "$slot")"
  project="$(cat "$entry/project" 2>/dev/null || true)"

  if [[ -n "$project" ]]; then
    # A docker failure is not "no containers": if we cannot ask, do not guess.
    local running
    if ! running="$(docker ps -q --filter "label=com.docker.compose.project=$project" 2>/dev/null)"; then
      warn "Cannot verify liveness of slot $slot (docker ps failed) — leaving it alone"
      printf 'inconclusive\n'
      return 0
    fi

    if [[ -n "$running" ]]; then
      # Running containers: the container check wins. A verified-alive owner
      # protects the slot indefinitely; every other owner state yields a
      # TTL-bounded `kept`.
      if [[ "$owner_state" == "alive" ]]; then
        printf 'live\n'
        return 0
      fi

      # Pick the age and the limit it is measured against. The preferred
      # anchor is _kept_slot_age vs ISOLATE_KEEP_TTL; when that anchor is not
      # a number, fall back to the slot-dir mtime vs ISOLATE_STALE_THRESHOLD
      # so an unanchored kept slot still ages out.
      local age limit
      age="$(_kept_slot_age "$slot")"
      if [[ "$age" =~ ^[0-9]+$ ]]; then
        limit="$ISOLATE_KEEP_TTL"
      else
        local dir_stamp
        dir_stamp="$(_file_mtime "$entry")"
        if [[ ! "$dir_stamp" =~ ^[0-9]+$ ]]; then
          # Nothing to measure against at all: stay `kept`.
          printf 'kept\n'
          return 0
        fi
        age=$(( $(date +%s) - dir_stamp ))
        limit="$ISOLATE_STALE_THRESHOLD"
      fi

      if [ "$age" -gt "$limit" ]; then
        printf 'stale\n'
      else
        printf 'kept\n'
      fi
      return 0
    fi
  fi

  # No running containers (or no project recorded): the owner verdict decides.
  # A numeric pid that is dead, reused, or alive-but-unverifiable counts as
  # proof the original owner is gone; only `absent` continues to the
  # project / age fallbacks below.
  case "$owner_state" in
    alive)
      printf 'live\n'
      return 0
      ;;
    dead | reused | unverifiable)
      printf 'stale\n'
      return 0
      ;;
  esac

  # Owner is `absent`. A recorded project with NO pid file at all is stale:
  # the claim writes the pid file before the project record. A pid file that
  # exists but is empty/non-numeric is not this case and defers to slot age.
  if [[ -n "$project" && ! -f "$entry/pid" ]]; then
    printf 'stale\n'
    return 0
  fi

  local entry_stamp
  entry_stamp="$(_file_mtime "$entry")"
  if [[ "$entry_stamp" =~ ^[0-9]+$ ]]; then
    local entry_age
    entry_age=$(( $(date +%s) - entry_stamp ))
    if [ "$entry_age" -gt "$ISOLATE_STALE_THRESHOLD" ]; then
      printf 'stale\n'
      return 0
    fi
  fi

  printf 'inconclusive\n'
  return 0
}

# Sweep stale slots. Caller (_claim_isolate_slot) MUST hold .sweep.lock.
#
# Walks every numeric entry under ISOLATE_SLOT_DIR, classifies it with
# _slot_liveness, leaves live / kept / inconclusive slots alone, and hands
# every stale slot to _reap_isolate_slot after logging the reason.
_sweep_isolate_slots() {
  # Staleness signals, in order:
  #   1. Compose-project liveness: RUNNING containers always protect the slot
  #      (this is what keeps a --keep'd stack — owning process gone, containers
  #      still up — from being stolen). RUNNING only, deliberately (`docker ps
  #      -q`, not `-aq`): exited containers from crashed runs must not protect
  #      dead slots forever, so a kept stack whose containers were STOPPED
  #      (docker stop, daemon restart, reboot) is reclaimed — with its
  #      remnants composed down by _reap_isolate_slot. A docker failure is NOT
  #      "no containers": if we cannot ask, we leave the slot alone.
  #   2. Owning-PID liveness: a live owning PID always protects the slot. This
  #      matters because apply_isolation records the project BEFORE any
  #      container starts (image builds can take minutes), so "project recorded
  #      + zero containers" alone is NOT proof of staleness.
  #   3. Age: fallback when the pid check is inconclusive — the pid file is
  #      missing on a slot with no recorded project (legacy slots predating
  #      the "project" file), or the pid file EXISTS but its contents are
  #      empty/non-numeric on ANY slot (possibly a live owner whose pid write
  #      was truncated — inconclusive, so it defers to the age fallback
  #      rather than being reaped immediately; once the slot is older than
  #      ISOLATE_STALE_THRESHOLD it IS reaped, inconclusive pid and all,
  #      so such slots don't leak forever). A project-recorded slot
  #      with NO pid file at all is reaped directly: the claim writes the pid
  #      file before the project record, so its absence means the owner state
  #      is genuinely gone.
  local sweep_lock="$ISOLATE_SLOT_DIR/.sweep.lock"
  local slot_entry
  for slot_entry in "$ISOLATE_SLOT_DIR"/[0-9]*; do
    [ -d "$slot_entry" ] || continue
    # Heartbeat: refresh the lock mtime at the top of every iteration so a
    # LIVE sweep never looks over-age to a concurrent claimant. A full sweep
    # makes up to 46 `docker ps` calls; a wedged daemon can stretch that past
    # ISOLATE_SWEEP_LOCK_STALE_THRESHOLD, and without the heartbeat the
    # claimant would "take over" the lock from a sweeper that is still
    # running. Refresh-only, NEVER create: -c behind the -d guard. A bare
    # `touch` here used to RECREATE the lock as a plain FILE when a takeover
    # mv'd the dir away mid-iteration — the takeover's mkdir then failed
    # against the file and sweeping wedged until the 60s over-age self-heal.
    # Failure/vanished lock is non-fatal (_release_sweep_lock handles the
    # taken-over/vanished cases on the way out).
    [ -d "$sweep_lock" ] && touch -c "$sweep_lock" 2>/dev/null || true
    local slot_name
    slot_name="$(basename "$slot_entry")"
    local liveness
    liveness="$(_slot_liveness "$slot_name")"
    if [ "$liveness" = "live" ] || [ "$liveness" = "kept" ] || [ "$liveness" = "inconclusive" ]; then
      # `live` → in active use (running containers + live verified owner, or a
      # live verified owner mid-build). `kept` → running containers whose owner
      # is gone/unprovable — a --keep'd stack — protected until it outlives
      # ISOLATE_KEEP_TTL, at which point _slot_liveness returns `stale` and the
      # reap path below (with the loud kept-past-TTL warning) fires.
      # `inconclusive` → docker-ps failure (already warned by _slot_liveness),
      # or a slot dir that vanished mid-check, or a fresh-but-not-yet-aged slot
      # whose pid write hasn't landed. Either way: leave it alone.
      continue
    fi
    # Stale. Re-derive the evidence to emit the exact reason in the info line
    # before reaping. The reads here mirror _slot_liveness — kept in the
    # sweeper so the helper stays purely classifying.
    local slot_proj has_proj=false
    slot_proj="$(cat "$slot_entry/project" 2>/dev/null || true)"
    [ -n "$slot_proj" ] && has_proj=true
    local slot_pid_file="$slot_entry/pid"
    local slot_pid="" pid_file_present=false
    if [ -f "$slot_pid_file" ]; then
      pid_file_present=true
      slot_pid="$(cat "$slot_pid_file" 2>/dev/null || true)"
    fi
    # Owner evidence for the log lines. With a numeric pid, name the shared
    # owner verdict so the reason matches the classifier exactly; without one
    # the owner is `absent` and there is no verdict to query.
    local pid_is_numeric=false owner_verdict="" owner_desc="owner PID absent"
    if [[ "$slot_pid" =~ ^[0-9]+$ ]]; then
      pid_is_numeric=true
      owner_verdict="$(_owner_liveness "$slot_name")"
      owner_desc="owner PID $slot_pid $owner_verdict"
    fi
    # Distinguish a kept stack reaped PAST its TTL: a recorded project whose
    # containers are STILL RUNNING, yet liveness came back `stale`. The
    # classifier only says `stale` for a running-container slot on the
    # keep-TTL transition (a forgotten `--keep` leak), and it does so for ANY
    # non-alive owner state — including an absent/empty/non-numeric pid. So
    # this check runs BEFORE the pid branching: otherwise a pid-less kept stack
    # would be logged as "no live containers" (false) or dropped into the
    # slot-dir age fallback and possibly not reaped at all. Emit a LOUD warning
    # naming project / age / TTL so the leak is visible, not a quiet info line.
    if [ "$has_proj" = true ]; then
      local running_containers=""
      running_containers="$(docker ps -q --filter "label=com.docker.compose.project=$slot_proj" 2>/dev/null || true)"
      if [ -n "$running_containers" ]; then
        local kept_age
        kept_age="$(_kept_slot_age "$slot_name")"
        [[ "$kept_age" =~ ^[0-9]+$ ]] || kept_age="?"
        warn "reaping kept stack '$slot_proj' (slot $slot_name): $owner_desc, containers still running, age ${kept_age}s > keep TTL ${ISOLATE_KEEP_TTL}s — forgotten --keep leak"
        _reap_isolate_slot "$slot_entry" "$slot_proj"
        continue
      fi
    fi
    if [ "$pid_is_numeric" = true ]; then
      # The classifier called this stale with a numeric pid and no running
      # containers: the owner is dead, the pid was reused, or it is
      # alive-but-unverifiable (no pid.start).
      info "Attempting to reclaim stale slot $slot_name (PID $slot_pid owner $owner_verdict)"
      _reap_isolate_slot "$slot_entry" "$slot_proj"
      continue
    fi
    if [ "$has_proj" = true ] && [ "$pid_file_present" = false ]; then
      # Project recorded, no live containers, and no pid file at all — the
      # claim writes the pid file BEFORE the project record, so a missing pid
      # file means the owner state is genuinely gone. A pid file that EXISTS
      # but is empty/non-numeric is NOT the same thing: it may be a live owner
      # mid-build whose pid write was truncated — that case is INCONCLUSIVE
      # and falls through to the age fallback below instead of being reaped.
      info "Attempting to reclaim stale slot $slot_name (project $slot_proj has no live containers and no recorded owner)"
      _reap_isolate_slot "$slot_entry" "$slot_proj"
      continue
    fi
    # Fallback: age-based cleanup when the pid check is inconclusive (pid file
    # missing on a project-less legacy slot, or present-but-empty/non-numeric
    # contents on any slot). Capture the mtime with a
    # failure guard: a concurrent release can rm -rf the slot between our glob
    # and this stat, and an empty substitution inside $(( )) is a syntax error
    # that would kill the whole CLI under `set -e`. A vanished slot needs no
    # reaping — skip it.
    local slot_mtime
    slot_mtime="$(_file_mtime "$slot_entry")"
    [[ "$slot_mtime" =~ ^[0-9]+$ ]] || continue
    local slot_age
    slot_age=$(( $(date +%s) - slot_mtime ))
    if [ "$slot_age" -gt "$ISOLATE_STALE_THRESHOLD" ]; then
      # Surface WHY the pid check was inconclusive — it's the evidence that
      # routed this slot to the age fallback in the first place.
      local pid_evidence="no pid file"
      if [ "$pid_file_present" = true ]; then
        pid_evidence="pid file present but empty/non-numeric"
      fi
      info "Attempting to reclaim stale slot $slot_name (age ${slot_age}s > ${ISOLATE_STALE_THRESHOLD}s; owner-pid check inconclusive: $pid_evidence)"
      _reap_isolate_slot "$slot_entry" "$slot_proj"
    fi
  done
}

# Give back the isolation slot this process claimed (ISOLATE_SLOT) by removing
# its directory, then clear ISOLATE_SLOT. Best-effort: a failed removal is
# ignored.
#
# Only the per-slot directory is removed. The parent slots dir stays on
# purpose: deleting it raced a concurrent claimer between its `mkdir -p` of the
# parent and its per-slot mkdir, so every slot mkdir failed ENOENT and that
# claimer died with "No isolation slots available". An empty slots dir under
# XDG state costs nothing.
_release_isolate_slot() {
  if [[ -n "$ISOLATE_SLOT" && -d "$ISOLATE_SLOT_DIR/$ISOLATE_SLOT" ]]; then
    rm -rf "$ISOLATE_SLOT_DIR/$ISOLATE_SLOT" 2>/dev/null || true
  fi
  ISOLATE_SLOT=""
}

# Print every host port that the given isolation slot will bind, one per line.
# Includes all slug ports from PORTS_FILE and the four infra base ports.
# Each output port = base + (slot+1)*200.
#
# Arguments: $1 - slot number (decimal, 0..ISOLATE_MAX_SLOT)
# Globals:   PORTS_FILE, ISOLATE_MAX_SLOT (read)
# Outputs:   one port per line on stdout: slug ports first, then infra ports
# Errors:    calls `die` for a non-numeric or out-of-range slot
_slot_offset_ports() {
  local slot="${1:?slot required}"

  # Validate: must be a non-negative integer. The regex is anchored to the
  # WHOLE string; a line-based `grep` check would accept a multi-line value as
  # soon as any single line was numeric.
  if [[ ! "$slot" =~ ^[0-9]+$ ]]; then
    die "_slot_offset_ports: slot must be a non-negative integer, got: $slot"
  fi
  if [ "$slot" -gt "$ISOLATE_MAX_SLOT" ]; then
    die "_slot_offset_ports: slot $slot exceeds ISOLATE_MAX_SLOT ($ISOLATE_MAX_SLOT)"
  fi

  # Force base 10: a zero-padded slot such as "08" passes the checks above
  # (`[ -gt ]` reads decimal) but is invalid octal inside $(( )).
  local offset=$(( (10#$slot + 1) * 200 ))
  local infra_ports=(4010 8090 3210 8081)
  # Loop variable for both loops below; local so a caller's `base` survives.
  local base

  # Slug ports from PORTS_FILE: jq when available, otherwise a grep/sed
  # scrape of `"key": <digits>` pairs.
  local port_values
  if command -v jq &>/dev/null; then
    port_values="$(jq -r 'to_entries[] | .value' "$PORTS_FILE" 2>/dev/null)"
  else
    port_values="$(grep -o '"[^"]*"[[:space:]]*:[[:space:]]*[0-9]*' "$PORTS_FILE" | sed 's/.*:[[:space:]]*//')"
  fi

  while IFS= read -r base; do
    [ -z "$base" ] && continue
    printf '%d\n' $(( base + offset ))
  done <<< "$port_values"

  # Infra ports
  for base in "${infra_ports[@]}"; do
    printf '%d\n' $(( base + offset ))
  done
}

# _slot_ports_free <slot> [precomputed_liveness]
#
# Checks every port the slot would bind (as listed by _slot_offset_ports) for
# TCP listeners via lsof. Returns 0 when no port is held by a foreign process,
# 1 otherwise, and emits one `info` line per foreign listener. Dies when lsof
# is missing or the port list cannot be enumerated.
#
# The optional second argument is a liveness verdict the caller has already
# computed for this slot. When it is non-empty it is used as-is; when empty or
# omitted, _slot_liveness is invoked at most once, and only if a docker-like
# listener is actually encountered on a slot with a recorded project.
_slot_ports_free() {
  local slot="${1:?slot required}"
  local precomputed_liveness="${2:-}"
  command -v lsof &>/dev/null || die "--isolate requires lsof; install it"

  # The compose project recorded for this slot, if any.
  local own_project=""
  local project_file="$ISOLATE_SLOT_DIR/$slot/project"
  if [[ -f "$project_file" ]]; then
    own_project="$(cat "$project_file" 2>/dev/null || true)"
  fi

  local verdict="$precomputed_liveness"
  local foreign_hold=false

  # Enumerate ports via command substitution (not process substitution) so a
  # failure inside _slot_offset_ports — e.g. a `die` on a bad slot — reaches us
  # as a non-zero status and is re-raised, instead of yielding an empty port
  # list that would read as "everything is free".
  local port_list
  port_list="$(_slot_offset_ports "$slot")" \
    || die "_slot_ports_free: could not enumerate ports for slot $slot"

  local port holders holder_line holder_cmd
  while IFS= read -r port; do
    [[ -n "$port" ]] || continue
    # Drop lsof's header row; an empty result means nobody listens here.
    holders="$(lsof -i :"$port" -sTCP:LISTEN -P -n 2>/dev/null | tail -n +2 || true)"
    [[ -n "$holders" ]] || continue

    while IFS= read -r holder_line; do
      [[ -n "$holder_line" ]] || continue
      holder_cmd="$(awk '{print $1}' <<< "$holder_line")"

      # Own-project filter. A docker-ish listener is the slot's OWN binding
      # when the slot has a recorded project whose verdict is `live` or `kept`
      # (a --keep'd stack must not see its own published ports as foreign).
      # `com\.docke` covers macOS lsof truncating `com.docker.*` command names
      # to nine characters.
      if [[ -n "$own_project" ]] && grep -qiE 'docker|com\.docke' <<< "$holder_cmd"; then
        [[ -n "$verdict" ]] || verdict="$(_slot_liveness "$slot")"
        case "$verdict" in
          live | kept) continue ;;
        esac
      fi

      info "Slot $slot port $port held by $holder_cmd"
      foreign_hold=true
    done <<< "$holders"
  done <<< "$port_list"

  if [[ "$foreign_hold" == true ]]; then
    return 1
  fi
  return 0
}

# _slot_state <slot> — emit one pipe-delimited line describing the slot:
#   slot|dir|pid|liveness|ports|offset|project
# Always exits 0. For an absent slot dir, ports is "-" (no probe) to keep the
# `bin/showcase slots` table tidy.
_slot_state() {
  local slot="${1:?slot required}"
  local slot_entry="$ISOLATE_SLOT_DIR/$slot"

  local dir="absent"
  [ -d "$slot_entry" ] && dir="present"

  # PID annotation derived from the SHARED _owner_liveness helper so the
  # table's render can never diverge from the classifier's verdict. The four
  # owner outputs map to exactly three render tokens:
  #   alive                  → <pid>          (start-time-verified our owner)
  #   reused                 → <pid>(reused)  (start-time mismatch — recycled)
  #   dead | unverifiable    → <pid>(dead)    (ESRCH/EPERM, or no pid.start)
  # `absent` (no numeric pid) keeps the bare "-". A `(dead)` annotation can
  # accompany EITHER LIVE=kept (dead owner + running containers) or LIVE=stale
  # (dead owner + no containers).
  local pid="-"
  if [ -f "$slot_entry/pid" ]; then
    local raw_pid
    raw_pid="$(cat "$slot_entry/pid" 2>/dev/null || true)"
    if [[ "$raw_pid" =~ ^[0-9]+$ ]]; then
      local owner
      owner="$(_owner_liveness "$slot")"
      case "$owner" in
        alive)            pid="$raw_pid" ;;
        reused)           pid="${raw_pid}(reused)" ;;
        dead|unverifiable) pid="${raw_pid}(dead)" ;;
        *)                pid="$raw_pid" ;;
      esac
    fi
  fi

  local project="-"
  if [ -f "$slot_entry/project" ]; then
    local raw_proj
    raw_proj="$(cat "$slot_entry/project" 2>/dev/null || true)"
    if [ -n "$raw_proj" ]; then
      project="$raw_proj"
    fi
  fi