CopilotKit/.github/workflows/showcase_validate.yml at main · samuelson-chen/CopilotKit · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
name: "Showcase: Validate"

# Triggers: pull requests, and pushes to main. Both are path-filtered to the
# showcase sources, the integration examples, the workspace/lockfile
# metadata, and the two showcase workflow files themselves.
on:
  pull_request:
    paths:
      - "showcase/**"
      - "examples/integrations/**"
      - "package.json"
      - "pnpm-lock.yaml"
      - "pnpm-workspace.yaml"
      - ".github/workflows/showcase_validate.yml"
      - ".github/workflows/showcase_deploy.yml"
  push:
    branches:
      - main
    paths:
      - "showcase/**"
      - "examples/integrations/**"
      - "package.json"
      - "pnpm-lock.yaml"
      - "pnpm-workspace.yaml"
      - ".github/workflows/showcase_validate.yml"
      - ".github/workflows/showcase_deploy.yml"

# Least-privilege by default. Individual jobs/steps can widen when needed.
permissions:
  contents: read

# Split concurrency per event so main-branch push runs are never canceled
# mid-execution (we need Slack failure alerts to fire reliably). PR runs
# still cancel in progress to keep PR CI responsive.
# The group key combines the ref and the event name, so a pull_request run
# and a push run never land in the same group.
# NOTE(review): with cancel-in-progress false, a *queued* (not yet started)
# push run can presumably still be replaced by a newer queued run in the
# same group — confirm against GitHub's concurrency documentation.
concurrency:
  group: showcase-validate-${{ github.ref }}-${{ github.event_name }}
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}

jobs:
  validate:
    # The "Extract failure details for Slack" step looks this job up by this
    # exact display name; keep the two in sync when renaming.
    name: Validate Showcase
    # Runs on Depot (Startup plan, unlimited minutes) for a persistent
    # pnpm/npm cache across runs: cold ubuntu-latest runs took ~18-20m, Depot
    # typically brings that to ~5-8m. The 25m timeout is kept as headroom.
    runs-on: depot-ubuntu-24.04-4
    timeout-minutes: 25
    permissions:
      contents: read
      # Depot OIDC auth (runs-on: depot-ubuntu-*) requires id-token: write.
      id-token: write
    # Expose the Slack webhook as an env var so step-level `if:` expressions
    # can test it — `secrets.*` is not a valid named-value inside `if:` and
    # causes a workflow startup failure on push events.
    env:
      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK_OSS_ALERTS }}
    defaults:
      run:
        # Lock the shell explicitly so `set -euo pipefail` + `mapfile` behave
        # the same across any future runner image changes (bash is already
        # the default on ubuntu).
        shell: bash

    steps:
      - name: Checkout
        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
        with:
          # Keep the checkout token out of the local git config; the one later
          # step that needs a token (Slack failure extraction) passes
          # GH_TOKEN explicitly.
          persist-credentials: false

      - name: Setup Node.js
        uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
        with:
          node-version: 22
          # Cache npm for the showcase/shell `npm ci` step below (shell is
          # NOT a pnpm workspace member; it ships its own package-lock.json).
          cache: "npm"
          cache-dependency-path: showcase/shell/package-lock.json

      - name: Setup pnpm
        # Pinned to a full commit SHA (tagged v6.0.8) rather than a floating
        # major tag so that a silent upstream major/minor change can't alter
        # install semantics on a random CI run. Bump deliberately when
        # refreshing the toolchain.
        uses: pnpm/action-setup@0e279bb959325dab635dd2c09392533439d90093 # v6.0.8

      - name: Verify lockfile is up to date
        # `--frozen-lockfile` makes the install fail instead of rewriting
        # pnpm-lock.yaml when it is out of sync with the manifests. This is
        # also the workspace install that the later `pnpm exec tsx` /
        # `pnpm exec vitest` steps resolve their binaries from.
        run: pnpm install --frozen-lockfile --ignore-scripts

      - name: Enforce e2e spec count (baseline per package)
        # Fails the job when any showcase/integrations/<pkg>/ that has a src/
        # directory either lacks tests/e2e/ or holds fewer top-level
        # *.spec.ts files than `baselineDemoCount` from
        # showcase/scripts/fail-baseline.json. Every offending package is
        # reported before the step exits.
        # Exit codes: 2 or 3 when the baseline cannot be read (node's code is
        # passed through), 1 for any package failure or when no package with
        # src/ was found, 0 otherwise.
        run: |
          set -euo pipefail
          shopt -s nullglob
          # Single source of truth: showcase/scripts/fail-baseline.json
          # `baselineDemoCount` is read here AND by validate-parity.ts so the
          # per-package e2e-spec-count floor cannot drift between CI and the
          # validator. If parsing fails we distinguish JSON syntax errors
          # from schema failures (missing/non-integer/negative field).
          set +e
          MIN=$(node -e "
            let v;
            try {
              v = require('./showcase/scripts/fail-baseline.json');
            } catch (e) {
              console.error('fail-baseline.json: JSON syntax error: ' + e.message);
              process.exit(2);
            }
            const n = v.baselineDemoCount;
            if (typeof n !== 'number' || !Number.isInteger(n) || n < 0) {
              console.error('fail-baseline.json: schema failure: baselineDemoCount must be a non-negative integer');
              process.exit(3);
            }
            console.log(n);
          ")
          rc=$?
          set -e
          if [ "$rc" -ne 0 ]; then
            # Preserve node's distinct rc (2=JSON syntax, 3=schema) in the
            # annotation so the CI log pinpoints the cause without re-running.
            echo "::error::Failed to read baselineDemoCount from showcase/scripts/fail-baseline.json (node exit=$rc; 2=JSON syntax, 3=schema)"
            exit "$rc"
          fi
          failed=0
          found=0
          for pkg_dir in showcase/integrations/*/; do
            [ -d "$pkg_dir" ] || continue
            pkg=$(basename "$pkg_dir")
            # Skip manifest-only packages (no src/ directory) — these are
            # virtual/meta integrations (e.g. built-in-agent) that carry no
            # source code or demos and therefore have no e2e specs to enforce.
            if [ ! -d "${pkg_dir}src" ]; then
              echo "skip: $pkg (manifest-only, no src/)"
              continue
            fi
            found=$((found + 1))
            e2e_dir="${pkg_dir}tests/e2e/"
            if [ ! -d "$e2e_dir" ]; then
              echo "::error file=$pkg_dir::Package '$pkg' is missing tests/e2e/ directory (required for baseline e2e coverage)"
              failed=1
              continue
            fi
            # Capture `find` output into a variable first so we can check
            # its exit status directly. Bash process substitution (used with
            # `mapfile < <(cmd)`) does NOT propagate the producer's exit
            # status to the parent shell — `mapfile` only reports its own
            # usage errors — so a failing `find` (EACCES on a subdir, ELOOP,
            # transient I/O) would have been silently treated as "zero
            # specs" and surfaced as the misleading "minimum required"
            # error instead of the real root cause. Command substitution
            # propagates `find`'s status via `$?` on the assignment, which
            # we check immediately. A zero-spec result is a legitimate
            # success from `find` and is handled by the `$count -lt $MIN`
            # check below, not treated as a find failure.
            # Aggregate find failures with the rest of the per-package
            # failure modes (missing tests/e2e/, below-MIN count) so one bad
            # package doesn't short-circuit reporting for the others. A
            # single CI run should surface every problematic package at
            # once; `exit "$failed"` at the end of the loop reports the
            # aggregate.
            if ! find_out=$(find "$e2e_dir" -maxdepth 1 -type f -name '*.spec.ts'); then
              echo "::error file=$e2e_dir::find failed while enumerating specs for '$pkg'"
              failed=1
              continue
            fi
            specs=()
            # Only populate the array if `find` produced output; `mapfile
            # <<< ""` would otherwise create a single empty element and
            # inflate the count by one.
            if [ -n "$find_out" ]; then
              mapfile -t specs <<< "$find_out"
            fi
            count=${#specs[@]}
            if [ "$count" -lt "$MIN" ]; then
              echo "::error file=$e2e_dir::Package '$pkg' has $count e2e spec(s); minimum required is $MIN"
              failed=1
            else
              echo "ok: $pkg has $count spec(s)"
            fi
          done
          if [ "$found" -eq 0 ]; then
            echo "::error::No showcase/integrations/*/ directories found — baseline check cannot run"
            exit 1
          fi
          exit "$failed"

      # Gating check. MUST failures (missing manifest, missing src/app/demos
      # dir) exit 1 and fail the PR; SHOULD deviations print warnings and
      # exit 0. The full policy lives in showcase/scripts/validate-parity.ts.
      #
      # tsx is resolved through `pnpm exec` so it comes from the
      # pnpm-lock.yaml-pinned workspace install; `npx tsx` could fetch a
      # drifting version on a registry cache miss.
      - name: Run validate-parity (MUST checks gating)
        run: pnpm exec tsx validate-parity.ts
        working-directory: showcase/scripts

      # For every aimock fixture, the returned tool-call names are
      # cross-referenced against the tool surface of each demo whose
      # suggestion prompt contains the fixture's match substring. This catches
      # the class of drift behind the 2026-04-22 regression, where generic
      # substring matches (e.g. "pie chart") cross-fired across demos with
      # different tool surfaces and left the UI blank in prod.
      # See showcase/scripts/validate-fixture-tool-surface.ts and the
      # postmortem linked from there.
      - name: Run validate-fixture-tool-surface (aimock drift)
        run: pnpm exec tsx validate-fixture-tool-surface.ts
        working-directory: showcase/scripts

      - name: Run validate-pins (ratchet)
        working-directory: showcase/scripts
        # Ratchet gate on pin drift. Baseline (count + SHA-256 hash of sorted
        # unique FAIL lines) lives in `showcase/scripts/fail-baseline.json`;
        # see that file for the full ratchet semantics and adjustment
        # procedure. Weekly backlog visibility is provided by
        # `.github/workflows/showcase_drift-report.yml`. Driving the drift to
        # zero (and flipping this advisory ratchet to fully enforcing) is
        # future work.
        #
        # Outcome summary: the step passes only when the current FAIL count
        # AND the hash of the FAIL set both equal the baseline. A higher
        # count, a lower count (baseline must be ratcheted down), or an equal
        # count with a different hash all exit 1. Baseline-read failures exit
        # 2/3/4 and unexpected validator exit codes are passed through.
        run: |
          set -euo pipefail

          # --- Load + validate baseline -----------------------------------
          # `node -e` prints either a validated value or an error marker
          # we match below. We deliberately do NOT let require() throw
          # out of the subshell; we format a clean CI error instead.
          #
          # We distinguish three failure modes with distinct exit codes so
          # the CI log pinpoints the cause without requiring a re-run:
          #   exit 2 => JSON syntax error (require() threw)
          #   exit 3 => schema failure (missing/wrong-typed required field)
          #   exit 4 => unexpected/unknown top-level field (typo guard)
          #
          # The unexpected-field check rejects silent typos like
          # `validatepinsfailcount` or an accidentally-added `comment`
          # field (distinct from the allowed leading underscore
          # `_comment`) that would otherwise leave required fields
          # undefined and be caught only via the schema branch with a
          # more confusing message.
          set +e
          baseline_json=$(node -e "
            const ALLOWED = ['_comment', 'validatePinsFailCount', 'validatePinsFailHash', 'baselineDemoCount'];
            let v;
            try {
              v = require('./fail-baseline.json');
            } catch (e) {
              console.error('fail-baseline.json: JSON syntax error: ' + e.message);
              process.exit(2);
            }
            const unexpected = Object.keys(v).filter(k => !ALLOWED.includes(k));
            if (unexpected.length > 0) {
              console.error('fail-baseline.json: unexpected field(s): ' + unexpected.join(', ') + '. Allowed fields: ' + ALLOWED.join(', '));
              process.exit(4);
            }
            const c = v.validatePinsFailCount;
            const h = v.validatePinsFailHash;
            if (typeof c !== 'number' || !Number.isInteger(c) || c < 0) {
              console.error('fail-baseline.json: schema failure: validatePinsFailCount must be a non-negative integer');
              process.exit(3);
            }
            if (typeof h !== 'string' || !/^[0-9a-f]{64}$/.test(h)) {
              console.error('fail-baseline.json: schema failure: validatePinsFailHash must be a 64-char lowercase hex SHA-256');
              process.exit(3);
            }
            console.log(JSON.stringify({ count: c, hash: h }));
          ")
          rc=$?
          set -e
          if [ "$rc" -ne 0 ]; then
            # Preserve node's distinct rc (2=JSON syntax, 3=schema, 4=unexpected field)
            # in the annotation so the CI log pinpoints the cause.
            echo "::error::fail-baseline.json failed validation (node exit=$rc; 2=JSON syntax, 3=schema, 4=unexpected field)"
            exit "$rc"
          fi
          baseline=$(node -e "console.log(JSON.parse(process.argv[1]).count)" "$baseline_json")
          baseline_hash=$(node -e "console.log(JSON.parse(process.argv[1]).hash)" "$baseline_json")

          # --- Run validator; separate internal crash from pin-drift exit -
          # validate-pins exits 0 when FAIL=0, 1 when FAIL>0. Anything else
          # (2+, uncaught throw, node crash, SIGSEGV) is an internal failure
          # we must surface distinctly from a legitimate drift report.
          #
          # We deliberately keep stdout and stderr in separate variables.
          # validate-pins.ts emits progress/summary on stdout and `[FAIL]`
          # lines on stderr; mingling them with `2>&1` allowed progress
          # chatter (or future stdout additions) to corrupt the hash input.
          # The hash is computed strictly from stderr.
          set +e
          stderr_file=$(mktemp)
          stdout=$(pnpm exec tsx validate-pins.ts 2>"$stderr_file")
          rc=$?
          stderr=$(cat "$stderr_file")
          rm -f "$stderr_file"
          set -e
          # Replay both streams to the job log so humans can debug.
          printf '%s\n' "$stdout"
          printf '%s\n' "$stderr" >&2
          if [ "$rc" -ne 0 ] && [ "$rc" -ne 1 ]; then
            # Preserve validate-pins.ts's distinct exit code (2=EXIT_INTERNAL,
            # 3=EXIT_UNREADABLE, 4+=future) so downstream consumers can
            # distinguish "validator crashed" from "pin drift found" (which
            # would be rc=1). Collapsing to `exit 1` would make an internal
            # crash indistinguishable from legitimate drift in the PR check
            # signal.
            echo "::error::validate-pins.ts exited with unexpected code $rc (expected 0 or 1). This indicates an internal failure, not pin drift."
            exit "$rc"
          fi

          # --- Parse Summary line (actual FAIL count) ---------------------
          # Summary line is on stdout. If the validator output format
          # changed (missing Summary, non-numeric FAIL), fail loudly
          # instead of silently treating it as zero.
          #
          # Scope grep's no-match tolerance to grep alone by wrapping just
          # the grep stage in a `{ ... || true; }` group. A trailing
          # `|| true` on the whole pipeline would defeat `pipefail` and
          # swallow producer/head failures too; we only want to tolerate
          # grep finding no match (which `[ -z "$summary_line" ]` below
          # already reports with a precise error).
          summary_line=$(printf '%s\n' "$stdout" | { grep -E '^[[:space:]]*Summary:' || true; } | head -n 1)
          if [ -z "$summary_line" ]; then
            echo "::error::Could not find validate-pins 'Summary:' line in output"
            exit 1
          fi
          # Word-boundary anchored to avoid matching e.g. `NEWFAIL=` or
          # `TOTALFAIL=` if such tokens are ever added to the Summary line.
          #
          # grep's no-match exit is tolerated here too, scoped to grep alone
          # exactly as for the Summary lookup above: under `set -e` +
          # `pipefail` a bare no-match would abort the script on this
          # assignment and the explicit "Could not parse" error below would
          # never be printed.
          actual=$(printf '%s\n' "$summary_line" | { grep -oE '\bFAIL=[0-9]+\b' || true; } | head -n 1 | cut -d= -f2)
          if [ -z "${actual:-}" ] || ! [[ "$actual" =~ ^[0-9]+$ ]]; then
            echo "::error::Could not parse FAIL=<int> from Summary line: $summary_line"
            exit 1
          fi

          # --- Compute tuple hash of current FAIL set ---------------------
          # Hash the sorted, deduplicated `[FAIL] ...` lines (stderr only).
          # This catches the "count equal but set drifted" case: one FAIL
          # healed while another regressed.
          #
          # Scope grep's no-match tolerance to grep alone by wrapping just
          # the grep stage in a `{ ... || true; }` group. A trailing
          # `|| true` on the whole pipeline would defeat `pipefail` and
          # swallow sort/shasum/cut failures too; clean runs with zero
          # `[FAIL]` lines must not be an error, so we tolerate grep's
          # no-match here and only here.
          actual_hash=$(printf '%s\n' "$stderr" | { grep -E '^\[FAIL\]' || true; } | LC_ALL=C sort -u | shasum -a 256 | cut -d' ' -f1)

          echo "validate-pins FAIL: actual=$actual baseline=$baseline"
          echo "validate-pins HASH: actual=$actual_hash baseline=$baseline_hash"

          if [ "$actual" -gt "$baseline" ]; then
            echo "::error::Pin drift increased: $actual FAIL(s) vs baseline $baseline. Fix the new drift or, with explicit sign-off, update showcase/scripts/fail-baseline.json (bump validatePinsFailCount to $actual and validatePinsFailHash to $actual_hash)."
            exit 1
          fi
          if [ "$actual" -lt "$baseline" ]; then
            echo "::error::Pin drift decreased: $actual FAIL(s) vs baseline $baseline. Ratchet down the baseline in showcase/scripts/fail-baseline.json (set validatePinsFailCount=$actual, validatePinsFailHash=$actual_hash)."
            exit 1
          fi
          if [ "$actual_hash" != "$baseline_hash" ]; then
            echo "::error::Pin drift SET changed (count equal at $actual, hash differs). One FAIL healed while another regressed — net zero on the counter but the failing tuples are not the same set. Update showcase/scripts/fail-baseline.json (validatePinsFailHash=$actual_hash) if this is intentional, or fix the new drift."
            echo "--- FAIL lines (current) ---"
            # Same scoped tolerance: a no-match grep (e.g. count 0 with a
            # stale baseline hash) must not pre-empt the explicit `exit 1`
            # below via pipefail.
            printf '%s\n' "$stderr" | { grep -E '^\[FAIL\]' || true; } | LC_ALL=C sort -u
            exit 1
          fi
          echo "Pin drift unchanged at baseline ($baseline, hash $baseline_hash)."

      # vitest is resolved through pnpm (the workspace install pinned via
      # pnpm-lock.yaml) rather than `npx`, which could fetch a different
      # version on a registry cache miss.
      - name: Run build pipeline tests
        run: pnpm exec vitest run
        working-directory: showcase/scripts

      - name: Validate manifests & generate registry
        working-directory: showcase/scripts
        # Runs generate-registry.ts with the workspace-pinned tsx; a non-zero
        # exit fails the job.
        # NOTE(review): presumably the script validates each integration
        # manifest before emitting the registry, per the step name — confirm
        # in showcase/scripts/generate-registry.ts.
        run: pnpm exec tsx generate-registry.ts

      # ADVISORY ONLY — never fail the build. The promote dropdown is
      # self-healed by the lefthook pre-commit hook; this step only warns if a
      # commit somehow lands with a drifted showcase_promote.yml `service`
      # dropdown (e.g. hook skipped). The script captures the `--check` exit
      # code under `set +e`, maps it to a warning annotation, and always ends
      # with `exit 0`, so a non-zero `--check` never reddens validate.
      - name: Advisory — promote dropdown drift check
        working-directory: showcase/scripts
        run: |
          # ADVISORY ONLY: capture the exit code without letting a non-zero
          # `--check` redden the build. `|| true` alone would discard rc and
          # collapse every failure mode into the misleading "stale, re-run"
          # warning. sync-promote-service-options.ts exits:
          #   1 => drift (dropdown out of date; re-running the generator fixes it)
          #   2 => read error
          #   3 => missing/duplicate/malformed marker block (corruption)
          # Only rc=1 is actually self-heals-by-rerun; rc>=2 needs a human, and
          # the suggested re-run would itself fail — so report it distinctly.
          set +e
          pnpm exec tsx sync-promote-service-options.ts --check
          rc=$?
          set -e
          if [ "$rc" -eq 1 ]; then
            echo "::warning::showcase_promote.yml service dropdown is stale. Run: npx tsx showcase/scripts/sync-promote-service-options.ts and commit the result."
          elif [ "$rc" -ge 2 ]; then
            echo "::warning::sync-promote-service-options.ts failed (exit $rc) — marker block missing/duplicated or read error; investigate before trusting the dropdown."
          fi
          # Always succeed: this step must never fail the build (the lefthook
          # pre-commit hook self-heals; CI only warns).
          exit 0

      - name: Bundle demo content
        working-directory: showcase/scripts
        # Runs bundle-demo-content.ts with the workspace-pinned tsx.
        # NOTE(review): presumably its output is consumed by the showcase
        # shell build that follows — confirm in the script.
        run: pnpm exec tsx bundle-demo-content.ts

      # `showcase/shell` is NOT a pnpm workspace member (see pnpm-workspace.yaml)
      # and carries its own `package-lock.json`, so it is installed with
      # `npm ci` for a reproducible install (`npm install` would re-resolve
      # ranges). The npm cache is wired up in the setup-node step above via
      # `cache-dependency-path: showcase/shell/package-lock.json`.
      - name: Install showcase shell dependencies
        run: npm ci --ignore-scripts
        working-directory: showcase/shell

      - name: Build showcase shell
        working-directory: showcase/shell
        # Runs the `build` script from showcase/shell's own package.json,
        # using the dependencies installed by the preceding `npm ci` step.
        run: npm run build

      # NOTE: Slack failure alert only fires on `push` (i.e. main-branch
      # merges) by design. PR failures already surface in the PR checks UI
      # and the PR author's inbox, and we don't want PR-author noise
      # pinging the OSS alerts channel. Tradeoff: a broken PR that sneaks
      # past review won't alert Slack until after merge.
      #
      # Extract the failed step name and first meaningful error line so the
      # Slack payload is actionable at a glance rather than forcing a
      # click-through to the workflow run. Bare "X failed" alerts bury the
      # signal; red alerts must carry triage-ready detail per the oss-alerts
      # policy. Writes `failed_step` and `error_excerpt` to $GITHUB_ENV for
      # consumption by the notify step below.
      #
      # This step must NEVER fail the job (it runs on failure() already; a
      # crash here would compound the original failure with extraction
      # noise and could block the notify step). All extraction uses `|| true`
      # fallbacks so a malformed jobs response or truncated log still yields
      # sane defaults ("unknown" / "see workflow run for details").
      #
      # NOTE(review): the job-level `permissions:` grants only `contents: read`
      # and `id-token: write`; confirm GITHUB_TOKEN can still call the Actions
      # jobs/logs endpoints used below (private forks may need `actions: read`).
      # NOTE(review): `gh run view --log-failed` may refuse to return logs
      # while this job is still in progress, and `::error::` workflow commands
      # may be rendered as `##[error]` in stored logs (which the `##\[` noise
      # filter drops before the `^::error` match runs). If either holds,
      # error_excerpt always falls back to its default — confirm against a
      # real failed push run.
      - name: Extract failure details for Slack
        id: extract
        if: failure() && github.event_name == 'push' && env.SLACK_WEBHOOK != ''
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GH_REPO: ${{ github.repository }}
          RUN_ID: ${{ github.run_id }}
        run: |
          set +e  # best-effort: never block the notify step below

          # --- Find the currently-running job and its first failed step ---
          # The jobs API returns every job in the run. We identify *this*
          # job by name (matches `jobs.validate.name`) rather than
          # job.status=='in_progress', because at this point the step we're
          # running hasn't flipped the job state yet in the API. Fall back
          # to the first job with a failed step if the name match misses
          # (e.g. future rename drift).
          jobs_json=$(gh api "/repos/${GH_REPO}/actions/runs/${RUN_ID}/jobs" --paginate 2>/dev/null)
          job_id=$(printf '%s' "$jobs_json" | jq -r '
            .jobs // []
            | map(select(.name == "Validate Showcase"))
            | (.[0].id // empty)
          ' 2>/dev/null)
          if [ -z "$job_id" ]; then
            job_id=$(printf '%s' "$jobs_json" | jq -r '
              .jobs // []
              | map(select(.steps // [] | map(.conclusion) | index("failure")))
              | (.[0].id // empty)
            ' 2>/dev/null)
          fi
          failed_step=$(printf '%s' "$jobs_json" | jq -r --arg id "$job_id" '
            .jobs // []
            | map(select((.id|tostring) == $id))
            | (.[0].steps // [])
            | map(select(.conclusion == "failure"))
            | (.[0].name // "unknown step")
          ' 2>/dev/null)
          [ -z "$failed_step" ] && failed_step="unknown step"

          # --- Pull log and extract first meaningful error line ------------
          # `gh run view --log-failed` output is TSV: job\tstep\ttimestamp + content.
          # Strip the three leading columns to get the raw step output, strip
          # ANSI escape codes, strip any stray BOM, skip runner/group/env
          # header noise, then grab the first line matching a recognised
          # error marker. Truncate to ~300 chars so the Slack payload stays
          # well under the 800-char budget even with escaping overhead.
          error_excerpt="see workflow run for details"
          if [ -n "$job_id" ]; then
            log_excerpt=$(gh run view "$RUN_ID" --repo "$GH_REPO" --log-failed --job="$job_id" 2>/dev/null \
              | awk -F'\t' 'NF>=3 { sub(/^[\xEF\xBB\xBF]?[0-9T:.\-Z ]+/, "", $3); print $3 }' \
              | sed 's/\x1b\[[0-9;]*[a-zA-Z]//g' \
              | grep -vE '^(##\[|shell: |env: |Run |[[:space:]]*$)' \
              | grep -m1 -E '^\[(FAIL|ERROR)\]|^Error:|^error:|^::error' \
              | head -c 300)
            if [ -n "$log_excerpt" ]; then
              error_excerpt="$log_excerpt"
            fi
          fi

          # --- Emit to $GITHUB_ENV using heredoc delimiter -----------------
          # Heredoc delimiter protects against values that contain `=` or
          # newlines breaking the KEY=VALUE format. The delimiter is a
          # long random-ish string unlikely to appear in any log line.
          {
            echo "failed_step<<EOF_FAILED_STEP_b3f2"
            printf '%s\n' "$failed_step"
            echo "EOF_FAILED_STEP_b3f2"
            echo "error_excerpt<<EOF_ERROR_EXCERPT_b3f2"
            printf '%s\n' "$error_excerpt"
            echo "EOF_ERROR_EXCERPT_b3f2"
          } >> "$GITHUB_ENV"

          exit 0  # belt-and-suspenders: never propagate a failure

      - name: Notify Slack (failure)
        # Same gate as the extract step: only failed main-branch push runs
        # with a configured webhook post to Slack.
        if: failure() && github.event_name == 'push' && env.SLACK_WEBHOOK != ''
        uses: slackapi/slack-github-action@b0fa283ad8fea605de13dc3f449259339835fc52 # v2.1.0
        with:
          webhook: ${{ secrets.SLACK_WEBHOOK_OSS_ALERTS }}
          webhook-type: incoming-webhook
          # Defensive: wrap dynamic values via toJSON(format(...)) so that
          # if github.repository or the extracted failed_step / error_excerpt
          # contain characters that would break the JSON payload (quotes,
          # backslashes, newlines), the value is safely JSON-encoded instead
          # of injected as raw text. Matches the pattern used in
          # showcase_drift-report.yml. github.run_id is numeric so safe on
          # its own, but we wrap it for consistency and defense-in-depth.
          # env.failed_step and env.error_excerpt are populated by the
          # preceding "Extract failure details" step. The `|| '...'`
          # fallbacks repeat that step's own defaults so the message stays
          # readable even if the step never wrote to $GITHUB_ENV (an unset
          # env value evaluates to an empty string, which is falsy).
          payload: |
            { "text": ${{ toJSON(format(':x: *Showcase validate*: failed — {0}: {1} | <https://github.com/{2}/actions/runs/{3}|View run>', env.failed_step || 'unknown step', env.error_excerpt || 'see workflow run for details', github.repository, github.run_id)) }} }

      - name: Log (no Slack — webhook unset)
        # Complement of the notify step's gate: same failure/push condition,
        # but taken when the job-level SLACK_WEBHOOK env var is empty, so the
        # missing alert is at least visible as a warning annotation.
        if: failure() && github.event_name == 'push' && env.SLACK_WEBHOOK == ''
        run: |
          echo "::warning::showcase_validate failed on push but SLACK_WEBHOOK_OSS_ALERTS is not set; no Slack notification sent."

  shell-script-tests:
    name: Shell script tests (bats + shellcheck)
    # Kept as its own job (mirroring python-unit-tests) so the showcase
    # shell-script regression suite runs independently of the JS/TS validate
    # job. ubuntu-latest ships shellcheck preinstalled; bats is installed
    # from apt. These tests gate the promote-fleet.sh best-effort loop +
    # succeeded_csv export that the promote → verify-prod handoff depends on.
    runs-on: ubuntu-latest
    permissions:
      contents: read
    timeout-minutes: 5
    steps:
      - name: Checkout
        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
        with:
          persist-credentials: false

      - name: Install bats
        run: |
          sudo apt-get update
          sudo apt-get install -y bats

      # shellcheck is preinstalled on ubuntu-latest.
      - name: Shellcheck promote-fleet.sh
        run: shellcheck showcase/scripts/promote-fleet.sh

      - name: Run bats suite
        run: bats showcase/scripts/__tests__/

  python-unit-tests:
    name: Python unit tests (${{ matrix.python-version }})
    # Separate job so pre-existing `validate-parity` failures don't mask new
    # Python unit-test regressions. pytest runs independently of JS/TS checks.
    runs-on: ubuntu-latest
    timeout-minutes: 10
    permissions:
      contents: read
    strategy:
      # Fail-fast disabled so a 3.10-only regression (e.g. typing_extensions
      # fallback path breaking) doesn't cancel the 3.12 run and leave us
      # guessing which version is the actual problem.
      fail-fast: false
      matrix:
        # 3.10 covers the typing_extensions `NotRequired` fallback path used
        # by aimock_toggle.py (stdlib `NotRequired` only landed in 3.11).
        # 3.12 is the production/runner default. Pinning both guarantees we
        # catch a regression in either branch the first time it lands.
        python-version: ["3.10", "3.12"]
    steps:
      - name: Checkout
        uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
        with:
          persist-credentials: false

      - name: Setup Python
        uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
        with:
          python-version: ${{ matrix.python-version }}
          cache: "pip"
          cache-dependency-path: |
            showcase/integrations/*/requirements.txt

      - name: Install minimal test deps
        # Always need pytest + typing_extensions. pytest-asyncio is required
        # by langroid's test_agui_adapter.py (16 tests use
        # `@pytest.mark.asyncio`); without it, pytest reports
        # "async def functions are not natively supported" and skips them.
        # pytest-mock is installed pre-emptively as it's commonly used by
        # showcase package tests and is cheap to install.
        # Per-package `requirements.txt` is installed inside the run loop
        # below so tests that import runtime deps (openai, google.genai,
        # httpx, opentelemetry, etc.) don't fail at collection time with
        # ModuleNotFoundError. Conftest-based stub finders can't help
        # because test_*.py imports the target deps BEFORE conftest runs.
        run: python -m pip install --quiet pytest pytest-asyncio pytest-mock typing_extensions

      - name: Run showcase package Python unit tests
        # Keep scope narrow: only showcase/integrations/*/tests/python/ directories
        # (not e2e, not langgraph which has its own runtime). Each package has
        # its own conftest.py that wires up import paths; we cd into the pkg
        # dir so those apply.
        #
        # Before running pytest in a package we install that package's own
        # `requirements.txt` (if present) so runtime-dep imports in test modules
        # resolve. Keeps CI parity with real runtime and avoids the fragile
        # stub-finder dance conftest.py would need to do otherwise.
        #
        # The loop is best-effort: a failing package sets `failed=1` and emits
        # an `::error::` annotation naming it, then the remaining packages
        # still run. The step exits non-zero at the end if anything failed.
        run: |
          set -euo pipefail
          failed=0
          found=0
          # Current interpreter major.minor (e.g. "3.10", "3.12"). Used
          # below to skip packages whose runtime deps are incompatible
          # with the matrix Python on this job.
          py_mm=$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')
          for pkg_dir in showcase/integrations/*/; do
            tests_dir="${pkg_dir}tests/python"
            [ -d "$tests_dir" ] || continue
            # Counted before the version gate below, so a skipped package
            # still counts as "found" for the empty-suite warning.
            found=$((found + 1))
            pkg=$(basename "$pkg_dir")
            # --- Per-package Python-version gates -------------------------
            # Skip packages whose `requirements.txt` pins a dep whose
            # `requires-python` excludes this interpreter. Surgical skip
            # (not matrix exclusion) so the rest of the packages continue
            # to exercise the 3.10 typing_extensions fallback path.
            #
            # strands: ag_ui_strands==0.1.0 declares `requires-python >=3.12,<3.14`,
            # so `pip install` fails on 3.10 before pytest even runs.
            # langroid: tests import `typing.Self` (3.11+); on 3.10 the import fails
            # at collection time. typing_extensions.Self would fix it but the tests
            # are tightly coupled to the modern typing module.
            # Revisit when ag_ui_strands relaxes its floor or when 3.10 is dropped.
            if [ "$py_mm" = "3.10" ] && { [ "$pkg" = "strands" ] || [ "$pkg" = "langroid" ]; }; then
              echo "--- pytest: $pkg --- SKIPPED on Python $py_mm (requires >=3.11/3.12)"
              continue
            fi
            echo "--- pytest: $pkg ---"
            if [ -f "${pkg_dir}requirements.txt" ]; then
              echo "Installing ${pkg_dir}requirements.txt"
              python -m pip install --quiet -r "${pkg_dir}requirements.txt" || {
                echo "::error::pip install failed for $pkg"
                failed=1
                continue
              }
            fi
            # Export PYTHONPATH so `from tools import ...` in agent modules
            # resolves via the `tools` symlink at the integration root.
            # Also include src/ so `from agents.X import ...` works even if
            # a conftest.py omits the sys.path setup. Mirrors the local dev
            # convention (`PYTHONPATH=. python ...` in package.json scripts).
            if ! (cd "$pkg_dir" && PYTHONPATH=".:src:${PYTHONPATH:-}" python -m pytest tests/python/ -v); then
              # Annotate the failing package, mirroring the pip-install
              # failure path above, so it is named explicitly instead of
              # only being discoverable in the verbose pytest log.
              echo "::error::pytest failed for $pkg"
              failed=1
            fi
          done
          if [ "$found" -eq 0 ]; then
            echo "::warning::No showcase/integrations/*/tests/python/ directories found"
          fi
          exit "$failed"

  notify:
    # Slack #oss-alerts on any red. Never #engr (engr is sacred — release alerts only).
    # Mirrors the workflow-level notify pattern in showcase_promote.yml so
    # red runs surface uniformly across the showcase pipeline. The validate
    # job has its own inline (and richer) push-only notifier that extracts
    # the failing step + error excerpt; this job is the workflow-level
    # safety net that also covers the python-unit-tests matrix job and the
    # shell-script-tests job — which otherwise had no Slack signal at all.
    # Webhook empty-guard mirrors promote.yml so an unset
    # SLACK_WEBHOOK_OSS_ALERTS secret does not break the shell or red the
    # workflow on this step.
    needs: [validate, python-unit-tests, shell-script-tests]
    # always() so this job still runs when a needed job did not succeed;
    # restricted to push events.
    if: always() && github.event_name == 'push'
    runs-on: ubuntu-latest
    timeout-minutes: 3
    permissions:
      contents: read
      actions: read
    env:
      # Exposed at job level so step-level `if:` expressions can test whether
      # the secret is set via `env.SLACK_WEBHOOK`.
      SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK_OSS_ALERTS }}
    steps:
      - name: Compute state
        id: state
        env:
          VALIDATE: ${{ needs.validate.result }}
          PYTEST: ${{ needs.python-unit-tests.result }}
          # Deliberately not named SHELL: that would overwrite the standard
          # SHELL environment variable for this step's process.
          SHELL_TESTS: ${{ needs.shell-script-tests.result }}
        run: |
          set -euo pipefail
          # Only three `success` results count as success; any other result
          # value (e.g. failure, cancelled, skipped) is reported as failure.
          if [ "$VALIDATE" = "success" ] && [ "$PYTEST" = "success" ] && [ "$SHELL_TESTS" = "success" ]; then
            STATE="success"; ICON=":white_check_mark:"
          else
            STATE="failure"; ICON=":x:"
          fi
          # Publish both values as step outputs for the steps below.
          {
            echo "state=$STATE"
            echo "icon=$ICON"
          } >> "$GITHUB_OUTPUT"
      - name: Post to #oss-alerts
        if: steps.state.outputs.state == 'failure' && env.SLACK_WEBHOOK != ''
        uses: slackapi/slack-github-action@b0fa283ad8fea605de13dc3f449259339835fc52 # v2.1.0
        with:
          webhook: ${{ secrets.SLACK_WEBHOOK_OSS_ALERTS }}
          webhook-type: incoming-webhook
          # Newlines are injected via fromJSON('"\n"') (a real LF char) as {8},
          # NOT a literal '\n' in the template: GitHub Actions expression string
          # literals do not interpret backslash escapes, so a literal '\n' would
          # survive toJSON as the two chars \\n and Slack would render it
          # verbatim as "\n" instead of a line break.
          payload: |
            {
              "text": ${{ toJSON(format(
                '{0} *showcase_validate failed on {1}*{8}validate={2} python-unit-tests={3} shell-script-tests={4}{8}<{5}/{6}/actions/runs/{7}|View run>',
                steps.state.outputs.icon,
                github.ref,
                needs.validate.result,
                needs.python-unit-tests.result,
                needs.shell-script-tests.result,
                github.server_url,
                github.repository,
                github.run_id,
                fromJSON('"\n"')
              )) }}
            }
      - name: Log (no Slack — webhook unset)
        if: steps.state.outputs.state == 'failure' && env.SLACK_WEBHOOK == ''
        env:
          # Passed through env rather than interpolated into the script text.
          REF: ${{ github.ref }}
        run: |
          echo "::warning::showcase_validate failed on $REF but SLACK_WEBHOOK_OSS_ALERTS is not set; no Slack notification sent."