forked from CopilotKit/CopilotKit
-
Notifications
You must be signed in to change notification settings - Fork 0
370 lines (355 loc) · 17.8 KB
/
Copy pathshowcase_deploy.yml
File metadata and controls
370 lines (355 loc) · 17.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
name: "Showcase: Verify Deploy"

# Staging gate. Runs once "Showcase: Build & Push" has completed and checks
# that the STAGING redeploy performed by that build actually left the
# services healthy (a push to main redeploys staging only). The probe is
# verify-deploy.ts, parameterized from the SSOT in
# showcase/scripts/railway-envs.ts.
on:
  workflow_run:
    workflows:
      - "Showcase: Build & Push"
    types:
      - completed
    branches:
      - main
  workflow_dispatch:
    inputs:
      service:
        description: "Service to verify (SSOT key or dispatch_name; 'all' = everything probe-eligible)"
        required: false
        default: "all"
        type: string

# A single concurrency group: a newer run cancels one still in progress.
concurrency:
  group: showcase-verify-deploy
  cancel-in-progress: true

# Workflow-wide default; individual jobs widen this where they need to.
permissions:
  contents: read

jobs:
resolve-matrix:
  # Decides which services the `verify` job probes and whether the staging
  # redeploy gate is red. Publishes:
  #   services_csv / has_services  - from the "Build verify matrix" step
  #   redeploy_red / ok_services / failed_services - from the redeploy gate
  #   build_run_id / build_run_url - the triggering build run (empty on
  #                                  workflow_dispatch)
  runs-on: ubuntu-latest
  timeout-minutes: 5
  permissions:
    contents: read
    actions: read
  # Manual dispatches always run; workflow_run triggers only after a
  # successful build.
  if: >-
    github.event_name == 'workflow_dispatch' ||
    github.event.workflow_run.conclusion == 'success'
  outputs:
    services_csv: ${{ steps.matrix.outputs.services_csv }}
    has_services: ${{ steps.matrix.outputs.has_services }}
    build_run_id: ${{ github.event.workflow_run.id }}
    build_run_url: ${{ github.event.workflow_run.html_url }}
    redeploy_red: ${{ steps.redeploy-gate.outputs.redeploy_red }}
    ok_services: ${{ steps.redeploy-gate.outputs.ok_services }}
    failed_services: ${{ steps.redeploy-gate.outputs.failed_services }}
  steps:
    - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
      with:
        persist-credentials: false
    - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
      with:
        node-version: 22.x
    - name: Check whether build uploaded a redeploy-summary artifact
      id: check-redeploy-summary
      # `actions/download-artifact@v4` with a `name:` HARD-FAILS when the
      # named artifact does not exist. The artifact legitimately does
      # NOT exist whenever the upstream build ran but redeployed nothing
      # — e.g. a push touching `showcase/**` that fires the build's
      # `paths:` filter, but `detect-changes` finds no buildable service
      # changed, so `redeploy-staging` is skipped and never uploads.
      # The build still concludes `success`, so this workflow fires on
      # `workflow_run` and `resolve-matrix` runs. Without this pre-check
      # the unguarded download would fail the job and (via
      # `enforce-redeploy-gate` tripping on `result == 'failure'`) flip
      # the whole deploy workflow RED — a false-red on a routine
      # showcase-docs/script-only change. Listing artifacts via the API
      # only requires `actions: read`, which `resolve-matrix` already has.
      if: github.event_name == 'workflow_run'
      uses: actions/github-script@f28e40c7f34bde8b3046d885e986cb6290c5673b # v7
      with:
        script: |
          // Fail loud on API error: github-script propagates unhandled
          // rejections, which fails this step and (via the
          // resolve-matrix.result == 'failure' clause on
          // enforce-redeploy-gate) reds the workflow. Do NOT wrap this
          // in try/catch — silently defaulting summary_present=false on
          // a 5xx/403 would open the gate (skip verify) on what is
          // actually a transient API failure, hiding a broken pipeline.
          //
          // Use the `name` query-param on listWorkflowRunArtifacts to
          // ask the API to return only the redeploy-summary artifact.
          // This makes the lookup robust to the build run uploading
          // many artifacts (per-slot build-result-* + build-results +
          // redeploy-summary — already ~28 today, well within per_page
          // 100, but a future expansion past 100 would otherwise risk a
          // false "absent" if redeploy-summary fell off the first page).
          // The endpoint accepts `name` for an exact-match filter; we
          // still paginate defensively in case the API returns multiple
          // rows (e.g. an artifact with the same name re-uploaded).
          const runId = context.payload.workflow_run.id;
          const iterator = github.paginate.iterator(
            github.rest.actions.listWorkflowRunArtifacts,
            {
              owner: context.repo.owner,
              repo: context.repo.repo,
              run_id: runId,
              name: "redeploy-summary",
              per_page: 100,
            },
          );
          let present = false;
          for await (const page of iterator) {
            if ((page.data || []).some((a) => a.name === "redeploy-summary")) {
              present = true;
              break;
            }
          }
          core.setOutput("summary_present", present ? "true" : "false");
          core.info(`redeploy-summary present for run ${runId}: ${present}`);
    - name: Download redeploy summary from build workflow
      # Three cases now handled distinctly:
      #   (a) workflow_dispatch — no `workflow_run` payload exists; the
      #       download is skipped and the bash `[ ! -f "$SUMMARY" ]`
      #       branch below treats it as "nothing to gate" (correct: a
      #       manual dispatch is not gated by a build's per-service set).
      #   (b) workflow_run + artifact absent — the upstream build
      #       redeployed nothing (e.g. no service had buildable
      #       changes); the precheck reports `summary_present=false`,
      #       the download is skipped, and the bash branch no-ops the
      #       gate. The deploy workflow should NOT red here — there is
      #       nothing to gate.
      #   (c) workflow_run + artifact PRESENT — we always attempt the
      #       download. We intentionally do NOT set
      #       `continue-on-error: true`: if the artifact exists but the
      #       download genuinely fails (network/permission), silently
      #       opening the gate would let verify probe the FULL service
      #       set against stale `:latest` and mask a broken redeploy as
      #       a green deploy. Fail loud instead.
      if: >-
        github.event_name == 'workflow_run' &&
        steps.check-redeploy-summary.outputs.summary_present == 'true'
      uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4
      with:
        name: redeploy-summary
        path: .redeploy
        run-id: ${{ github.event.workflow_run.id }}
        github-token: ${{ secrets.GITHUB_TOKEN }}
    - name: Gate on staging redeploy errors
      id: redeploy-gate
      run: |
        set -euo pipefail
        SUMMARY=".redeploy/summary.json"
        if [ ! -f "$SUMMARY" ]; then
          echo "No redeploy summary found (workflow_dispatch path or build did not upload). Skipping gate."
          {
            echo "redeploy_red=false"
            echo "ok_services="
            echo "failed_services="
          } >> "$GITHUB_OUTPUT"
          exit 0
        fi
        # Shape guard: redeploy-env.ts writes per-entry `status` of
        # exactly "ok" or "error", plus the `service` name that every
        # query below reads. If the schema ever drifts (e.g.
        # `status`→`state`, `ok`→`success`) every `select(.status==...)`
        # silently yields empty → redeploy_red=false AND ok_services=""
        # → resolve-verify-matrix skips verify on a real unverified
        # redeploy = green CI on a broken release. A drifted or missing
        # `service` is the same hazard by another route: `join(",")`
        # renders null names as empty strings, so ok_services becomes
        # "," and matches nothing probe-eligible, and verify is skipped
        # with redeploy_red=false. Refuse the ambiguity: if the file has
        # entries but ANY entry lacks a valid ok|error status or a
        # non-empty string service (partial drift included), fail loud
        # here so enforce-redeploy-gate reds the workflow
        # (resolve-matrix.result == 'failure' fans into the gate).
        TOTAL=$(jq 'length' "$SUMMARY")
        # `and` short-circuits in jq, so `length` is only evaluated on
        # a `.service` already known to be a string.
        VALID=$(jq '[.[] | select((.status == "ok" or .status == "error") and (.service | type == "string" and length > 0))] | length' "$SUMMARY")
        if [ "$TOTAL" -gt 0 ] && [ "$VALID" -lt "$TOTAL" ]; then
          echo "::error::summary.json shape drift: $VALID of $TOTAL entries have status ok|error and a non-empty string service"
          exit 1
        fi
        # Per spec §3: the workflow MUST turn red on any staging
        # status:"error", while verify still runs against the success-set.
        ERRORS=$(jq -c '[.[] | select(.status == "error")]' "$SUMMARY")
        ERROR_COUNT=$(echo "$ERRORS" | jq 'length')
        OK=$(jq -r '[.[] | select(.status == "ok") | .service] | join(",")' "$SUMMARY")
        FAILED=$(jq -r '[.[] | select(.status == "error") | .service] | join(",")' "$SUMMARY")
        echo "ok_services=$OK" >> "$GITHUB_OUTPUT"
        echo "failed_services=$FAILED" >> "$GITHUB_OUTPUT"
        if [ "$ERROR_COUNT" -gt 0 ]; then
          echo "::error::Staging redeploy reported $ERROR_COUNT per-service error(s):"
          echo "$ERRORS" | jq -r '.[] | "  - \(.service): \(.error)"'
          echo "redeploy_red=true" >> "$GITHUB_OUTPUT"
        else
          echo "redeploy_red=false" >> "$GITHUB_OUTPUT"
        fi
    - name: Build verify matrix from SSOT
      id: matrix
      env:
        DISPATCH_SERVICE: ${{ github.event.inputs.service }}
        OK_FROM_REDEPLOY: ${{ steps.redeploy-gate.outputs.ok_services }}
        EVENT_NAME: ${{ github.event_name }}
        SUMMARY_PRESENT: ${{ steps.check-redeploy-summary.outputs.summary_present }}
      # The decision-table that picks the verify matrix lives in
      # showcase/scripts/resolve-verify-matrix.ts (pure function +
      # unit tests). Summary of cases:
      #   - workflow_dispatch + 'all'/empty → full probe-eligible set.
      #   - workflow_dispatch + specific svc → just that service
      #     (unknown name → error exit).
      #   - workflow_run + summary_present=false → has_services=false
      #     (build redeployed nothing).
      #   - workflow_run + summary_present=true + ok empty
      #     → has_services=false (success-set empty; verify is skipped).
      #     In practice redeploy-env.ts only emits status ok|error,
      #     so this branch implies redeploy_red=true and
      #     enforce-redeploy-gate reds the workflow independently —
      #     skipping verify here is correct (no ok services left to
      #     probe; the gate has already turned the workflow red).
      #   - workflow_run + summary_present=true + ok non-empty
      #     → intersect ok_services (SSOT key OR dispatchName aliases)
      #     with probe.staging-eligible SSOT services. has_services
      #     reflects CSV emptiness. When the intersection collapses
      #     to empty, verify is skipped; if there were per-service
      #     errors, enforce-redeploy-gate reds the workflow — otherwise
      #     the run is correctly green (every redeploy succeeded, just
      #     none probe-eligible).
      run: npx tsx showcase/scripts/resolve-verify-matrix.ts
verify:
  # Probes the staging services selected by resolve-matrix. Skipped entirely
  # when resolve-matrix reports an empty matrix.
  needs:
    - resolve-matrix
  if: needs.resolve-matrix.outputs.has_services == 'true'
  runs-on: ubuntu-latest
  timeout-minutes: 20
  environment: railway
  permissions:
    contents: read
    actions: read
  steps:
    - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
      with:
        persist-credentials: false
    - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.4.0
      with:
        node-version: 22.x
    - name: Install
      working-directory: showcase/scripts
      run: npm ci
    - name: Run verify-deploy --env staging
      env:
        RAILWAY_TOKEN: ${{ secrets.RAILWAY_TOKEN }}
        SERVICES_CSV: ${{ needs.resolve-matrix.outputs.services_csv }}
      run: |
        # Without a token there is nothing to probe with: report an error
        # annotation and fail the step instead of invoking the script.
        if [ -n "$RAILWAY_TOKEN" ]; then
          npx tsx showcase/scripts/verify-deploy.ts --env staging --services "$SERVICES_CSV"
        else
          echo "::error::RAILWAY_TOKEN is not set"
          exit 1
        fi
enforce-redeploy-gate:
  # Spec §3: workflow turns red on any per-service redeploy error, even
  # if verify against the success-set passes. This job is independent of
  # verify so the user sees both signals (what was redeployed badly,
  # what was redeployed and is unhealthy) rather than one masking the other.
  needs: [resolve-matrix]
  # Trip the gate on EITHER a per-service redeploy error OR a complete
  # resolve-matrix failure. A resolve-matrix failure leaves the
  # `redeploy_red` output empty (jobs that fail mid-step don't publish
  # outputs reliably), which would otherwise let an upstream crash slip
  # past as "not red" — a silent bypass of the gate.
  if: always() && (needs.resolve-matrix.outputs.redeploy_red == 'true' || needs.resolve-matrix.result == 'failure')
  runs-on: ubuntu-latest
  timeout-minutes: 2
  permissions:
    contents: read
  steps:
    - name: Fail workflow on staging redeploy errors
      env:
        REDEPLOY_RED: ${{ needs.resolve-matrix.outputs.redeploy_red }}
        RESOLVE_RESULT: ${{ needs.resolve-matrix.result }}
      run: |
        set -euo pipefail
        # The job-level `if` admits two independent triggers. Report the
        # one(s) that actually fired so a resolve-matrix crash is not
        # mislabelled as a per-service redeploy error. Both can be true
        # when the gate step went red and a later step then failed.
        if [ "${REDEPLOY_RED:-}" = "true" ]; then
          echo "::error::One or more staging services reported status:error in the redeploy summary."
          echo "See the resolve-matrix job's 'Gate on staging redeploy errors' step for details."
        fi
        if [ "${RESOLVE_RESULT:-}" = "failure" ]; then
          echo "::error::The resolve-matrix job failed; the staging redeploy state could not be verified."
          echo "See the resolve-matrix job log for the failing step."
        fi
        # Reaching this step at all means the gate tripped: always fail.
        exit 1
notify-harness:
  # Reports this run's outcome to showcase-harness through a signed webhook.
  # `always()` keeps it running after upstream failures, but it is skipped
  # when resolve-matrix produced an empty verify matrix.
  needs:
    - resolve-matrix
    - verify
    - enforce-redeploy-gate
  if: always() && needs.resolve-matrix.outputs.has_services == 'true'
  permissions:
    contents: read
    actions: read
  runs-on: ubuntu-latest
  timeout-minutes: 3
  steps:
    - name: Compute deploy-result payload
      id: payload
      env:
        VERIFY_RESULT: ${{ needs.verify.result }}
        OK_SERVICES: ${{ needs.resolve-matrix.outputs.ok_services }}
        FAILED_SERVICES: ${{ needs.resolve-matrix.outputs.failed_services }}
        BUILD_RUN_ID: ${{ needs.resolve-matrix.outputs.build_run_id }}
        BUILD_RUN_URL: ${{ needs.resolve-matrix.outputs.build_run_url }}
        RUN_ID: ${{ github.run_id }}
        RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
      # The payload follows the harness ingest schema at
      # showcase/harness/src/http/webhooks/deploy.ts (`.strict()`):
      #   {runId, runUrl, buildRunId?, buildRunUrl?, services[], succeeded[], failed[], cancelled}
      # There is deliberately no `state` key: the schema rejects unknown
      # fields, and state is derived downstream from
      # (failed.length, cancelled, succeeded).
      #   - succeeded = ok_services from the redeploy gate
      #   - failed    = failed_services from the redeploy gate
      #   - services  = union (full attempted set, per deploy.ts:307)
      #   - cancelled = verify job conclusion == "cancelled"
      run: |
        set -euo pipefail

        # Turn a CSV string into a JSON array of its non-empty fields.
        # `echo` (not printf '%s') is deliberate: an empty CSV still
        # yields a newline, so jq -R reads one empty record, split gives
        # [""], and the filter reduces that to []. Without the trailing
        # newline jq sees no records and --argjson rejects the empty
        # string.
        csv_to_json() {
          echo "$1" | jq -R 'split(",") | map(select(length>0))'
        }
        succeeded_json=$(csv_to_json "${OK_SERVICES:-}")
        failed_json=$(csv_to_json "${FAILED_SERVICES:-}")

        case "$VERIFY_RESULT" in
          cancelled) cancelled_json="true" ;;
          *) cancelled_json="false" ;;
        esac

        # buildRunId/buildRunUrl are .url()-validated in the harness Zod
        # schema, so an empty value must be left out entirely rather
        # than sent as "" (which fails .url()). runUrl is optional in
        # the schema and gets the same treatment. `opt` yields {} for an
        # empty value, so the base object (runId, services, succeeded,
        # failed, cancelled) only gains the keys that have content.
        payload_json=$(jq -cn \
          --arg runId "$RUN_ID" \
          --arg runUrl "${RUN_URL:-}" \
          --arg buildRunId "${BUILD_RUN_ID:-}" \
          --arg buildRunUrl "${BUILD_RUN_URL:-}" \
          --argjson succeeded "$succeeded_json" \
          --argjson failed "$failed_json" \
          --argjson cancelled "$cancelled_json" \
          '
          def opt(k; v): if v == "" then {} else {(k): v} end;
          {
            runId: $runId,
            services: ($succeeded + $failed | unique),
            succeeded: $succeeded,
            failed: $failed,
            cancelled: $cancelled
          }
          + opt("runUrl"; $runUrl)
          + opt("buildRunId"; $buildRunId)
          + opt("buildRunUrl"; $buildRunUrl)
          ')

        # Publish as a multi-line step output using the heredoc form.
        printf 'payload<<EOF_PAYLOAD\n%s\nEOF_PAYLOAD\n' "$payload_json" >> "$GITHUB_OUTPUT"
    - name: POST deploy result to showcase-harness
      env:
        SHOWCASE_HARNESS_URL: ${{ secrets.SHOWCASE_HARNESS_URL }}
        SHARED_SECRET: ${{ secrets.SHOWCASE_HARNESS_SHARED_SECRET }}
        PAYLOAD: ${{ steps.payload.outputs.payload }}
      run: |
        set -euo pipefail

        # Both secrets are needed to sign and deliver; when either is
        # missing, warn and finish successfully without posting.
        if [ -z "${SHOWCASE_HARNESS_URL:-}" ] || [ -z "${SHARED_SECRET:-}" ]; then
          echo "::warning::SHOWCASE_HARNESS_URL or SHOWCASE_HARNESS_SHARED_SECRET not set; skipping webhook"
          exit 0
        fi

        # Strip one trailing slash from the base URL before appending
        # the route.
        endpoint="${SHOWCASE_HARNESS_URL%/}/webhooks/deploy"

        # Signature = HMAC-SHA256 (keyed with the shared secret) over
        # "POST|/webhooks/deploy|<unix timestamp>|<sha256 of the body>".
        issued_at=$(date +%s)
        body_digest=$(printf '%s' "$PAYLOAD" | openssl dgst -sha256 -hex | awk '{print $2}')
        string_to_sign="POST|/webhooks/deploy|${issued_at}|${body_digest}"
        signature=$(printf '%s' "$string_to_sign" | openssl dgst -sha256 -hmac "$SHARED_SECRET" -hex | awk '{print $2}')

        curl -sS --fail-with-body \
          -X POST "$endpoint" \
          -H 'content-type: application/json' \
          -H "X-Ops-Timestamp: ${issued_at}" \
          -H "X-Ops-Signature: sha256=${signature}" \
          --data-raw "$PAYLOAD"