forked from CopilotKit/CopilotKit
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathserver.ts
More file actions
180 lines (172 loc) · 7.95 KB
/
Copy pathserver.ts
File metadata and controls
180 lines (172 loc) · 7.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
import { Hono } from "hono";
import type { PbClient } from "../storage/pb-client.js";
import type { Logger } from "../types/index.js";
import type { TypedEventBus } from "../events/event-bus.js";
import type { HarnessRole } from "../fleet/role-config.js";
import { registerDeployWebhook } from "./webhooks/deploy.js";
import { registerProbesRoutes, type ProbesRouteDeps } from "./probes.js";
import { renderPrometheus, type MetricsRegistry } from "./metrics.js";
/**
 * Collaborators and liveness callbacks consumed by `buildServer`.
 *
 * Required members are listed first. Every optional member either switches
 * on an extra route or refines the verdict returned by `/health`.
 */
export interface ServerDeps {
  /** Client whose `health()` result drives the `pb: ok|down` field of `/health`. */
  pb: PbClient;

  /** Logger forwarded to the deploy webhook registration. */
  logger: Logger;

  /**
   * Current number of probe rules, echoed as `rules` in the `/health` body.
   * Unless `role` is "control-plane", a count that is not greater than zero
   * degrades `/health` to 503.
   */
  ruleCount: () => number;

  /**
   * Number of entries currently registered with the scheduler, echoed as
   * `schedulerJobs` in the `/health` body. A count that is not greater than
   * zero yields `loop: "no-jobs"` and a 503: an HTTP server that is up with
   * no cron jobs means the rule loader crashed silently (or loaded nothing)
   * and no probe will ever tick.
   *
   * REQUIRED on purpose (fail-loud). An earlier optional signature fell back
   * to "OK by default" whenever a caller forgot to wire it, hiding exactly
   * the misconfiguration this signal exists to expose. `buildServer` also
   * throws at runtime when this is not a function. Production wires it in
   * `boot()` (orchestrator.ts); test harnesses must pass a stub such as
   * `() => 1`.
   */
  schedulerJobCount: () => number;

  /**
   * Service role this `/health` surface represents. When omitted, the
   * "worker" behaviour applies (the legacy in-process harness): probe rules
   * are its unit of work, so `/health` insists on `ruleCount > 0`.
   *
   * The "control-plane" role is a scheduler/queue/aggregator that
   * legitimately owns NO probe rules (only the single fleet-job-producer
   * scheduler entry). For that role the rules gate is skipped and liveness
   * rests on pb plus the scheduler signals (`schedulerJobCount`,
   * `schedulerStarted`, `loopAlive`). Without this exemption the
   * control-plane container would report degraded/503 forever and Railway
   * would restart-loop it.
   *
   * Typed through the `HarnessRole` SSOT (fleet/role-config.ts) so new roles
   * flow in from one place instead of drifting from an inline literal union.
   */
  role?: HarnessRole;

  /**
   * Legacy liveness flag. Historically it only mirrored whether
   * `orchestrator.stop()` had been called and never reflected real
   * scheduler/probe-loop liveness. Retained as an optional knob for
   * backwards compatibility: when it is absent the loop is treated as alive,
   * and the `loop` field is still reported based on the other signals.
   * When it returns `false`, `/health` reports `loop: "stopped"`.
   */
  loopAlive?: () => boolean;

  /**
   * Returns `true` once the scheduler has been started. While it returns
   * `false` (and the scheduler is not stopped) `/health` reports
   * `loop: "starting"` with a 503, which covers the boot window between
   * server-listen and scheduler-start where a crashed scheduler would
   * otherwise stay invisible. Treated as started when absent.
   */
  schedulerStarted?: () => boolean;

  /**
   * Returns `true` once `scheduler.stop()` has completed. Takes priority
   * over every other loop signal: `/health` answers 503 with
   * `loop: "stopped"`, closing the post-shutdown window in which the weaker
   * `loopAlive` flag alone could still look healthy. Treated as not stopped
   * when absent.
   */
  schedulerIsStopped?: () => boolean;

  /** Event bus for webhook emissions. Optional so older callers (tests) keep working. */
  bus?: TypedEventBus;

  /**
   * HMAC secrets for signed webhooks. The deploy webhook is registered only
   * when `bus` is supplied and this list contains at least one secret.
   */
  webhookSecrets?: string[];

  /**
   * Metrics registry. When provided, `/metrics` serves its Prometheus text
   * rendering; it is also handed to the deploy webhook registration.
   */
  metrics?: MetricsRegistry;

  /**
   * Optional `/api/probes` wiring. When supplied, the probe routes
   * (list / detail / trigger) are mounted via `registerProbesRoutes`; when
   * absent they are not registered, so setups that only need `/health`
   * do not have to thread the full scheduler through. The orchestrator
   * always supplies this in production.
   */
  probes?: ProbesRouteDeps;
}
/**
 * Assembles the Hono application that exposes this service's HTTP surface.
 *
 * Routes are registered in a fixed order: the signed deploy webhook (only
 * with a bus and at least one secret), the probe routes (only with
 * `deps.probes`), `/metrics` (only with `deps.metrics`), and finally
 * `/health`, which is always present.
 *
 * @param deps - Collaborators and liveness callbacks; see `ServerDeps`.
 * @returns The configured Hono app.
 * @throws Error when `deps.schedulerJobCount` is not a function at runtime.
 */
export function buildServer(deps: ServerDeps): Hono {
  // The type already demands `schedulerJobCount`, yet loosely typed or
  // dynamically assembled call sites can still hand over `undefined`.
  // Refusing to build beats a default-OK fallback, which is how /health
  // once answered 200 while zero cron jobs were registered.
  if (typeof deps.schedulerJobCount !== "function") {
    throw new Error(
      "buildServer: schedulerJobCount callback is required. " +
        "Wire it from the scheduler (e.g. () => scheduler.getJobCount()) " +
        "so /health can fail loud when the rule loader produces zero entries.",
    );
  }

  const app = new Hono();

  // Signed deploy webhook: needs both a bus to emit on and a secret to verify with.
  const secrets = deps.webhookSecrets ?? [];
  if (deps.bus && secrets.length > 0) {
    registerDeployWebhook(app, {
      bus: deps.bus,
      logger: deps.logger,
      secrets,
      metrics: deps.metrics,
    });
  }

  if (deps.probes) {
    registerProbesRoutes(app, deps.probes);
  }

  // SECURITY NOTE: `/metrics` is deliberately unauthenticated so in-cluster
  // Prometheus scrapers need no credential plumbing. Exposed to the public
  // internet it would leak internal counters (probe cadence, alert volume,
  // HMAC failure rate), so it must then be locked down (private network
  // ACL, reverse-proxy basic auth, or token auth). Tracked as a post-v1
  // hardening item; until then operators must keep this service behind
  // Railway's private network.
  const registry = deps.metrics;
  if (registry) {
    app.get("/metrics", (c) =>
      c.body(renderPrometheus(registry), 200, {
        "Content-Type": "text/plain; version=0.0.4",
      }),
    );
  }

  app.get("/health", async (c) => {
    // Every signal is sampled on each request. Optional callbacks fall back
    // to the healthy value when they were not wired.
    const pbOk = await deps.pb.health();
    const ruleCount = deps.ruleCount();
    const alive = deps.loopAlive?.() ?? true;
    const started = deps.schedulerStarted?.() ?? true;
    const stopped = deps.schedulerIsStopped?.() ?? false;
    const jobCount = deps.schedulerJobCount();

    // The loop is healthy only when no degraded label applies; zero
    // scheduler jobs counts as degraded because nothing would ever tick.
    const loopLabel = loopStateLabel({
      stopped,
      started,
      alive,
      hasJobs: jobCount > 0,
    });
    const loopOk = loopLabel === "ok";

    // Role-aware rules gate. Workers (the default) treat probe rules as
    // their unit of work, so zero rules means the rule loader failed. The
    // control-plane owns no probe rules; its liveness is already covered by
    // the scheduler signals above, and demanding rules > 0 would keep it
    // degraded forever and make Railway restart-loop it.
    const rulesOk = deps.role === "control-plane" || ruleCount > 0;

    const healthy = pbOk && loopOk && rulesOk;
    const payload = {
      status: healthy ? "ok" : "degraded",
      pb: pbOk ? "ok" : "down",
      loop: loopLabel,
      rules: ruleCount,
      schedulerJobs: jobCount,
    };
    return c.json(payload, healthy ? 200 : 503);
  });

  return app;
}

/**
 * Maps the scheduler signals onto the `loop` label reported by `/health`.
 *
 * Checks run in priority order, first match wins:
 * stopped > not started > not alive > no jobs > ok.
 */
function loopStateLabel(signals: {
  stopped: boolean;
  started: boolean;
  alive: boolean;
  hasJobs: boolean;
}): "ok" | "starting" | "stopped" | "no-jobs" {
  if (signals.stopped) return "stopped";
  if (!signals.started) return "starting";
  if (!signals.alive) return "stopped";
  if (!signals.hasJobs) return "no-jobs";
  return "ok";
}