proxy-model-validation.ts
#!/usr/bin/env bun
/**
 * Proxy Model Validation Suite
 *
 * Tests all models through 3 dimensions:
 *   1. Direct (upstream) — baseline, what the API actually supports
 *   2. Proxy /v1/messages — Anthropic-format input, proxy autoroutes
 *   3. Proxy /v1/chat/completions — OpenAI-format input, proxy autoroutes
 *
 * Also runs proxy-specific translation tests and capability probes.
 *
 * Usage:
 *   bun run scripts/proxy-model-validation.ts
 *   bun run scripts/proxy-model-validation.ts --quick          # text-only smoke
 *   bun run scripts/proxy-model-validation.ts --models claude  # filter by name
 *   bun run scripts/proxy-model-validation.ts --capabilities   # run capability probes
 *   bun run scripts/proxy-model-validation.ts --all            # everything
 *
 * Requires: proxy running on localhost:4141
 */
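// Reports are written to <script dir>/reports/: proxy-validation-latest.md, plus
// capability-matrix.json when capability probes run (see main() below).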
import { mkdir, writeFile } from "fs/promises"
import { join } from "path"
import {
  type CopilotModel,
  type ModelProfile,
  type TestResult,
  getJwt,
  fetchModels,
  filterChatModels,
  classifyModel,
  copilotHeaders,
  proxyHeaders,
  fmtMs,
  COPILOT_API_BASE_URL,
  PROXY_URL,
} from "./lib/copilot-test-lib"
// ── CLI args ─────────────────────────────────────────────────────────────────
const args = new Set(process.argv.slice(2))
const quickMode = args.has("--quick")
const runCapabilities = args.has("--capabilities") || args.has("--all")
const modelFilter =
  process.argv.find((a) => a.startsWith("--models="))?.split("=")[1] ??
  (args.has("--models")
    ? process.argv[process.argv.indexOf("--models") + 1]
    : undefined)
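// Both spellings work: "--models=claude" and "--models claude" (in the
// space-separated form the value is read from the next argv slot).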
// ── Types ────────────────────────────────────────────────────────────────────
interface BlameResult {
  model: string
  test: string
  direct: "pass" | "fail" | "skip"
  proxyMessages: "pass" | "fail" | "skip"
  proxyChat: "pass" | "fail" | "skip"
  blame: "ok" | "upstream" | "proxy-bug" | "proxy-fix"
  detail?: string
  durationMs: number
}
interface CapabilityProbe {
  model: string
  probe: string
  status: "pass" | "fail" | "skip"
  detail?: string
}
// ── Helpers ──────────────────────────────────────────────────────────────────
const PROXY_MSG_URL = PROXY_URL + "/v1/messages"
const PROXY_CHAT_URL = PROXY_URL + "/v1/chat/completions"
function chatHeaders(): Record<string, string> {
  return { "Content-Type": "application/json" }
}
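// Note: no auth header here on purpose; the local proxy is assumed to attach
// Copilot credentials itself, so Content-Type is all the client sends.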
// safeFetch never throws: network errors, the 30s timeout, and non-JSON bodies
// are all folded into the result object so callers can treat outcomes uniformly.
async function safeFetch(
  url: string,
  opts: RequestInit,
): Promise<{ ok: boolean; status: number; body: string; json: unknown }> {
  try {
    const res = await fetch(url, { ...opts, signal: AbortSignal.timeout(30000) })
    const body = await res.text()
    let json: unknown
    try {
      json = JSON.parse(body)
    } catch {
      json = null
    }
    return { ok: res.ok, status: res.status, body, json }
  } catch (e) {
    return { ok: false, status: 0, body: String(e), json: null }
  }
}
// Pull the first assistant text out of whichever response shape came back.
function extractText(json: unknown): string {
  const d = json as Record<string, unknown>
  // Anthropic: content[0].text
  const content = d?.content as Array<{ type: string; text?: string }> | undefined
  if (content?.[0]?.text) return content[0].text
  // OpenAI: choices[0].message.content
  const choices = d?.choices as Array<{ message?: { content?: string } }> | undefined
  if (choices?.[0]?.message?.content) return choices[0].message.content
  // Responses: first "message" item in output[]
  const output = d?.output as Array<{ type: string; content?: Array<{ text?: string }> }> | undefined
  const msg = output?.find((o) => o.type === "message")
  if (msg?.content?.[0]?.text) return msg.content[0].text
  return ""
}
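// Illustrative (abbreviated) success payloads matched by the branches above:
//   Anthropic:  { "content": [{ "type": "text", "text": "PONG" }] }
//   OpenAI:     { "choices": [{ "message": { "content": "PONG" } }] }
//   Responses:  { "output": [{ "type": "message", "content": [{ "text": "PONG" }] }] }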
// Detect a tool invocation in any of the three response formats.
function hasToolUse(json: unknown): boolean {
  const d = json as Record<string, unknown>
  const content = d?.content as Array<{ type: string }> | undefined
  if (content?.some((c) => c.type === "tool_use")) return true
  const choices = d?.choices as Array<{ message?: { tool_calls?: unknown[] } }> | undefined
  if ((choices?.[0]?.message?.tool_calls?.length ?? 0) > 0) return true
  const output = d?.output as Array<{ type: string }> | undefined
  if (output?.some((o) => o.type === "function_call")) return true
  return false
}
// ── Test runners ─────────────────────────────────────────────────────────────
async function testText(
  profile: ModelProfile,
  url: string,
  headers: Record<string, string>,
  format: "anthropic" | "openai" | "native",
): Promise<TestResult> {
  const start = Date.now()
  // Anthropic, OpenAI chat, and native /v1/messages bodies coincide for this
  // probe; only the native /responses endpoint needs a different shape.
  const body: Record<string, unknown> =
    format === "native" && profile.endpoint === "/responses"
      ? { model: profile.id, max_output_tokens: 16, input: [{ type: "message", role: "user", content: "Say PONG" }] }
      : { model: profile.id, max_tokens: 16, messages: [{ role: "user", content: "Say PONG" }] }
  const result = await safeFetch(url, { method: "POST", headers, body: JSON.stringify(body) })
  const text = extractText(result.json)
  const status = result.ok && text.length > 0 ? "pass" : "fail"
  return {
    test: "text",
    model: profile.id,
    endpoint: profile.endpoint,
    status,
    detail: status === "fail" ? `${result.status}: ${result.body.slice(0, 60)}` : undefined,
    durationMs: Date.now() - start,
  }
}
async function testTools(
  profile: ModelProfile,
  url: string,
  headers: Record<string, string>,
  format: "anthropic" | "openai" | "native",
): Promise<TestResult> {
  if (!profile.toolSupport) {
    return { test: "tools", model: profile.id, endpoint: profile.endpoint, status: "skip", detail: "no tool support", durationMs: 0 }
  }
  const start = Date.now()
  const prompt = 'Call the ping tool with msg "hello"'
  const schema = { type: "object", properties: { msg: { type: "string" } }, required: ["msg"] }
  const anthropicTool = { name: "ping", description: "Returns pong", input_schema: schema }
  const openaiTool = { type: "function", function: { name: "ping", description: "Returns pong", parameters: schema } }
  const responsesTool = { type: "function", name: "ping", description: "Returns pong", parameters: schema }
  let body: Record<string, unknown>
  if (format === "native" && profile.endpoint === "/responses") {
    body = { model: profile.id, max_output_tokens: 100, input: [{ type: "message", role: "user", content: prompt }], tools: [responsesTool] }
  } else if (format === "anthropic" || (format === "native" && profile.endpoint === "/v1/messages")) {
    body = { model: profile.id, max_tokens: 100, messages: [{ role: "user", content: prompt }], tools: [anthropicTool] }
  } else {
    body = { model: profile.id, max_tokens: 100, messages: [{ role: "user", content: prompt }], tools: [openaiTool] }
  }
  const result = await safeFetch(url, { method: "POST", headers, body: JSON.stringify(body) })
  const tools = result.ok && hasToolUse(result.json)
  return {
    test: "tools",
    model: profile.id,
    endpoint: profile.endpoint,
    status: tools ? "pass" : "fail",
    detail: tools ? undefined : `no tool_use (${result.status})`,
    durationMs: Date.now() - start,
  }
}
async function testStream(
  profile: ModelProfile,
  url: string,
  headers: Record<string, string>,
  format: "anthropic" | "openai" | "native",
): Promise<TestResult> {
  const start = Date.now()
  // As in testText, only the native /responses endpoint needs its own shape.
  const body: Record<string, unknown> =
    format === "native" && profile.endpoint === "/responses"
      ? { model: profile.id, max_output_tokens: 16, input: [{ type: "message", role: "user", content: "Say hi" }], stream: true }
      : { model: profile.id, max_tokens: 16, messages: [{ role: "user", content: "Say hi" }], stream: true }
  const result = await safeFetch(url, { method: "POST", headers, body: JSON.stringify(body) })
  // Minimal SSE presence check: any "data:" line counts; no event parsing.
  const hasEvents = result.body.includes("data:")
  return {
    test: "stream",
    model: profile.id,
    endpoint: profile.endpoint,
    status: hasEvents ? "pass" : "fail",
    detail: hasEvents ? undefined : `no SSE events (${result.status})`,
    durationMs: Date.now() - start,
  }
}
// ── Direct endpoint URL resolver ─────────────────────────────────────────────
function directUrl(profile: ModelProfile): string {
  if (profile.endpoint === "/v1/messages") return COPILOT_API_BASE_URL + "/v1/messages"
  if (profile.endpoint === "/responses") return COPILOT_API_BASE_URL + "/responses"
  return COPILOT_API_BASE_URL + "/chat/completions"
}
function directFormat(profile: ModelProfile): "anthropic" | "openai" | "native" {
  if (profile.endpoint === "/v1/messages") return "native"
  if (profile.endpoint === "/responses") return "native"
  return "openai"
}
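// In short: models pinned to /v1/messages or /responses are exercised in their
// native wire format when hit directly; everything else goes upstream as an
// OpenAI-style /chat/completions request.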
// ── Blame computation ────────────────────────────────────────────────────────
function computeBlame(direct: string, proxy: string): BlameResult["blame"] {
  if (direct === "pass" && proxy === "pass") return "ok"
  if (direct === "fail" && proxy === "fail") return "upstream"
  if (direct === "pass" && proxy === "fail") return "proxy-bug"
  if (direct === "fail" && proxy === "pass") return "proxy-fix"
  return "ok"
}
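// Note: any combination involving "skip" falls through to the final "ok", so
// skipped legs (e.g. tools on a model without tool support) never count as
// proxy bugs or upstream failures in the summary.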
// ── Capability probes ────────────────────────────────────────────────────────
async function probeCapability(
  model: string,
  probe: string,
  url: string,
  headers: Record<string, string>,
  body: Record<string, unknown>,
): Promise<CapabilityProbe> {
  const result = await safeFetch(url, { method: "POST", headers, body: JSON.stringify(body) })
  const text = extractText(result.json)
  if (result.ok && text.length > 0) return { model, probe, status: "pass" }
  return { model, probe, status: "fail", detail: `${result.status}: ${result.body.slice(0, 60)}` }
}
async function runCapabilityProbes(
  jwt: string,
  profiles: ModelProfile[],
): Promise<CapabilityProbe[]> {
  const results: CapabilityProbe[] = []
  const hdrs = copilotHeaders(jwt)
  // Claude models: thinking type probes
  const claudeModels = profiles.filter((p) => p.isClaude)
  for (const p of claudeModels) {
    const url = COPILOT_API_BASE_URL + "/v1/messages"
    const base = { model: p.id, max_tokens: 2048, messages: [{ role: "user", content: "Say PONG" }] }
    console.log(` Probing ${p.id}...`)
    results.push(await probeCapability(p.id, "disabled", url, hdrs, { ...base, thinking: { type: "disabled" } }))
    results.push(await probeCapability(p.id, "enabled+budget", url, hdrs, { ...base, thinking: { type: "enabled", budget_tokens: 1024 } }))
    results.push(await probeCapability(p.id, "adaptive", url, hdrs, { ...base, thinking: { type: "adaptive" } }))
    results.push(await probeCapability(p.id, "adaptive+budget", url, hdrs, { ...base, thinking: { type: "adaptive", budget_tokens: 1024 } }))
    // Effort probes
    results.push(await probeCapability(p.id, "effort:low", url, hdrs, { ...base, output_config: { effort: "low" } }))
    results.push(await probeCapability(p.id, "effort:high", url, hdrs, { ...base, output_config: { effort: "high" } }))
    // Temperature
    results.push(await probeCapability(p.id, "temperature", url, hdrs, { ...base, temperature: 0.5 }))
  }
  // GPT-5.x models: /responses probes
  const gpt5Models = profiles.filter((p) => p.endpoint === "/responses")
  for (const p of gpt5Models) {
    const url = COPILOT_API_BASE_URL + "/responses"
    const base = { model: p.id, max_output_tokens: 256, input: [{ type: "message", role: "user", content: "Say PONG" }] }
    console.log(` Probing ${p.id}...`)
    results.push(await probeCapability(p.id, "reasoning:high", url, hdrs, { ...base, reasoning: { effort: "high" } }))
    results.push(await probeCapability(p.id, "temperature", url, hdrs, { ...base, temperature: 0.5 }))
    // Check for a reasoning block in the output
    const reasonResp = await safeFetch(url, { method: "POST", headers: hdrs, body: JSON.stringify({ ...base, reasoning: { effort: "high" } }) })
    const output = (reasonResp.json as Record<string, unknown>)?.output as Array<{ type: string }> | undefined
    const hasReasoning = output?.some((o) => o.type === "reasoning") ?? false
    results.push({ model: p.id, probe: "reasoning-block", status: hasReasoning ? "pass" : "fail", detail: hasReasoning ? undefined : "no reasoning block in output" })
  }
  return results
}
// ── Proxy translation tests ──────────────────────────────────────────────────
async function runTranslationTests(): Promise<TestResult[]> {
  const results: TestResult[] = []
  const hdrs = proxyHeaders()
  // 1. Thinking normalization: adaptive → enabled for older models
  //    Uses max_tokens: 4096 to ensure budget_tokens fits (budget must be >= 1024 and < max_tokens)
  {
    const start = Date.now()
    const res = await safeFetch(PROXY_MSG_URL, { method: "POST", headers: hdrs, body: JSON.stringify({ model: "claude-haiku-4-5", max_tokens: 4096, thinking: { type: "adaptive" }, messages: [{ role: "user", content: "Say PONG" }] }) })
    const text = extractText(res.json)
    const status = res.ok && text.length > 0 ? "pass" : "fail"
    results.push({ test: "thinking-downgrade (older)", model: "claude-haiku-4-5", endpoint: "/v1/messages", status, detail: status === "fail" ? `${res.status}: ${res.body.slice(0, 60)}` : undefined, durationMs: Date.now() - start })
  }
  // 2. Effort → suffix for opus-4.7
  {
    const start = Date.now()
    const res = await safeFetch(PROXY_MSG_URL, { method: "POST", headers: hdrs, body: JSON.stringify({ model: "claude-opus-4-7", max_tokens: 256, output_config: { effort: "high" }, messages: [{ role: "user", content: "Say PONG" }] }) })
    const text = extractText(res.json)
    const status = res.ok && text.length > 0 ? "pass" : "fail"
    results.push({ test: "effort→suffix (4.7)", model: "claude-opus-4-7", endpoint: "/v1/messages", status, detail: status === "fail" ? `${res.status}: ${res.body.slice(0, 60)}` : undefined, durationMs: Date.now() - start })
  }
  // 3. Effort → param for opus-4.6
  {
    const start = Date.now()
    const res = await safeFetch(PROXY_MSG_URL, { method: "POST", headers: hdrs, body: JSON.stringify({ model: "claude-opus-4-6", max_tokens: 256, output_config: { effort: "high" }, messages: [{ role: "user", content: "Say PONG" }] }) })
    const text = extractText(res.json)
    const status = res.ok && text.length > 0 ? "pass" : "fail"
    results.push({ test: "effort→param (4.6)", model: "claude-opus-4-6", endpoint: "/v1/messages", status, detail: status === "fail" ? `${res.status}: ${res.body.slice(0, 60)}` : undefined, durationMs: Date.now() - start })
  }
  // 4. anthropic-beta 1M header
  {
    const start = Date.now()
    const h = { ...hdrs, "anthropic-beta": "context-1m-2025-08-07" }
    const res = await safeFetch(PROXY_MSG_URL, { method: "POST", headers: h, body: JSON.stringify({ model: "claude-opus-4-6", max_tokens: 64, messages: [{ role: "user", content: "Say PONG" }] }) })
    const text = extractText(res.json)
    const status = res.ok && text.length > 0 ? "pass" : "fail"
    results.push({ test: "1m-header-upgrade", model: "claude-opus-4-6", endpoint: "/v1/messages", status, detail: status === "fail" ? `${res.status}: ${res.body.slice(0, 60)}` : undefined, durationMs: Date.now() - start })
  }
  // 5. GPT-5.5 via /v1/messages (responses-via-messages routing)
  {
    const start = Date.now()
    const res = await safeFetch(PROXY_MSG_URL, { method: "POST", headers: hdrs, body: JSON.stringify({ model: "gpt-5.5", max_tokens: 64, messages: [{ role: "user", content: "Say PONG" }] }) })
    const text = extractText(res.json)
    const status = res.ok && text.length > 0 ? "pass" : "fail"
    results.push({ test: "responses-via-messages", model: "gpt-5.5", endpoint: "/v1/messages", status, detail: status === "fail" ? `${res.status}: ${res.body.slice(0, 60)}` : undefined, durationMs: Date.now() - start })
  }
  // 6. GPT-5.5 via /chat/completions (responses routing)
  {
    const start = Date.now()
    const res = await safeFetch(PROXY_CHAT_URL, { method: "POST", headers: chatHeaders(), body: JSON.stringify({ model: "gpt-5.5", max_tokens: 64, messages: [{ role: "user", content: "Say PONG" }] }) })
    const text = extractText(res.json)
    const status = res.ok && text.length > 0 ? "pass" : "fail"
    results.push({ test: "responses-via-chat", model: "gpt-5.5", endpoint: "/chat/completions", status, detail: status === "fail" ? `${res.status}: ${res.body.slice(0, 60)}` : undefined, durationMs: Date.now() - start })
  }
  return results
}
// ── Report ───────────────────────────────────────────────────────────────────
function generateReport(
  models: CopilotModel[],
  profiles: ModelProfile[],
  blameResults: BlameResult[],
  translationResults: TestResult[],
  capabilityResults: CapabilityProbe[],
): string {
  const now = new Date().toISOString().split("T")[0]
  const allSmoke = blameResults.length
  const smokePass = blameResults.filter((r) => r.blame === "ok" || r.blame === "proxy-fix").length
  const proxyBugs = blameResults.filter((r) => r.blame === "proxy-bug").length
  const upstreamIssues = blameResults.filter((r) => r.blame === "upstream").length
  const transPass = translationResults.filter((r) => r.status === "pass").length
  const capPass = capabilityResults.filter((r) => r.status === "pass").length
  const totalTests = allSmoke + translationResults.length + capabilityResults.length
  const lines: string[] = [
    `# Proxy Model Validation Report — ${now}`,
    "",
    "## Summary",
    `- Models discovered: ${models.length}`,
    `- Models tested: ${profiles.length}`,
    `- Tests: ${allSmoke} smoke + ${translationResults.length} translation + ${capabilityResults.length} capability = ${totalTests} total`,
    `- Smoke: ${smokePass} ok, ${proxyBugs} proxy bugs, ${upstreamIssues} upstream limitations`,
    `- Translation: ${transPass}/${translationResults.length} pass`,
    `- Capability: ${capPass}/${capabilityResults.length} pass`,
    "",
    "## Model Profiles",
    "",
    "| Model | Endpoint | Tools | Thinking | Effort | Temp | Max Output |",
    "|-------|----------|-------|----------|--------|------|------------|",
  ]
  for (const p of profiles) {
    lines.push(`| ${p.id} | ${p.endpoint} | ${p.toolSupport ? "✅" : "❌"} | ${p.thinkingSupport} | ${p.effortSupport} | ${p.temperatureSupport ? "✅" : "❌"} | ${p.maxOutputTokens} |`)
  }
  // Smoke tests — blame matrix
  lines.push("", "## Smoke Tests", "")
  lines.push("| Model | Test | Direct | Proxy /msg | Proxy /chat | Blame |")
  lines.push("|-------|------|--------|-----------|-------------|-------|")
  for (const r of blameResults) {
    const icon = (s: string) => (s === "pass" ? "✅" : s === "skip" ? "⬜" : "❌")
    const blameIcon = r.blame === "ok" ? "✅" : r.blame === "upstream" ? "⚠️" : r.blame === "proxy-bug" ? "🐛" : "🤔"
    lines.push(`| ${r.model} | ${r.test} | ${icon(r.direct)} | ${icon(r.proxyMessages)} | ${icon(r.proxyChat)} | ${blameIcon} ${r.blame} |`)
  }
  // Translation tests
  lines.push("", "## Proxy Translation Tests", "")
  lines.push("| Test | Model | Status | Detail | Duration |")
  lines.push("|------|-------|--------|--------|----------|")
  for (const r of translationResults) {
    const icon = r.status === "pass" ? "✅" : "❌"
    lines.push(`| ${r.test} | ${r.model} | ${icon} | ${r.detail ?? ""} | ${fmtMs(r.durationMs)} |`)
  }
  // Capability matrix
  if (capabilityResults.length > 0) {
    lines.push("", "## Capability Matrix", "")
    const probeNames = [...new Set(capabilityResults.map((r) => r.probe))]
    lines.push("| Model | " + probeNames.join(" | ") + " |")
    lines.push("|-------|" + probeNames.map(() => "---").join("|") + "|")
    const modelIds = [...new Set(capabilityResults.map((r) => r.model))]
    for (const mid of modelIds) {
      const cells = probeNames.map((p) => {
        const r = capabilityResults.find((c) => c.model === mid && c.probe === p)
        if (!r) return "—"
        return r.status === "pass" ? "✅" : "❌"
      })
      lines.push(`| ${mid} | ${cells.join(" | ")} |`)
    }
  }
  // Failures
  const bugs = blameResults.filter((r) => r.blame === "proxy-bug")
  if (bugs.length > 0) {
    lines.push("", "## Proxy Bugs (pass direct, fail proxy)", "")
    for (const f of bugs) lines.push(`- **${f.model}** / ${f.test}: ${f.detail ?? "unknown"}`)
  }
  const upstream = blameResults.filter((r) => r.blame === "upstream")
  if (upstream.length > 0) {
    lines.push("", "## Upstream Limitations (fail both)", "")
    for (const f of upstream) lines.push(`- **${f.model}** / ${f.test}: ${f.detail ?? "unknown"}`)
  }
  lines.push("", "---", `Generated by proxy-model-validation.ts`)
  return lines.join("\n")
}
// ── Main ─────────────────────────────────────────────────────────────────────
async function main() {
  console.log("═══════════════════════════════════════════════════════════")
  console.log(" Proxy Model Validation Suite")
  console.log("═══════════════════════════════════════════════════════════\n")
  // Auth
  const jwt = await getJwt()
  console.log("✅ Authenticated\n")
  // Check proxy (safeFetch never throws, so a plain status check suffices)
  const health = await safeFetch(PROXY_URL + "/health", { method: "GET", headers: {} })
  if (!health.ok) {
    console.error("❌ Proxy not reachable at", PROXY_URL)
    console.error(" Start with: bun run dev start --port 4141")
    process.exit(1)
  }
  console.log("✅ Proxy reachable\n")
  // Phase 1: Discovery
  console.log("Phase 1: Fetching models...")
  const allModels = await fetchModels(jwt)
  let chatModels = filterChatModels(allModels)
  if (modelFilter) chatModels = chatModels.filter((m) => m.id.includes(modelFilter))
  // Skip variants for smoke tests (test base models only)
  chatModels = chatModels.filter((m) => !m.id.endsWith("-high") && !m.id.endsWith("-xhigh") && !m.id.endsWith("-internal"))
  const profiles = chatModels.map(classifyModel)
  console.log(` Found ${allModels.length} total, testing ${profiles.length} chat models\n`)
  // Phase 2: Smoke tests (3 dimensions)
  console.log("Phase 2: Smoke tests (Direct + Proxy /messages + Proxy /chat)...\n")
  const blameResults: BlameResult[] = []
  const hdrs = copilotHeaders(jwt)
  for (const profile of profiles) {
    process.stdout.write(` ${profile.id.padEnd(25)}`)
    // Text test across 3 dimensions
    const directText = await testText(profile, directUrl(profile), hdrs, directFormat(profile))
    const proxyMsgText = await testText(profile, PROXY_MSG_URL, proxyHeaders(), "anthropic")
    const proxyChatText = await testText(profile, PROXY_CHAT_URL, chatHeaders(), "openai")
    const textBlame = computeBlame(directText.status, proxyMsgText.status === "pass" && proxyChatText.status === "pass" ? "pass" : "fail")
    blameResults.push({
      model: profile.id, test: "text",
      direct: directText.status, proxyMessages: proxyMsgText.status, proxyChat: proxyChatText.status,
      blame: textBlame, detail: proxyChatText.detail ?? proxyMsgText.detail ?? directText.detail,
      durationMs: directText.durationMs + proxyMsgText.durationMs + proxyChatText.durationMs,
    })
    const tIcon = (s: string) => (s === "pass" ? "✅" : s === "skip" ? "⬜" : "❌")
    process.stdout.write(`text: ${tIcon(directText.status)}${tIcon(proxyMsgText.status)}${tIcon(proxyChatText.status)}`)
    if (!quickMode) {
      // Tools
      const directTool = await testTools(profile, directUrl(profile), hdrs, directFormat(profile))
      const proxyMsgTool = await testTools(profile, PROXY_MSG_URL, proxyHeaders(), "anthropic")
      const proxyChatTool = await testTools(profile, PROXY_CHAT_URL, chatHeaders(), "openai")
      const toolBlame = computeBlame(directTool.status, proxyMsgTool.status === "pass" && proxyChatTool.status === "pass" ? "pass" : proxyMsgTool.status === "skip" ? "skip" : "fail")
      blameResults.push({
        model: profile.id, test: "tools",
        direct: directTool.status, proxyMessages: proxyMsgTool.status, proxyChat: proxyChatTool.status,
        blame: toolBlame, detail: proxyChatTool.detail ?? proxyMsgTool.detail,
        durationMs: directTool.durationMs + proxyMsgTool.durationMs + proxyChatTool.durationMs,
      })
      process.stdout.write(` tools: ${tIcon(directTool.status)}${tIcon(proxyMsgTool.status)}${tIcon(proxyChatTool.status)}`)
      // Stream
      const directStream = await testStream(profile, directUrl(profile), hdrs, directFormat(profile))
      const proxyMsgStream = await testStream(profile, PROXY_MSG_URL, proxyHeaders(), "anthropic")
      const proxyChatStream = await testStream(profile, PROXY_CHAT_URL, chatHeaders(), "openai")
      const streamBlame = computeBlame(directStream.status, proxyMsgStream.status === "pass" && proxyChatStream.status === "pass" ? "pass" : "fail")
      blameResults.push({
        model: profile.id, test: "stream",
        direct: directStream.status, proxyMessages: proxyMsgStream.status, proxyChat: proxyChatStream.status,
        blame: streamBlame, detail: proxyChatStream.detail ?? proxyMsgStream.detail,
        durationMs: directStream.durationMs + proxyMsgStream.durationMs + proxyChatStream.durationMs,
      })
      process.stdout.write(` stream: ${tIcon(directStream.status)}${tIcon(proxyMsgStream.status)}${tIcon(proxyChatStream.status)}`)
    }
    console.log()
  }
  // Phase 3: Proxy translation tests
  console.log("\nPhase 3: Proxy translation tests...")
  const translationResults = await runTranslationTests()
  for (const r of translationResults) {
    const icon = r.status === "pass" ? "✅" : "❌"
    console.log(` ${icon} ${r.test} (${r.model})`)
  }
  // Phase 4: Capability probes
  let capabilityResults: CapabilityProbe[] = []
  if (runCapabilities) {
    console.log("\nPhase 4: Capability probes (direct upstream)...")
    capabilityResults = await runCapabilityProbes(jwt, profiles)
  }
  // Phase 5: Report
  const report = generateReport(allModels, profiles, blameResults, translationResults, capabilityResults)
  const reportsDir = join(import.meta.dir, "reports")
  await mkdir(reportsDir, { recursive: true }) // create reports/ on first run
  const reportPath = join(reportsDir, "proxy-validation-latest.md")
  await writeFile(reportPath, report)
  console.log(`\n📄 Report: ${reportPath}`)
  // Also write the capability matrix as JSON
  if (capabilityResults.length > 0) {
    const jsonPath = join(reportsDir, "capability-matrix.json")
    await writeFile(jsonPath, JSON.stringify(capabilityResults, null, 2))
    console.log(`📊 Capability matrix: ${jsonPath}`)
  }
  // Summary
  const bugs = blameResults.filter((r) => r.blame === "proxy-bug").length
  const ok = blameResults.filter((r) => r.blame === "ok" || r.blame === "proxy-fix").length
  console.log(`\n═══════════════════════════════════════════════════════════`)
  console.log(` Results: ${ok} ok, ${bugs} proxy bugs, ${blameResults.filter((r) => r.blame === "upstream").length} upstream`)
  console.log(`═══════════════════════════════════════════════════════════`)
  if (bugs > 0) process.exit(1)
}
main().catch((e) => {
  console.error("Fatal:", e)
  process.exit(1)
})