Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Add Responses API support and model-level routing
- Route gpt-5.3-codex requests through the Responses API since it
  doesn't work with chat/completions
- Add model(level) suffix parsing for reasoning effort control
  (e.g. gpt-5.3-codex(high), claude-opus-4.6(medium))
- Add direct /v1/responses endpoint for passthrough access
- Expand /v1/models to list level-suffixed variants
- Pass through Claude thinking config when level is specified
- Fix translateModelName to not clobber 4.6 model names
  • Loading branch information
Godzilla675 committed Mar 4, 2026
commit ca194508ad6dcfd494a1677a96990906b273903d
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ These endpoints mimic the OpenAI API structure.
| Endpoint | Method | Description |
| --------------------------- | ------ | --------------------------------------------------------- |
| `POST /v1/chat/completions` | `POST` | Creates a model response for the given chat conversation. |
| `POST /v1/responses` | `POST` | Creates a model response using the Responses API format. |
| `GET /v1/models` | `GET` | Lists the currently available models. |
| `POST /v1/embeddings` | `POST` | Creates an embedding vector representing the input text. |

Expand Down
38 changes: 38 additions & 0 deletions src/lib/model-level.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/** Reasoning-effort levels that can be appended to a model name, e.g. "gpt-5.3-codex(high)". */
export const MODEL_LEVELS = ["low", "medium", "high", "xhigh"] as const

export type ModelLevel = (typeof MODEL_LEVELS)[number]

/** Per-model allow-list of the level suffixes each base model supports. */
export const MODEL_LEVEL_VARIANTS = {
  "gpt-5.3-codex": MODEL_LEVELS,
  "claude-opus-4.6": ["low", "medium", "high"],
  "claude-opus-4.6-fast": ["low", "medium", "high"],
  "claude-sonnet-4.6": ["low", "medium", "high"],
} as const satisfies Record<string, ReadonlyArray<ModelLevel>>

// Built from MODEL_LEVELS so the pattern can never drift out of sync with the
// list above (previously the alternation was hard-coded a second time here).
const LEVEL_SUFFIX_PATTERN = new RegExp(`^(.+)\\((${MODEL_LEVELS.join("|")})\\)$`)

/**
 * Splits a model name of the form "base(level)" into its parts.
 *
 * Returns the full name as baseModel with level undefined when no valid
 * level suffix is present (unknown levels are treated as part of the name).
 */
export const parseModelNameWithLevel = (
  model: string,
): {
  baseModel: string
  level: ModelLevel | undefined
} => {
  const match = LEVEL_SUFFIX_PATTERN.exec(model)
  if (!match) {
    return {
      baseModel: model,
      level: undefined,
    }
  }

  return {
    baseModel: match[1],
    level: match[2] as ModelLevel,
  }
}

/** True for the Codex model that must be routed through the Responses API. */
export const isCodexResponsesModel = (model: string): boolean =>
  model === "gpt-5.3-codex"

/** True for the Claude models that accept a thinking/effort configuration. */
export const isClaudeThinkingModel = (model: string): boolean => {
  const thinkingModels = [
    "claude-opus-4.6",
    "claude-opus-4.6-fast",
    "claude-sonnet-4.6",
  ]
  return thinkingModels.includes(model)
}
50 changes: 48 additions & 2 deletions src/routes/chat-completions/handler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ import consola from "consola"
import { streamSSE, type SSEMessage } from "hono/streaming"

import { awaitApproval } from "~/lib/approval"
import {
isCodexResponsesModel,
parseModelNameWithLevel,
} from "~/lib/model-level"
import { checkRateLimit } from "~/lib/rate-limit"
import { state } from "~/lib/state"
import { getTokenCount } from "~/lib/tokenizer"
Expand All @@ -12,17 +16,29 @@ import {
createChatCompletions,
type ChatCompletionResponse,
type ChatCompletionsPayload,
normalizeChatCompletionsPayloadModel,
} from "~/services/copilot/create-chat-completions"
import {
createResponses,
type ResponsesApiResponse,
} from "~/services/copilot/create-responses"

import {
translateChatCompletionsToResponses,
translateResponsesStreamToChatStream,
translateResponsesToChatCompletions,
} from "./responses-translation"

export async function handleCompletion(c: Context) {
await checkRateLimit(state)

let payload = await c.req.json<ChatCompletionsPayload>()
const { baseModel } = parseModelNameWithLevel(payload.model)
consola.debug("Request payload:", JSON.stringify(payload).slice(-400))

// Find the selected model
const selectedModel = state.models?.data.find(
(model) => model.id === payload.model,
(model) => model.id === baseModel,
)

// Calculate and display token count
Expand All @@ -47,7 +63,33 @@ export async function handleCompletion(c: Context) {
consola.debug("Set max_tokens to:", JSON.stringify(payload.max_tokens))
}

const response = await createChatCompletions(payload)
const normalizedPayload = normalizeChatCompletionsPayloadModel(payload)

if (isCodexResponsesModel(baseModel)) {
const responsesPayload =
translateChatCompletionsToResponses(normalizedPayload)
const responses = await createResponses(responsesPayload)

if (isNonStreamingResponse(responses)) {
const completionResponse = translateResponsesToChatCompletions(responses)
consola.debug(
"Codex translated response:",
JSON.stringify(completionResponse).slice(-400),
)
return c.json(completionResponse)
}

return streamSSE(c, async (stream) => {
for await (const chunk of translateResponsesStreamToChatStream(
responses,
normalizedPayload.model,
)) {
await stream.writeSSE(chunk)
}
})
}

const response = await createChatCompletions(normalizedPayload)

if (isNonStreaming(response)) {
consola.debug("Non-streaming response:", JSON.stringify(response))
Expand All @@ -63,6 +105,10 @@ export async function handleCompletion(c: Context) {
})
}

// A Responses API result is non-streaming when it is not an async iterable
// (streaming results expose Symbol.asyncIterator).
const isNonStreamingResponse = (
  response: Awaited<ReturnType<typeof createResponses>>,
): response is ResponsesApiResponse => {
  const isStream = Symbol.asyncIterator in response
  return !isStream
}

// Chat-completions results carry an own "choices" property only in the
// non-streaming response shape.
const isNonStreaming = (
  response: Awaited<ReturnType<typeof createChatCompletions>>,
): response is ChatCompletionResponse => {
  return Object.hasOwn(response, "choices")
}
253 changes: 253 additions & 0 deletions src/routes/chat-completions/responses-translation.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,253 @@
import type { SSEMessage } from "hono/streaming"

import { randomUUID } from "node:crypto"

import type {
ChatCompletionChunk,
ChatCompletionResponse,
ChatCompletionsPayload,
ContentPart,
Message,
ToolCall,
} from "~/services/copilot/create-chat-completions"
import type {
ResponseInputContentPart,
ResponseInputMessage,
ResponsesApiResponse,
ResponsesFunctionCall,
ResponsesOutputContentPart,
ResponsesOutputItem,
ResponsesPayload,
} from "~/services/copilot/create-responses"

/**
 * Maps an OpenAI chat-completions payload onto the Responses API payload
 * shape: messages become input items, max_tokens becomes max_output_tokens,
 * and an explicit `reasoning` object wins over a bare `reasoning_effort`.
 */
export function translateChatCompletionsToResponses(
  payload: ChatCompletionsPayload,
): ResponsesPayload {
  const input = payload.messages.map((message) => translateMessage(message))

  // Prefer a caller-supplied reasoning object; otherwise synthesize one from
  // the flat reasoning_effort field when present.
  let reasoning = payload.reasoning
  if (reasoning == null && payload.reasoning_effort) {
    reasoning = { effort: payload.reasoning_effort }
  }

  return {
    model: payload.model,
    input,
    stream: payload.stream,
    temperature: payload.temperature,
    top_p: payload.top_p,
    max_output_tokens: payload.max_tokens,
    stop: payload.stop,
    // NOTE(review): tool definitions are forwarded verbatim; confirm the
    // Responses API accepts the chat-completions tool schema unchanged.
    tools: payload.tools as Array<unknown> | null | undefined,
    tool_choice: payload.tool_choice,
    user: payload.user,
    reasoning_effort: payload.reasoning_effort,
    reasoning,
  }
}

/**
 * Converts a non-streaming Responses API result into the chat-completions
 * response shape, carrying over text output, tool calls and token usage.
 * finish_reason is "tool_calls" when any tool call was produced, else "stop".
 */
export function translateResponsesToChatCompletions(
  response: ResponsesApiResponse,
): ChatCompletionResponse {
  const items = response.output ?? []
  const content = extractOutputText(items, response.output_text)
  const toolCalls = extractToolCalls(items)
  const usage = response.usage
  const inputTokens = usage?.input_tokens ?? 0
  const outputTokens = usage?.output_tokens ?? 0

  return {
    id: response.id,
    object: "chat.completion",
    // Fall back to "now" when the upstream omits a creation timestamp.
    created: response.created_at ?? Math.floor(Date.now() / 1000),
    model: response.model,
    choices: [
      {
        index: 0,
        message: {
          role: "assistant",
          content: content.length > 0 ? content : null,
          ...(toolCalls.length > 0 ? { tool_calls: toolCalls } : {}),
        },
        logprobs: null,
        finish_reason: toolCalls.length > 0 ? "tool_calls" : "stop",
      },
    ],
    usage: {
      prompt_tokens: inputTokens,
      completion_tokens: outputTokens,
      total_tokens: usage?.total_tokens ?? inputTokens + outputTokens,
    },
  }
}

export async function* translateResponsesStreamToChatStream(
responseStream: AsyncIterable<{ data?: string }>,
model: string,
): AsyncGenerator<SSEMessage> {
const completionId = randomUUID()
const created = Math.floor(Date.now() / 1000)
let hasEmittedContent = false

for await (const rawEvent of responseStream) {
if (rawEvent.data === "[DONE]") {
const endChunk: ChatCompletionChunk = {
id: completionId,
object: "chat.completion.chunk",
created,
model,
choices: [
{
index: 0,
delta: {},
finish_reason: "stop",
logprobs: null,
},
],
}
yield { data: JSON.stringify(endChunk) }
yield { data: "[DONE]" }
return
}

if (!rawEvent.data) {
continue
}

const parsedEvent = JSON.parse(rawEvent.data) as {
type?: string
delta?: string
}

if (
parsedEvent.type === "response.output_text.delta"
&& typeof parsedEvent.delta === "string"
) {
const chunk: ChatCompletionChunk = {
id: completionId,
object: "chat.completion.chunk",
created,
model,
choices: [
{
index: 0,
delta: {
...(hasEmittedContent ? {} : { role: "assistant" }),
content: parsedEvent.delta,
},
finish_reason: null,
logprobs: null,
},
],
}
hasEmittedContent = true
yield { data: JSON.stringify(chunk) }
}
}
Comment on lines +113 to +340
Copy link

Copilot AI Mar 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

translateResponsesStreamToChatStream currently ignores all Responses streaming event types except response.output_text.delta, so streamed tool/function call events (and any other deltas) will be dropped. This breaks the PR’s claim that tool calls are translated for streaming and can lead to clients never receiving tool_calls deltas / correct finish_reason. Extend the stream translator to handle function/tool-call related event types and emit the corresponding Chat Completions chunk deltas (including a final chunk with finish_reason: "tool_calls" when applicable).

Copilot uses AI. Check for mistakes.
}
Comment on lines +84 to +341
Copy link

Copilot AI Mar 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are tests for non-streaming chat<->responses translation, but the streaming translator translateResponsesStreamToChatStream is untested. Add unit tests that feed representative Responses SSE events (text deltas, function/tool-call deltas, and termination) and assert the emitted Chat Completions chunks (including role emission, tool_calls, and correct finish_reason).

Copilot uses AI. Check for mistakes.

function translateMessage(message: Message): ResponseInputMessage {
let content: ResponseInputMessage["content"]
if (typeof message.content === "string") {
content = message.content
} else if (message.content === null) {
content = ""
} else {
content = message.content.map((part) => translateContentPart(part))
}

return {
role: message.role,
content,
}
Comment on lines +353 to +365
Copy link

Copilot AI Mar 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

translateMessage only forwards role and content and drops chat-completions fields like tool_call_id, tool_calls, and name. If a client includes prior assistant tool calls or sends tool results (role "tool" with tool_call_id) in the conversation history, that information will be lost when converting to the Responses API, which can break multi-turn tool workflows. Preserve these fields by mapping them into the Responses input schema (including associating tool results with the correct call id).

Suggested change
return {
role: message.role,
content,
}
const translated: ResponseInputMessage = {
role: message.role,
content,
...(message as any).name ? { name: (message as any).name } : {},
...(message as any).tool_call_id
? { tool_call_id: (message as any).tool_call_id }
: {},
...(Array.isArray((message as any).tool_calls)
? { tool_calls: (message as any).tool_calls }
: {}),
} as ResponseInputMessage
return translated

Copilot uses AI. Check for mistakes.
}

// Maps a chat-completions content part onto the Responses API input-part
// shape: text parts become input_text, everything else is treated as an
// image part with a flattened image_url.
function translateContentPart(part: ContentPart): ResponseInputContentPart {
  if (part.type !== "text") {
    return {
      type: "input_image",
      image_url: part.image_url.url,
      detail: part.image_url.detail,
    }
  }

  return {
    type: "input_text",
    text: part.text,
  }
}

// Returns the response's text: the convenience output_text field when
// non-empty, otherwise the concatenation of every output_text part found in
// "message" output items (string content is taken verbatim).
function extractOutputText(
  outputItems: Array<ResponsesOutputItem>,
  outputText: string | undefined,
): string {
  if (outputText) {
    return outputText
  }

  let text = ""
  for (const item of outputItems) {
    if (item.type !== "message") {
      continue
    }
    if (typeof item.content === "string") {
      text += item.content
      continue
    }
    if (!Array.isArray(item.content)) {
      continue
    }
    for (const contentPart of item.content) {
      if (contentPart.type === "output_text") {
        text += contentPart.text
      }
    }
  }
  return text
}

// Collects tool calls from the output: both top-level function_call items
// and function-call parts nested inside a message item's content array.
function extractToolCalls(
  outputItems: Array<ResponsesOutputItem>,
): Array<ToolCall> {
  const calls: Array<ToolCall> = []

  for (const item of outputItems) {
    if (item.type === "function_call") {
      calls.push(translateFunctionCall(item))
    } else if (Array.isArray(item.content)) {
      for (const part of item.content) {
        if (isResponsesFunctionCall(part)) {
          calls.push(translateFunctionCall(part))
        }
      }
    }
  }

  return calls
}

// Converts a Responses function call into the chat-completions ToolCall
// shape, minting a random id when the upstream supplied neither call_id
// nor id.
function translateFunctionCall(functionCall: ResponsesFunctionCall): ToolCall {
  const id = functionCall.call_id ?? functionCall.id ?? randomUUID()
  return {
    id,
    type: "function",
    function: {
      name: functionCall.name,
      arguments: functionCall.arguments,
    },
  }
}

// Type guard: a content part counts as a function call when it carries the
// function_call tag together with string name and arguments fields.
function isResponsesFunctionCall(
  value: ResponsesOutputContentPart,
): value is ResponsesFunctionCall {
  if (value.type !== "function_call") {
    return false
  }
  const candidate = value as { name?: unknown; arguments?: unknown }
  return (
    typeof candidate.name === "string"
    && typeof candidate.arguments === "string"
  )
}
Loading
Loading