Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Add Responses API support and model-level routing
- Route gpt-5.3-codex requests through the Responses API since it
  doesn't work with chat/completions
- Add model(level) suffix parsing for reasoning effort control
  (e.g. gpt-5.3-codex(high), claude-opus-4.6(medium))
- Add direct /v1/responses endpoint for passthrough access
- Expand /v1/models to list level-suffixed variants
- Pass through Claude thinking config when level is specified
- Fix translateModelName to not clobber 4.6 model names
  • Loading branch information
Godzilla675 committed Mar 4, 2026
commit ca194508ad6dcfd494a1677a96990906b273903d
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ These endpoints mimic the OpenAI API structure.
| Endpoint | Method | Description |
| --------------------------- | ------ | --------------------------------------------------------- |
| `POST /v1/chat/completions` | `POST` | Creates a model response for the given chat conversation. |
| `POST /v1/responses` | `POST` | Creates a model response using the Responses API format. |
| `GET /v1/models` | `GET` | Lists the currently available models. |
| `POST /v1/embeddings` | `POST` | Creates an embedding vector representing the input text. |

Expand Down
38 changes: 38 additions & 0 deletions src/lib/model-level.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
/** Reasoning-effort levels that can be appended to a model name, e.g. "gpt-5.3-codex(high)". */
export const MODEL_LEVELS = ["low", "medium", "high", "xhigh"] as const

export type ModelLevel = (typeof MODEL_LEVELS)[number]

/** Per-model allow-list of the level suffixes each base model supports. */
export const MODEL_LEVEL_VARIANTS = {
  "gpt-5.3-codex": MODEL_LEVELS,
  "claude-opus-4.6": ["low", "medium", "high"],
  "claude-opus-4.6-fast": ["low", "medium", "high"],
  "claude-sonnet-4.6": ["low", "medium", "high"],
} as const satisfies Record<string, ReadonlyArray<ModelLevel>>

// Built from MODEL_LEVELS so the pattern can never drift out of sync with the
// list above (previously the alternation was hard-coded a second time here).
const LEVEL_SUFFIX_PATTERN = new RegExp(`^(.+)\\((${MODEL_LEVELS.join("|")})\\)$`)

/**
 * Splits a model name of the form "base(level)" into its parts.
 *
 * Returns the full name as baseModel with level undefined when no valid
 * level suffix is present (unknown levels are treated as part of the name).
 */
export const parseModelNameWithLevel = (
  model: string,
): {
  baseModel: string
  level: ModelLevel | undefined
} => {
  const match = LEVEL_SUFFIX_PATTERN.exec(model)
  if (!match) {
    return {
      baseModel: model,
      level: undefined,
    }
  }

  return {
    baseModel: match[1],
    level: match[2] as ModelLevel,
  }
}

/** True for the Codex model that must be routed through the Responses API. */
export const isCodexResponsesModel = (model: string): boolean =>
  model === "gpt-5.3-codex"

/** True for the Claude models that accept a thinking/effort configuration. */
export const isClaudeThinkingModel = (model: string): boolean => {
  const thinkingModels = [
    "claude-opus-4.6",
    "claude-opus-4.6-fast",
    "claude-sonnet-4.6",
  ]
  return thinkingModels.includes(model)
}
50 changes: 48 additions & 2 deletions src/routes/chat-completions/handler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ import consola from "consola"
import { streamSSE, type SSEMessage } from "hono/streaming"

import { awaitApproval } from "~/lib/approval"
import {
isCodexResponsesModel,
parseModelNameWithLevel,
} from "~/lib/model-level"
import { checkRateLimit } from "~/lib/rate-limit"
import { state } from "~/lib/state"
import { getTokenCount } from "~/lib/tokenizer"
Expand All @@ -12,17 +16,29 @@ import {
createChatCompletions,
type ChatCompletionResponse,
type ChatCompletionsPayload,
normalizeChatCompletionsPayloadModel,
} from "~/services/copilot/create-chat-completions"
import {
createResponses,
type ResponsesApiResponse,
} from "~/services/copilot/create-responses"

import {
translateChatCompletionsToResponses,
translateResponsesStreamToChatStream,
translateResponsesToChatCompletions,
} from "./responses-translation"

export async function handleCompletion(c: Context) {
await checkRateLimit(state)

let payload = await c.req.json<ChatCompletionsPayload>()
const { baseModel } = parseModelNameWithLevel(payload.model)
consola.debug("Request payload:", JSON.stringify(payload).slice(-400))

// Find the selected model
const selectedModel = state.models?.data.find(
(model) => model.id === payload.model,
(model) => model.id === baseModel,
)

// Calculate and display token count
Expand All @@ -47,7 +63,33 @@ export async function handleCompletion(c: Context) {
consola.debug("Set max_tokens to:", JSON.stringify(payload.max_tokens))
}

const response = await createChatCompletions(payload)
const normalizedPayload = normalizeChatCompletionsPayloadModel(payload)

if (isCodexResponsesModel(baseModel)) {
const responsesPayload =
translateChatCompletionsToResponses(normalizedPayload)
const responses = await createResponses(responsesPayload)

if (isNonStreamingResponse(responses)) {
const completionResponse = translateResponsesToChatCompletions(responses)
consola.debug(
"Codex translated response:",
JSON.stringify(completionResponse).slice(-400),
)
return c.json(completionResponse)
}

return streamSSE(c, async (stream) => {
for await (const chunk of translateResponsesStreamToChatStream(
responses,
normalizedPayload.model,
)) {
await stream.writeSSE(chunk)
}
})
}

const response = await createChatCompletions(normalizedPayload)

if (isNonStreaming(response)) {
consola.debug("Non-streaming response:", JSON.stringify(response))
Expand All @@ -63,6 +105,10 @@ export async function handleCompletion(c: Context) {
})
}

// A Responses API result is non-streaming when it is not an async iterable
// (streaming results expose Symbol.asyncIterator).
const isNonStreamingResponse = (
  response: Awaited<ReturnType<typeof createResponses>>,
): response is ResponsesApiResponse => {
  const isStream = Symbol.asyncIterator in response
  return !isStream
}

// Chat-completions results carry an own "choices" property only in the
// non-streaming response shape.
const isNonStreaming = (
  response: Awaited<ReturnType<typeof createChatCompletions>>,
): response is ChatCompletionResponse => {
  return Object.hasOwn(response, "choices")
}
253 changes: 253 additions & 0 deletions src/routes/chat-completions/responses-translation.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,253 @@
import type { SSEMessage } from "hono/streaming"

import { randomUUID } from "node:crypto"

import type {
ChatCompletionChunk,
ChatCompletionResponse,
ChatCompletionsPayload,
ContentPart,
Message,
ToolCall,
} from "~/services/copilot/create-chat-completions"
import type {
ResponseInputContentPart,
ResponseInputMessage,
ResponsesApiResponse,
ResponsesFunctionCall,
ResponsesOutputContentPart,
ResponsesOutputItem,
ResponsesPayload,
} from "~/services/copilot/create-responses"

/**
 * Maps an OpenAI chat-completions payload onto the Responses API payload
 * shape: messages become input items, max_tokens becomes max_output_tokens,
 * and an explicit `reasoning` object wins over a bare `reasoning_effort`.
 */
export function translateChatCompletionsToResponses(
  payload: ChatCompletionsPayload,
): ResponsesPayload {
  const input = payload.messages.map((message) => translateMessage(message))

  // Prefer a caller-supplied reasoning object; otherwise synthesize one from
  // the flat reasoning_effort field when present.
  let reasoning = payload.reasoning
  if (reasoning == null && payload.reasoning_effort) {
    reasoning = { effort: payload.reasoning_effort }
  }

  return {
    model: payload.model,
    input,
    stream: payload.stream,
    temperature: payload.temperature,
    top_p: payload.top_p,
    max_output_tokens: payload.max_tokens,
    stop: payload.stop,
    // NOTE(review): tool definitions are forwarded verbatim; confirm the
    // Responses API accepts the chat-completions tool schema unchanged.
    tools: payload.tools as Array<unknown> | null | undefined,
    tool_choice: payload.tool_choice,
    user: payload.user,
    reasoning_effort: payload.reasoning_effort,
    reasoning,
  }
}

/**
 * Converts a non-streaming Responses API result into the chat-completions
 * response shape, carrying over text output, tool calls and token usage.
 * finish_reason is "tool_calls" when any tool call was produced, else "stop".
 */
export function translateResponsesToChatCompletions(
  response: ResponsesApiResponse,
): ChatCompletionResponse {
  const items = response.output ?? []
  const content = extractOutputText(items, response.output_text)
  const toolCalls = extractToolCalls(items)
  const usage = response.usage
  const inputTokens = usage?.input_tokens ?? 0
  const outputTokens = usage?.output_tokens ?? 0

  return {
    id: response.id,
    object: "chat.completion",
    // Fall back to "now" when the upstream omits a creation timestamp.
    created: response.created_at ?? Math.floor(Date.now() / 1000),
    model: response.model,
    choices: [
      {
        index: 0,
        message: {
          role: "assistant",
          content: content.length > 0 ? content : null,
          ...(toolCalls.length > 0 ? { tool_calls: toolCalls } : {}),
        },
        logprobs: null,
        finish_reason: toolCalls.length > 0 ? "tool_calls" : "stop",
      },
    ],
    usage: {
      prompt_tokens: inputTokens,
      completion_tokens: outputTokens,
      total_tokens: usage?.total_tokens ?? inputTokens + outputTokens,
    },
  }
}

export async function* translateResponsesStreamToChatStream(
responseStream: AsyncIterable<{ data?: string }>,
model: string,
): AsyncGenerator<SSEMessage> {
const completionId = randomUUID()
const created = Math.floor(Date.now() / 1000)
let hasEmittedContent = false

for await (const rawEvent of responseStream) {
if (rawEvent.data === "[DONE]") {
const endChunk: ChatCompletionChunk = {
id: completionId,
object: "chat.completion.chunk",
created,
model,
choices: [
{
index: 0,
delta: {},
finish_reason: "stop",
logprobs: null,
},
],
}
yield { data: JSON.stringify(endChunk) }
yield { data: "[DONE]" }
return
}

if (!rawEvent.data) {
continue
}

const parsedEvent = JSON.parse(rawEvent.data) as {
type?: string
delta?: string
}

if (
parsedEvent.type === "response.output_text.delta"
&& typeof parsedEvent.delta === "string"
) {
const chunk: ChatCompletionChunk = {
id: completionId,
object: "chat.completion.chunk",
created,
model,
choices: [
{
index: 0,
delta: {
...(hasEmittedContent ? {} : { role: "assistant" }),
content: parsedEvent.delta,
},
finish_reason: null,
logprobs: null,
},
],
}
hasEmittedContent = true
yield { data: JSON.stringify(chunk) }
}
}
Comment on lines +113 to +340
Copy link

Copilot AI Mar 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

translateResponsesStreamToChatStream currently ignores all Responses streaming event types except response.output_text.delta, so streamed tool/function call events (and any other deltas) will be dropped. This breaks the PR’s claim that tool calls are translated for streaming and can lead to clients never receiving tool_calls deltas / correct finish_reason. Extend the stream translator to handle function/tool-call related event types and emit the corresponding Chat Completions chunk deltas (including a final chunk with finish_reason: "tool_calls" when applicable).

Copilot uses AI. Check for mistakes.
}
Comment on lines +84 to +341
Copy link

Copilot AI Mar 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are tests for non-streaming chat<->responses translation, but the streaming translator translateResponsesStreamToChatStream is untested. Add unit tests that feed representative Responses SSE events (text deltas, function/tool-call deltas, and termination) and assert the emitted Chat Completions chunks (including role emission, tool_calls, and correct finish_reason).

Copilot uses AI. Check for mistakes.

function translateMessage(message: Message): ResponseInputMessage {
let content: ResponseInputMessage["content"]
if (typeof message.content === "string") {
content = message.content
} else if (message.content === null) {
content = ""
} else {
content = message.content.map((part) => translateContentPart(part))
}

return {
role: message.role,
content,
}
Comment on lines +353 to +365
Copy link

Copilot AI Mar 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

translateMessage only forwards role and content and drops chat-completions fields like tool_call_id, tool_calls, and name. If a client includes prior assistant tool calls or sends tool results (role "tool" with tool_call_id) in the conversation history, that information will be lost when converting to the Responses API, which can break multi-turn tool workflows. Preserve these fields by mapping them into the Responses input schema (including associating tool results with the correct call id).

Suggested change
return {
role: message.role,
content,
}
const translated: ResponseInputMessage = {
role: message.role,
content,
...(message as any).name ? { name: (message as any).name } : {},
...(message as any).tool_call_id
? { tool_call_id: (message as any).tool_call_id }
: {},
...(Array.isArray((message as any).tool_calls)
? { tool_calls: (message as any).tool_calls }
: {}),
} as ResponseInputMessage
return translated

Copilot uses AI. Check for mistakes.
}

// Maps a chat-completions content part onto the Responses API input-part
// shape: text parts become input_text, everything else is treated as an
// image part with a flattened image_url.
function translateContentPart(part: ContentPart): ResponseInputContentPart {
  if (part.type !== "text") {
    return {
      type: "input_image",
      image_url: part.image_url.url,
      detail: part.image_url.detail,
    }
  }

  return {
    type: "input_text",
    text: part.text,
  }
}

// Returns the response's text: the convenience output_text field when
// non-empty, otherwise the concatenation of every output_text part found in
// "message" output items (string content is taken verbatim).
function extractOutputText(
  outputItems: Array<ResponsesOutputItem>,
  outputText: string | undefined,
): string {
  if (outputText) {
    return outputText
  }

  let text = ""
  for (const item of outputItems) {
    if (item.type !== "message") {
      continue
    }
    if (typeof item.content === "string") {
      text += item.content
      continue
    }
    if (!Array.isArray(item.content)) {
      continue
    }
    for (const contentPart of item.content) {
      if (contentPart.type === "output_text") {
        text += contentPart.text
      }
    }
  }
  return text
}

// Collects tool calls from the output: both top-level function_call items
// and function-call parts nested inside a message item's content array.
function extractToolCalls(
  outputItems: Array<ResponsesOutputItem>,
): Array<ToolCall> {
  const calls: Array<ToolCall> = []

  for (const item of outputItems) {
    if (item.type === "function_call") {
      calls.push(translateFunctionCall(item))
    } else if (Array.isArray(item.content)) {
      for (const part of item.content) {
        if (isResponsesFunctionCall(part)) {
          calls.push(translateFunctionCall(part))
        }
      }
    }
  }

  return calls
}

// Converts a Responses function call into the chat-completions ToolCall
// shape, minting a random id when the upstream supplied neither call_id
// nor id.
function translateFunctionCall(functionCall: ResponsesFunctionCall): ToolCall {
  const id = functionCall.call_id ?? functionCall.id ?? randomUUID()
  return {
    id,
    type: "function",
    function: {
      name: functionCall.name,
      arguments: functionCall.arguments,
    },
  }
}

// Type guard: a content part counts as a function call when it carries the
// function_call tag together with string name and arguments fields.
function isResponsesFunctionCall(
  value: ResponsesOutputContentPart,
): value is ResponsesFunctionCall {
  if (value.type !== "function_call") {
    return false
  }
  const candidate = value as { name?: unknown; arguments?: unknown }
  return (
    typeof candidate.name === "string"
    && typeof candidate.arguments === "string"
  )
}
Loading
Loading