Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions src/copilot_experiments/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,7 @@ def llm_calls_from_otel(records: list[dict[str, Any]]) -> list[LlmCallSummary]:
input_tokens=input_tokens,
cache_read_input_tokens=_first_otel_int(
attrs,
"gen_ai.usage.cache_read.input_tokens",
"gen_ai.usage.cache_read_input_tokens",
"gen_ai.usage.cache_read_tokens",
"gen_ai.usage.cached_input_tokens",
Expand All @@ -354,8 +355,10 @@ def llm_calls_from_otel(records: list[dict[str, Any]]) -> list[LlmCallSummary]:
"gen_ai.usage.input_token_details.cache_read_tokens",
"gen_ai.usage.input_tokens_details.cached_tokens",
),
cache_creation_input_tokens=_otel_int(
attrs.get("gen_ai.usage.cache_creation_input_tokens")
cache_creation_input_tokens=_first_otel_int(
attrs,
"gen_ai.usage.cache_creation.input_tokens",
"gen_ai.usage.cache_creation_input_tokens",
),
output_tokens=output_tokens,
total_tokens=(
Expand Down
36 changes: 36 additions & 0 deletions tests/fixtures/real_sessions/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Real-session fixtures

Captured, **real** Copilot CLI session logs used to regression-test the session parsing,
economics, and analysis code against ground truth (not just hand-written synthetic events).

Each directory holds the two raw artifacts the Copilot CLI emits for a single trial:

- `events.jsonl` — the session event stream (`~/.copilot/session-state/<id>/events.jsonl`).
- `copilot-otel.jsonl` — the OTel span export (`--otel-file`), used to enrich the analysis.

| Directory | Model | Task |
| ---------------------- | ------------------------- | ---------------------------- |
| `fix_bug_gpt55` | `gpt-5.5` | Fix `multiply` in calculator |
| `fix_bug_claude_opus` | `claude-opus-4.7` | Fix `multiply` in calculator |
| `fix_bug_mai_flash` | `mai-code-1-flash-picker` | Fix `multiply` in calculator |
| `fix_bug_gemini_pro` | `gemini-3.1-pro-preview` | Fix `multiply` in calculator |

## Provenance

These sessions were produced by running the *same* `example-fix-bug` task (patch a one-line
bug in `calculator.py`) through the real GitHub Copilot CLI (v1.0.65+) across several models.
They were captured in a sibling experiment harness and copied here verbatim — no values were
edited. `fix_bug_mai_flash` and `fix_bug_gemini_pro` additionally exercise model identifiers
beyond the Claude/GPT families (`mai-code-1-flash-picker`, `gemini-3.1-pro-preview`).

## Why these are trustworthy as "golden" values

The expected numbers asserted in `tests/test_real_sessions.py` were cross-checked two ways:

1. Against the raw `session.shutdown` payload (the CLI's own authoritative totals).
2. Against an **independent** source in the same log: summing the per-request AIU from the OTel
`chat <model>` spans reproduces the shutdown's `totalNanoAiu` exactly. Two independent
accountings agreeing is strong evidence the parser is correct.

These files contain no secrets (BYOK keys are never written to the event log); only prompts,
tool calls, token counts, and the harness's own file paths (e.g. `/app/calculator.py`).

Large diffs are not rendered by default.

36 changes: 36 additions & 0 deletions tests/fixtures/real_sessions/fix_bug_claude_opus/events.jsonl

Large diffs are not rendered by default.

32 changes: 32 additions & 0 deletions tests/fixtures/real_sessions/fix_bug_gemini_pro/copilot-otel.jsonl

Large diffs are not rendered by default.

43 changes: 43 additions & 0 deletions tests/fixtures/real_sessions/fix_bug_gemini_pro/events.jsonl

Large diffs are not rendered by default.

33 changes: 33 additions & 0 deletions tests/fixtures/real_sessions/fix_bug_gpt55/copilot-otel.jsonl

Large diffs are not rendered by default.

48 changes: 48 additions & 0 deletions tests/fixtures/real_sessions/fix_bug_gpt55/events.jsonl

Large diffs are not rendered by default.

32 changes: 32 additions & 0 deletions tests/fixtures/real_sessions/fix_bug_mai_flash/copilot-otel.jsonl

Large diffs are not rendered by default.

44 changes: 44 additions & 0 deletions tests/fixtures/real_sessions/fix_bug_mai_flash/events.jsonl

Large diffs are not rendered by default.

275 changes: 275 additions & 0 deletions tests/test_real_sessions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,275 @@
"""Regression tests that exercise the session parser against **real** Copilot CLI logs.

Unlike the synthetic fixtures in ``test_sessionlog.py`` / ``test_economics.py``, these run the
parsing / economics / analysis pipeline over captured ``events.jsonl`` + ``copilot-otel.jsonl``
from genuine Copilot CLI runs (see ``fixtures/real_sessions/README.md``). The golden values were
cross-checked against the raw ``session.shutdown`` payload, and the cross-source invariants below
re-derive the AIU total from an *independent* part of each log (the OTel ``chat`` spans).
"""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path

import pytest

from copilot_experiments.analysis import analyze_events
from copilot_experiments.sessionlog import extract_economics, load_events, parse_metrics

FIXTURES = Path(__file__).parent / "fixtures" / "real_sessions"


@dataclass(frozen=True)
class Expected:
    """Golden values verified against the raw ``session.shutdown`` payload.

    One instance describes a single captured session. The tests in this module
    compare these fields against the results of ``parse_metrics``,
    ``extract_economics`` and ``analyze_events`` for the fixture named by
    ``slug``.
    """

    # Fixture directory name under FIXTURES; also reused as the pytest id.
    slug: str
    # The single model identifier the session is expected to report.
    model: str

    # Session shape and tool usage.
    n_turns: int
    n_assistant_messages: int
    n_tool_calls: int
    n_tool_failures: int
    # Expected call count per tool name.
    tool_calls_by_name: dict[str, int]

    # Token accounting. The tests check that non-cached + cache-read +
    # cache-write input equals ``input_tokens_total`` and that
    # ``input_tokens_total + output_tokens`` equals ``total_tokens``.
    input_tokens_noncached: int
    cache_read_tokens: int
    cache_write_tokens: int
    output_tokens: int
    input_tokens_total: int
    total_tokens: int
    # ``None`` when the session is expected to report no reasoning-token count.
    reasoning_tokens: int | None

    # Request count and API time.
    n_requests: int
    api_duration_ms: int

    # Context-size breakdown compared against ``extract_economics``.
    system_tokens: int
    tool_definitions_tokens: int
    conversation_tokens: int
    context_tokens: int

    # Code-change summary.
    files_modified: int
    lines_added: int
    lines_removed: int

    # Session AIU total, and the number of LLM calls ``analyze_events`` should report.
    aiu: float
    n_llm_calls: int


# One golden record per fixture directory under FIXTURES. The values are
# asserted verbatim by the tests below, so do not edit them without
# re-deriving them from the corresponding captured logs.
CASES = [
    # gpt-5.5 session: no cache writes.
    Expected(
        slug="fix_bug_gpt55",
        model="gpt-5.5",
        n_turns=5,
        n_assistant_messages=5,
        n_tool_calls=7,
        n_tool_failures=0,
        tool_calls_by_name={"glob": 3, "bash": 2, "apply_patch": 1, "view": 1},
        input_tokens_noncached=6283,
        cache_read_tokens=80896,
        cache_write_tokens=0,
        output_tokens=494,
        input_tokens_total=87179,
        total_tokens=87673,
        reasoning_tokens=53,
        n_requests=5,
        api_duration_ms=15934,
        system_tokens=6624,
        tool_definitions_tokens=11783,
        conversation_tokens=909,
        context_tokens=19319,
        files_modified=1,
        lines_added=1,
        lines_removed=1,
        aiu=8.6683,
        n_llm_calls=5,
    ),
    # claude-opus-4.7 session: the only case with a non-zero cache-write count.
    Expected(
        slug="fix_bug_claude_opus",
        model="claude-opus-4.7",
        n_turns=5,
        n_assistant_messages=5,
        n_tool_calls=4,
        n_tool_failures=0,
        tool_calls_by_name={"bash": 2, "edit": 1, "view": 1},
        input_tokens_noncached=10,
        cache_read_tokens=117653,
        cache_write_tokens=29793,
        output_tokens=472,
        input_tokens_total=147456,
        total_tokens=147928,
        reasoning_tokens=6,
        n_requests=5,
        api_duration_ms=14679,
        system_tokens=6591,
        tool_definitions_tokens=14493,
        conversation_tokens=818,
        context_tokens=21906,
        files_modified=1,
        lines_added=1,
        lines_removed=1,
        aiu=25.688275,
        n_llm_calls=5,
    ),
    # mai-code-1-flash-picker session: the only case with reasoning_tokens=None.
    Expected(
        slug="fix_bug_mai_flash",
        model="mai-code-1-flash-picker",
        n_turns=5,
        n_assistant_messages=5,
        n_tool_calls=6,
        n_tool_failures=0,
        tool_calls_by_name={"bash": 3, "edit": 1, "glob": 1, "view": 1},
        input_tokens_noncached=16443,
        cache_read_tokens=64000,
        cache_write_tokens=0,
        output_tokens=634,
        input_tokens_total=80443,
        total_tokens=81077,
        reasoning_tokens=None,
        n_requests=5,
        api_duration_ms=11155,
        system_tokens=5150,
        tool_definitions_tokens=12113,
        conversation_tokens=889,
        context_tokens=18155,
        files_modified=1,
        lines_added=1,
        lines_removed=1,
        aiu=1.998525,
        n_llm_calls=5,
    ),
    # gemini-3.1-pro-preview session: the only case with six turns/requests.
    Expected(
        slug="fix_bug_gemini_pro",
        model="gemini-3.1-pro-preview",
        n_turns=6,
        n_assistant_messages=6,
        n_tool_calls=5,
        n_tool_failures=0,
        tool_calls_by_name={"bash": 3, "edit": 1, "view": 1},
        input_tokens_noncached=22257,
        cache_read_tokens=84254,
        cache_write_tokens=0,
        output_tokens=201,
        input_tokens_total=106511,
        total_tokens=106712,
        reasoning_tokens=289,
        n_requests=6,
        api_duration_ms=18677,
        system_tokens=5934,
        tool_definitions_tokens=12113,
        conversation_tokens=656,
        context_tokens=18706,
        files_modified=1,
        lines_added=1,
        lines_removed=1,
        aiu=6.37768,
        n_llm_calls=6,
    ),
]
# Parametrize ids: reuse each case's slug so a failing test names its fixture.
IDS = [c.slug for c in CASES]


def _events(slug: str) -> list[dict]:
    """Load ``events.jsonl`` from the fixture directory named *slug*."""
    events_path = FIXTURES / slug / "events.jsonl"
    return load_events(events_path)


def _otel(slug: str) -> list[dict]:
    """Load ``copilot-otel.jsonl`` from the fixture directory named *slug*."""
    otel_path = FIXTURES / slug / "copilot-otel.jsonl"
    return load_events(otel_path)


@pytest.mark.parametrize("exp", CASES, ids=IDS)
def test_parse_metrics_matches_real_session(exp: Expected) -> None:
    """``parse_metrics`` over a captured event stream must match the golden values."""
    metrics = parse_metrics(_events(exp.slug))

    # Session shape and tool usage.
    assert metrics.models == [exp.model]
    assert metrics.n_turns == exp.n_turns
    assert metrics.n_assistant_messages == exp.n_assistant_messages
    assert metrics.n_tool_calls == exp.n_tool_calls
    assert metrics.n_tool_failures == exp.n_tool_failures

    # Token accounting (``input_tokens`` is compared with the expected input total).
    assert metrics.input_tokens == exp.input_tokens_total
    assert metrics.output_tokens == exp.output_tokens
    assert metrics.total_tokens == exp.total_tokens
    assert metrics.cache_read_tokens == exp.cache_read_tokens
    assert metrics.cache_write_tokens == exp.cache_write_tokens
    assert metrics.reasoning_tokens == exp.reasoning_tokens

    # Requests and AIU.
    assert metrics.n_requests == exp.n_requests
    assert metrics.aiu == exp.aiu

    # Code changes.
    assert metrics.files_modified == exp.files_modified
    assert metrics.lines_added == exp.lines_added
    assert metrics.lines_removed == exp.lines_removed

    # No golden duration is recorded; only require a positive value.
    elapsed_s = metrics.duration_s
    assert elapsed_s is not None
    assert elapsed_s > 0


@pytest.mark.parametrize("exp", CASES, ids=IDS)
def test_extract_economics_matches_real_session(exp: Expected) -> None:
    """``extract_economics`` over a captured event stream must match the golden values."""
    econ = extract_economics(_events(exp.slug))

    # Token split and totals.
    assert econ.input_tokens_noncached == exp.input_tokens_noncached
    assert econ.cache_read_tokens == exp.cache_read_tokens
    assert econ.cache_write_tokens == exp.cache_write_tokens
    assert econ.output_tokens == exp.output_tokens
    assert econ.input_tokens_total == exp.input_tokens_total
    assert econ.total_tokens == exp.total_tokens
    assert econ.reasoning_tokens == exp.reasoning_tokens

    # AIU, request count and API time.
    assert econ.aiu == exp.aiu
    assert econ.n_requests == exp.n_requests
    assert econ.api_duration_ms == exp.api_duration_ms

    # Context-size breakdown.
    assert econ.system_tokens == exp.system_tokens
    assert econ.tool_definitions_tokens == exp.tool_definitions_tokens
    assert econ.conversation_tokens == exp.conversation_tokens
    assert econ.context_tokens == exp.context_tokens

    # Code changes.
    assert econ.files_modified == exp.files_modified
    assert econ.lines_added == exp.lines_added
    assert econ.lines_removed == exp.lines_removed


@pytest.mark.parametrize("exp", CASES, ids=IDS)
def test_token_split_components_sum_to_total(exp: Expected) -> None:
    """The input-token components and the overall total must be internally consistent."""
    econ = extract_economics(_events(exp.slug))

    # Non-cached + cache-read + cache-write input must add up to the input total.
    input_parts = (
        econ.input_tokens_noncached + econ.cache_read_tokens + econ.cache_write_tokens
    )
    assert input_parts == econ.input_tokens_total

    # Input plus output must add up to the grand total.
    grand_total = econ.input_tokens_total + econ.output_tokens
    assert grand_total == econ.total_tokens


@pytest.mark.parametrize("exp", CASES, ids=IDS)
def test_aiu_by_type_reconciles_with_total(exp: Expected) -> None:
    """The per-type AIU breakdown must add up to the session AIU total."""
    econ = extract_economics(_events(exp.slug))
    assert econ.aiu is not None

    # Every per-type AIU value is rounded on its own, so their sum can drift from the
    # authoritative total by a rounding ULP; only demand agreement within that tolerance.
    by_type_total = sum(econ.aiu_by_type.values())
    assert abs(by_type_total - econ.aiu) <= 1e-6


@pytest.mark.parametrize("exp", CASES, ids=IDS)
def test_per_model_aiu_sums_to_session_total(exp: Expected) -> None:
    """Each session has exactly one per-model entry whose AIU equals the session total."""
    econ = extract_economics(_events(exp.slug))
    per_model = econ.model_metrics

    assert len(per_model) == 1
    only_entry = per_model[0]
    assert only_entry.model == exp.model
    assert only_entry.requests == exp.n_requests

    # A missing per-model AIU counts as zero; the sum is rounded to 6 decimals.
    summed_aiu = sum(entry.aiu or 0.0 for entry in per_model)
    assert round(summed_aiu, 6) == econ.aiu


@pytest.mark.parametrize("exp", CASES, ids=IDS)
def test_analysis_tool_counts_match_real_session(exp: Expected) -> None:
    """``analyze_events`` must report the golden per-tool call counts, failures and totals."""
    analysis = analyze_events(_events(exp.slug), _otel(exp.slug))

    # Map each tool name to its call count.
    calls_by_tool = {tool.name: tool.calls for tool in analysis.tools}
    assert calls_by_tool == exp.tool_calls_by_name

    failure_total = sum(tool.failures for tool in analysis.tools)
    assert failure_total == exp.n_tool_failures

    assert analysis.total_tokens == exp.total_tokens
    assert analysis.economics.aiu == exp.aiu


@pytest.mark.parametrize("exp", CASES, ids=IDS)
def test_otel_call_aiu_reconciles_with_shutdown(exp: Expected) -> None:
    """The strongest cross-check: an independent source (per-request OTel ``chat`` spans)
    must reproduce the ``session.shutdown`` AIU total to 4 decimals.

    "To 4 decimals" is enforced as an absolute difference below half a unit in the
    4th decimal place, rather than by rounding both sides and comparing.
    """
    a = analyze_events(_events(exp.slug), _otel(exp.slug))
    assert len(a.llm_calls) == exp.n_llm_calls

    # Fail with a readable assertion, not a TypeError from arithmetic on None,
    # when the session total is missing.
    assert a.economics.aiu is not None

    # A call without an AIU value contributes zero to the OTel-side total.
    otel_aiu = sum(c.aiu or 0.0 for c in a.llm_calls)

    # Rounding each side independently can disagree when two nearly equal values
    # straddle a rounding boundary, so compare by absolute difference instead.
    assert abs(otel_aiu - a.economics.aiu) < 5e-5


@pytest.mark.parametrize("exp", CASES, ids=IDS)
def test_otel_per_call_cache_tokens_reconcile_with_shutdown(exp: Expected) -> None:
    """Per-call cache read/write parsed from OTel ``chat`` spans (dotted
    ``gen_ai.usage.cache_read.input_tokens`` keys) must sum to the shutdown cache totals."""
    analysis = analyze_events(_events(exp.slug), _otel(exp.slug))

    # A call with no cache figure counts as zero on that side.
    read_total = sum(call.cache_read_input_tokens or 0 for call in analysis.llm_calls)
    write_total = sum(call.cache_creation_input_tokens or 0 for call in analysis.llm_calls)

    assert read_total == exp.cache_read_tokens
    assert write_total == exp.cache_write_tokens
Loading