diff --git a/tests/test_analysis_internals.py b/tests/test_analysis_internals.py new file mode 100644 index 0000000..9374664 --- /dev/null +++ b/tests/test_analysis_internals.py @@ -0,0 +1,438 @@ +"""Unit tests for the analysis internals: ATIF-trajectory parsing, the events+OTel merge, and +the OTel attribute/time decoders. + +These use **synthetic, hand-controlled** inputs (not captured artifacts). The ATIF +``trajectory.json`` is produced by this package's own Pier converter, so real trajectories can't +serve as authoritative golden data — instead we drive ``analyze_trajectory`` with crafted inputs +and assert the parsing/aggregation logic directly. Real, externally-produced ``events.jsonl`` / +``copilot-otel.jsonl`` ground truth lives in ``test_real_sessions.py``. +""" + +from __future__ import annotations + +import datetime as _dt + +from copilot_experiments.analysis import ( + _otel_attr_value, + _otel_attrs, + _otel_value, + _parse_otel_time, + analyze_events, + analyze_trajectory, + llm_calls_from_otel, +) + +# --------------------------------------------------------------------------- # +# analyze_trajectory (synthetic ATIF) +# --------------------------------------------------------------------------- # + + +def _agent_step( + *, + ts: str, + message: str = "Tool call", + tools: list[tuple[str, str]] | None = None, + completion_tokens: int | None = None, + failed_call_ids: list[str] | None = None, + model: str = "claude-opus-4.7", +) -> dict: + """Build one ``source="agent"`` ATIF step.""" + tool_calls = [{"tool_call_id": cid, "function_name": name} for name, cid in (tools or [])] + results = [] + for _name, cid in tools or []: + content = '{"code": "failure"}' if failed_call_ids and cid in failed_call_ids else "ok" + results.append({"source_call_id": cid, "content": content}) + step: dict = { + "timestamp": ts, + "source": "agent", + "model_name": model, + "message": message, + "tool_calls": tool_calls, + "observation": {"results": results}, + } + if completion_tokens is not None: + step["metrics"] = {"completion_tokens": completion_tokens} + return step + + +def _trajectory(*, steps: list[dict], final_metrics: dict | None = None) -> dict: + traj: dict = { + "schema_version": "ATIF-v1.7", + "session_id": "sess-xyz", + "agent": {"name": "copilot-cli", "version": "1.0.65", "model_name": "claude-opus-4.7"}, + "steps": steps, + } + if final_metrics is not None: + traj["final_metrics"] = final_metrics + return traj + + +def _full_trajectory() -> dict: + return _trajectory( + steps=[ + {"timestamp": "2026-01-01T00:00:00.000Z", "source": "user", "message": "fix it"}, + _agent_step( + ts="2026-01-01T00:00:01.000Z", + message="reading the file", + tools=[("view", "c1"), ("glob", "c2")], + completion_tokens=40, + ), + _agent_step( + ts="2026-01-01T00:00:03.000Z", + tools=[("bash", "c3")], + completion_tokens=20, + failed_call_ids=["c3"], + ), + _agent_step( + ts="2026-01-01T00:00:05.000Z", + message="done", + tools=[("bash", "c4")], + completion_tokens=10, + ), + ], + final_metrics={ + "total_prompt_tokens": 1000, + "total_completion_tokens": 70, + "total_cached_tokens": 800, + "extra": { + "aiu": 2.5, + "reasoning_tokens": 12, + "peak_context_tokens": 5000, + "summarization_count": 1, + }, + }, + ) + + +def test_trajectory_basic_counts_and_metadata(): + a = analyze_trajectory(_full_trajectory()) + assert a.producer == "ATIF trajectory" + assert a.session_id == "sess-xyz" + assert a.copilot_version == "1.0.65" + assert a.models == ["claude-opus-4.7"] + assert a.n_events == 4 # one user + three agent steps + assert a.n_user_messages == 1 + assert a.n_assistant_messages == 3 + assert a.n_turns == 3 + + +def test_trajectory_tool_histogram_and_failures(): + a = analyze_trajectory(_full_trajectory()) + assert a.n_tool_calls == 4 + assert a.n_tool_failures == 1 + by_name = {t.name: t for t in a.tools} + assert by_name["bash"].calls == 2 + assert by_name["bash"].failures == 1 + assert by_name["view"].calls == 1 + # Sorted by descending calls then name: bash (2) first. + assert a.tools[0].name == "bash" + + +def test_trajectory_final_metrics_drive_economics(): + a = analyze_trajectory(_full_trajectory()) + assert a.input_tokens == 1000 + assert a.output_tokens == 70 + assert a.total_tokens == 1070 + assert a.economics.input_tokens_total == 1000 + assert a.economics.input_tokens_noncached == 1000 + assert a.economics.output_tokens == 70 + assert a.economics.cache_read_tokens == 800 + assert a.economics.aiu == 2.5 + assert a.economics.reasoning_tokens == 12 + assert a.economics.peak_context_tokens == 5000 + assert a.economics.n_compactions == 1 + + +def test_trajectory_duration_from_timestamps(): + a = analyze_trajectory(_full_trajectory()) + assert a.duration_s == 5.0 + assert a.started_at is not None and a.started_at.endswith("Z") + assert a.finished_at is not None and a.finished_at.endswith("Z") + + +def test_trajectory_falls_back_to_summed_output_without_final_completion(): + # final_metrics present but no total_completion_tokens -> sum step completion_tokens. + traj = _trajectory( + steps=[ + {"timestamp": "2026-01-01T00:00:00Z", "source": "user", "message": "go"}, + _agent_step(ts="2026-01-01T00:00:01Z", tools=[("view", "c1")], completion_tokens=15), + _agent_step(ts="2026-01-01T00:00:02Z", tools=[("bash", "c2")], completion_tokens=25), + ], + final_metrics={"total_prompt_tokens": 500}, + ) + a = analyze_trajectory(traj) + assert a.input_tokens == 500 + assert a.output_tokens == 40 # 15 + 25 + assert a.total_tokens == 540 + + +def test_trajectory_without_final_metrics_uses_summed_output(): + traj = _trajectory( + steps=[ + {"timestamp": "2026-01-01T00:00:00Z", "source": "user", "message": "go"}, + _agent_step(ts="2026-01-01T00:00:01Z", tools=[("view", "c1")], completion_tokens=7), + ] + ) + a = analyze_trajectory(traj) + assert a.output_tokens == 7 + assert a.economics.output_tokens == 7 + + +def test_trajectory_permission_denied_marker_counts_as_failure(): + traj = _trajectory( + steps=[ + _agent_step(ts="2026-01-01T00:00:01Z", tools=[("bash", "c1")]), + ] + ) + # Override the observation to use a "permission denied" style marker. + traj["steps"][0]["observation"]["results"][0]["content"] = "Error: Permission denied" + a = analyze_trajectory(traj) + assert a.n_tool_failures == 1 + + +def test_trajectory_tool_without_function_name_is_unknown(): + traj = _trajectory( + steps=[ + { + "timestamp": "2026-01-01T00:00:01Z", + "source": "agent", + "tool_calls": [{"tool_call_id": "c1"}], # no function_name + "observation": {"results": []}, + } + ] + ) + a = analyze_trajectory(traj) + assert a.tools[0].name == "unknown" + assert a.n_tool_calls == 1 + + +def test_trajectory_handles_malformed_input_gracefully(): + # steps not a list; agent not a dict. + a = analyze_trajectory({"agent": "nope", "steps": "nope"}) + assert a.n_events == 0 + assert a.n_turns == 0 + assert a.duration_s is None + # Empty dict. + b = analyze_trajectory({}) + assert b.n_events == 0 + assert b.models == [] + + +def test_trajectory_skips_non_dict_steps(): + traj = _trajectory( + steps=[ + "garbage", # type: ignore[list-item] + {"timestamp": "2026-01-01T00:00:01Z", "source": "user", "message": "go"}, + _agent_step(ts="2026-01-01T00:00:02Z", tools=[("view", "c1")], completion_tokens=5), + ] + ) + a = analyze_trajectory(traj) + assert a.n_user_messages == 1 + assert a.n_assistant_messages == 1 + + +# --------------------------------------------------------------------------- # +# _apply_otel_records merge behavior +# --------------------------------------------------------------------------- # + + +def _otel_chat_span(turn_id: str, *, in_tok: int, out_tok: int, cache_read: int, nano: int) -> dict: + return { + "type": "span", + "name": "chat claude-opus-4.7", + "startTime": [1782573865, 0], + "endTime": [1782573866, 0], + "attributes": { + "gen_ai.operation.name": "chat", + "gen_ai.request.model": "claude-opus-4.7", + "gen_ai.usage.input_tokens": in_tok, + "gen_ai.usage.output_tokens": out_tok, + "gen_ai.usage.cache_read.input_tokens": cache_read, + "github.copilot.nano_aiu": nano, + "github.copilot.turn_id": turn_id, + "github.copilot.server_duration": 900, + }, + } + + +def _events_with_turn(turn_id: str, *, with_shutdown: bool) -> list[dict]: + events = [ + {"type": "session.start", "timestamp": "2026-01-01T00:00:00Z", "data": {}}, + { + "type": "assistant.turn_start", + "timestamp": "2026-01-01T00:00:01Z", + "data": {"turnId": turn_id}, + }, + { + "type": "assistant.message", + "timestamp": "2026-01-01T00:00:01.2Z", + "data": {"model": "claude-opus-4.7", "content": "hi"}, + }, + {"type": "assistant.turn_end", "timestamp": "2026-01-01T00:00:02Z", "data": {}}, + ] + if with_shutdown: + events.append( + { + "type": "session.shutdown", + "timestamp": "2026-01-01T00:00:03Z", + "data": { + "tokenDetails": { + "input": {"tokenCount": 100}, + "cache_read": {"tokenCount": 900}, + "output": {"tokenCount": 50}, + }, + "totalNanoAiu": 1_000_000_000, + }, + } + ) + return events + + +def test_merge_shutdown_stays_authoritative_but_turns_enriched(): + events = _events_with_turn("0", with_shutdown=True) + otel = [_otel_chat_span("0", in_tok=999, out_tok=9, cache_read=888, nano=5_000_000_000)] + a = analyze_events(events, otel) + # Aggregate economics come from shutdown, NOT OTel. + assert a.total_tokens == 100 + 900 + 50 + assert a.economics.aiu == 1.0 + # llm_calls populated from OTel. + assert len(a.llm_calls) == 1 + # The matching turn is enriched with OTel per-call numbers. + turn = {t.turn_id: t for t in a.turns}["0"] + assert turn.input_tokens == 999 + assert turn.cache_read_input_tokens == 888 + assert turn.aiu == 5.0 + assert turn.api_duration_ms == 900 + + +def test_merge_otel_reconstructs_totals_without_shutdown(): + events = _events_with_turn("0", with_shutdown=False) + otel = [ + _otel_chat_span("0", in_tok=500, out_tok=20, cache_read=400, nano=2_000_000_000), + _otel_chat_span("0", in_tok=600, out_tok=30, cache_read=450, nano=3_000_000_000), + ] + a = analyze_events(events, otel) + # No shutdown -> OTel reconstructs aggregate economics. + assert a.input_tokens == 1100 + assert a.output_tokens == 50 + assert a.total_tokens == 1150 + assert a.economics.cache_read_tokens == 850 + assert a.economics.aiu == 5.0 + assert a.economics.n_requests == 2 + assert a.economics.api_duration_ms == 1800 + + +def test_merge_unmatched_turn_id_is_skipped(): + events = _events_with_turn("0", with_shutdown=False) + otel = [_otel_chat_span("99", in_tok=10, out_tok=1, cache_read=0, nano=1_000_000_000)] + a = analyze_events(events, otel) + # No crash; the call is still recorded and totals reconstructed. + assert len(a.llm_calls) == 1 + assert a.input_tokens == 10 + # Turn "0" got no enrichment (its input stays None). + assert {t.turn_id: t for t in a.turns}["0"].input_tokens is None + + +def test_merge_no_otel_calls_leaves_analysis_untouched(): + events = _events_with_turn("0", with_shutdown=True) + a = analyze_events(events, [{"type": "span", "name": "execute_tool view", "attributes": {}}]) + assert a.llm_calls == [] + assert a.total_tokens == 100 + 900 + 50 # still from shutdown + + +# --------------------------------------------------------------------------- # +# OTel attribute / value / time decoders (alternate encodings) +# --------------------------------------------------------------------------- # + + +def test_otel_attrs_from_dict_form(): + assert _otel_attrs({"a": 1, "b": "x"}) == {"a": 1, "b": "x"} + + +def test_otel_attrs_from_otlp_list_form(): + raw = [ + {"key": "gen_ai.usage.input_tokens", "value": {"intValue": "42"}}, + {"key": "model", "value": {"stringValue": "gpt-5.5"}}, + ] + assert _otel_attrs(raw) == {"gen_ai.usage.input_tokens": "42", "model": "gpt-5.5"} + + +def test_otel_attrs_other_returns_empty(): + assert _otel_attrs("nope") == {} + assert _otel_attrs(None) == {} + + +def test_otel_value_typed_scalars(): + assert _otel_value({"stringValue": "s"}) == "s" + assert _otel_value({"intValue": "5"}) == "5" + assert _otel_value({"doubleValue": 1.5}) == 1.5 + assert _otel_value({"boolValue": True}) is True + assert _otel_value(7) == 7 # passthrough non-dict + + +def test_otel_value_array_and_kvlist(): + arr = {"arrayValue": {"values": [{"stringValue": "a"}, {"stringValue": "b"}]}} + assert _otel_value(arr) == ["a", "b"] + kv = {"kvlistValue": {"values": [{"key": "k", "value": {"intValue": "9"}}]}} + assert _otel_value(kv) == {"k": "9"} + + +def test_otel_attr_value_flat_and_nested(): + # Flat dotted key present directly. + assert _otel_attr_value({"a.b.c": 5}, "a.b.c") == 5 + # Nested dict under a dotted prefix. + assert _otel_attr_value({"a.b": {"c": 7}}, "a.b.c") == 7 + # Missing. + assert _otel_attr_value({}, "a.b.c") is None + + +def test_parse_otel_time_iso_string(): + t = _parse_otel_time("2026-01-01T00:00:00Z") + assert t == _dt.datetime(2026, 1, 1, tzinfo=_dt.UTC) + + +def test_parse_otel_time_seconds_nanos_array(): + t = _parse_otel_time([1782573865, 500_000_000]) + assert t is not None + assert t.tzinfo is not None + # 0.5s of nanos -> 500000 microseconds. + assert t.microsecond == 500_000 + + +def test_parse_otel_time_epoch_units(): + base = _dt.datetime.fromtimestamp(1782573865, tz=_dt.UTC) + assert _parse_otel_time(1782573865).replace(microsecond=0) == base # seconds + assert _parse_otel_time(1782573865000).replace(microsecond=0) == base # millis + assert _parse_otel_time(1782573865000000000).replace(microsecond=0) == base # nanos + + +def test_parse_otel_time_invalid(): + assert _parse_otel_time("not-a-time") is None + assert _parse_otel_time(None) is None + + +def test_llm_calls_from_otel_with_otlp_list_attributes(): + span = { + "type": "span", + "name": "chat gpt-5.5", + "startTime": [1782573865, 0], + "endTime": [1782573866, 0], + "attributes": [ + {"key": "gen_ai.operation.name", "value": {"stringValue": "chat"}}, + {"key": "gen_ai.request.model", "value": {"stringValue": "gpt-5.5"}}, + {"key": "gen_ai.usage.input_tokens", "value": {"intValue": "1234"}}, + {"key": "gen_ai.usage.output_tokens", "value": {"intValue": "56"}}, + {"key": "gen_ai.usage.cache_read.input_tokens", "value": {"intValue": "1000"}}, + {"key": "github.copilot.nano_aiu", "value": {"intValue": "1500000000"}}, + {"key": "github.copilot.turn_id", "value": {"stringValue": "0"}}, + ], + } + calls = llm_calls_from_otel([span]) + assert len(calls) == 1 + c = calls[0] + assert c.request_model == "gpt-5.5" + assert c.input_tokens == 1234 + assert c.output_tokens == 56 + assert c.cache_read_input_tokens == 1000 + assert c.aiu == 1.5 + assert c.turn_id == "0" diff --git a/tests/test_render.py b/tests/test_render.py index d94cfa6..36ef38d 100644 --- a/tests/test_render.py +++ b/tests/test_render.py @@ -47,3 +47,99 @@ def test_live_formatter_skips_noise_but_surfaces_errors(): def test_live_formatter_falls_back_to_raw_for_non_json(): fmt = LiveEventFormatter() assert fmt.format("not json at all") == "not json at all" + + +# --------------------------------------------------------------------------- # +# render_session_analysis: full-overview rendering incl. the multi-model table +# --------------------------------------------------------------------------- # + + +def _render_to_text(analysis) -> str: + from io import StringIO + + from rich.console import Console + + from copilot_experiments.render import render_session_analysis + + buf = StringIO() + console = Console(file=buf, width=200, force_terminal=False, color_system=None) + render_session_analysis(analysis, console, title="Test") + return buf.getvalue() + + +def test_render_session_analysis_includes_per_model_table_when_multimodel(): + from copilot_experiments.models import ModelMetric, SessionAnalysis, TokenEconomics, TurnSummary + + econ = TokenEconomics( + input_tokens_noncached=100, + cache_read_tokens=900, + cache_write_tokens=0, + output_tokens=50, + input_tokens_total=1000, + total_tokens=1050, + aiu=2.5, + n_requests=3, + api_duration_ms=1500, + aiu_by_type={"input": 1.0, "cache_read": 0.5, "cache_write": 0.0, "output": 1.0}, + model_metrics=[ + ModelMetric(model="claude-opus-4.7", requests=2, input_tokens=600, output_tokens=30), + ModelMetric(model="gpt-5.5", requests=1, input_tokens=400, output_tokens=20), + ], + ) + analysis = SessionAnalysis( + session_id="s1", + models=["claude-opus-4.7", "gpt-5.5"], + n_turns=1, + turns=[TurnSummary(turn_no=1, started_at="2026-01-01T00:00:00Z")], + economics=econ, + ) + out = _render_to_text(analysis) + assert "Per model" in out + assert "claude-opus-4.7" in out + assert "gpt-5.5" in out + assert "Cost (AIU)" in out + + +def test_render_session_analysis_minimal_without_economics(): + from copilot_experiments.models import SessionAnalysis, TurnSummary + + analysis = SessionAnalysis( + session_id="s2", + models=["gpt-5.5"], + n_turns=1, + turns=[TurnSummary(turn_no=1, started_at="2026-01-01T00:00:00Z")], + warnings=["heads up"], + ) + out = _render_to_text(analysis) + # No shutdown economics -> the Cost table is omitted, but warnings still render. + assert "Cost (AIU)" not in out + assert "heads up" in out + + +# --------------------------------------------------------------------------- # +# LiveEventFormatter: remaining event branches +# --------------------------------------------------------------------------- # + + +def test_live_formatter_session_end_and_turn_end(): + fmt = LiveEventFormatter() + # turn_end before any turn -> nothing to show. + assert fmt.format(_ev("assistant.turn_end")) is None + fmt.format(_ev("assistant.turn_start")) + assert fmt.format(_ev("assistant.turn_end")) == "[turn 1] end" + assert fmt.format(_ev("session.end")) == "[session] end" + + +def test_live_formatter_empty_assistant_message_is_skipped(): + fmt = LiveEventFormatter() + assert fmt.format(_ev("assistant.message", content="")) is None + + +def test_live_formatter_non_dict_event_returns_raw(): + fmt = LiveEventFormatter() + assert fmt.format("[1, 2, 3]") == "[1, 2, 3]" + + +def test_live_formatter_warning_branch(): + fmt = LiveEventFormatter() + assert fmt.format(_ev("session.warning", message="careful")) == "[warn] careful" diff --git a/tests/test_sessionlog.py b/tests/test_sessionlog.py index 441b0fd..ea49b1b 100644 --- a/tests/test_sessionlog.py +++ b/tests/test_sessionlog.py @@ -54,3 +54,70 @@ def test_parse_metrics_empty(): m = parse_metrics([]) assert m.n_turns == 0 assert m.duration_s is None + + +# --------------------------------------------------------------------------- # +# load_events / copy_events filesystem helpers +# --------------------------------------------------------------------------- # + + +def test_load_events_skips_blank_and_invalid_lines(tmp_path): + from copilot_experiments.sessionlog import load_events + + path = tmp_path / "events.jsonl" + path.write_text( + '{"type": "session.start"}\n\n \nnot-json\n{"type": "user.message"}\n', + encoding="utf-8", + ) + events = load_events(path) + assert [e["type"] for e in events] == ["session.start", "user.message"] + + +def test_load_events_missing_file_returns_empty(tmp_path): + from copilot_experiments.sessionlog import load_events + + assert load_events(tmp_path / "nope.jsonl") == [] + + +def test_copy_events_roundtrip_with_base(tmp_path): + from copilot_experiments.sessionlog import copy_events, events_path + + base = tmp_path / "state" + src = events_path("sess-1", base) + src.parent.mkdir(parents=True) + src.write_text('{"type": "session.start"}\n', encoding="utf-8") + + dest = tmp_path / "out" / "events.jsonl" + assert copy_events("sess-1", dest, base) is True + assert dest.read_text(encoding="utf-8") == '{"type": "session.start"}\n' + + +def test_copy_events_missing_source_returns_false(tmp_path): + from copilot_experiments.sessionlog import copy_events + + dest = tmp_path / "out" / "events.jsonl" + assert copy_events("absent", dest, tmp_path / "state") is False + assert not dest.exists() + + +# --------------------------------------------------------------------------- # +# extract_economics: compaction_start peak tracking (no shutdown) +# --------------------------------------------------------------------------- # + + +def test_compaction_start_tracks_peak_without_shutdown(): + from copilot_experiments.sessionlog import extract_economics + + events = [ + { + "type": "session.compaction_start", + "data": { + "systemTokens": 1000, + "conversationTokens": 8000, + "toolDefinitionsTokens": 500, + }, + }, + ] + econ = extract_economics(events) + assert econ.total_tokens is None # no shutdown -> no authoritative totals + assert econ.peak_context_tokens == 9500 # 1000 + 8000 + 500