# CopilotKit/showcase/integrations/langroid/tests/python/test_generate_a2ui.py
"""Unit tests for langroid's A2UI planner.

Sibling tests to ``showcase/integrations/google-adk/tests/python/test_generate_a2ui.py``
and ``showcase/integrations/strands/tests/python/test_generate_a2ui.py``. Covers:

- Provider-agnostic LLM routing through langroid's ``OpenAIGPT`` abstraction
  (which despite the name handles OpenAI / Anthropic / Gemini / any
  ``provider/model`` chat-model string).
- ``A2UI_MODEL`` env override takes precedence over ``LANGROID_MODEL``.
- Structured error surface (``_A2uiError``) for every failure branch:
    - LLM call raises (transport / auth / rate-limit)
    - response contains no tool call
    - response tool-call arguments are malformed JSON
- Happy path: valid tool call args → ``build_a2ui_operations_from_tool_call``
- Programmer errors (``AttributeError``, ``TypeError``, ``ImportError``,
  ``NameError``, ``AssertionError``, ``NotImplementedError``,
  ``ModuleNotFoundError``, ``pydantic.ValidationError``) propagate — not
  silently masked as LLM errors. Conversely ``KeyError`` / ``IndexError`` /
  ``RecursionError`` / ``MemoryError`` / ``LookupError`` are NO LONGER
  re-raised; they wrap into the structured ``a2ui_llm_error`` surface.
- Construction must not require OpenAI-specific env when a non-OpenAI
  ``LANGROID_MODEL`` is configured (provider-agnostic routing).
- Memoization: the A2UI planner LLM is built once per resolved model string.
- Structured warning / error log output on the module logger for every
  degraded / drift path (with message substring assertions).

Mocks live at the langroid-LLM layer (``lm.OpenAIGPT``) rather than at any
provider SDK layer — the whole point of the provider-agnostic fix is that
the A2UI planner no longer speaks to any provider SDK directly.
"""

from __future__ import annotations

import ast
import inspect
import json
import logging
import os
import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
from types import SimpleNamespace
from typing import Any
from unittest.mock import MagicMock, patch

import pytest

from agents.agent import (
    _A2uiError,
    _A2uiErrorKind,
    _RENDER_A2UI_FUNCTION_SPEC,
    _a2ui_error,
    _get_a2ui_llm,
    _resolve_a2ui_model,
    _ToolErrorKind,
    generate_a2ui_via_llm,
    create_agent,
    ALL_TOOLS,
    BACKEND_TOOLS,
    FRONTEND_TOOLS,
    ChangeBackgroundTool,
    GenerateA2UITool,
    GenerateHaikuTool,
    GetSalesTodosTool,
    GetWeatherTool,
    ManageSalesTodosTool,
    QueryDataTool,
    ScheduleMeetingTool,
    SearchFlightsTool,
)
from langroid.agent.tool_message import ToolMessage


# ---------------------------------------------------------------------------
# Fakes / helpers
# ---------------------------------------------------------------------------


@dataclass
class _FakeFunction:
    """Typo-safe stand-in for a langroid tool-call ``function`` attribute.

    Dataclass (rather than ``SimpleNamespace``) so typos in field names blow
    up at construction rather than silently producing a shape that looks
    right but with a missing attribute.
    """

    # Name of the called tool; defaults to the planner's ``render_a2ui`` tool.
    name: str = "render_a2ui"
    # Raw tool-call arguments. Tests in this module pass a dict, a JSON
    # string, a non-JSON string, and a list through this field.
    arguments: Any = None


@dataclass
class _FakeOaiToolCall:
    """Typo-safe stand-in for a langroid ``OaiToolCall``."""

    # Identifier of the tool call; the default mirrors ``_oai_tool_call``'s default.
    id: str = "call-1"
    # The wrapped function payload (name + arguments), or ``None`` when unset.
    function: _FakeFunction | None = None


@dataclass
class _FakeFunctionCall:
    """Typo-safe stand-in for a legacy ``LLMFunctionCall``."""

    # Name of the called function; defaults to the planner's ``render_a2ui`` tool.
    name: str = "render_a2ui"
    # Raw arguments attached to the legacy function call (passed through unchanged).
    arguments: Any = None


@dataclass
class _FakeLLMResponse:
    """Typo-safe stand-in for langroid's ``LLMResponse``.

    The planner only reads ``.oai_tool_calls`` and ``.function_call`` so those
    are the only fields we model. Using a dataclass guards against silently
    adding unused attrs via typo.
    """

    # Free-text message; the helpers in this module always leave it empty.
    message: str = ""
    # Modern tool-call slot: a list of fake tool calls, an empty list, or ``None``.
    oai_tool_calls: list | None = None
    # Legacy function-call slot used by the ``function_call`` code path tests.
    function_call: _FakeFunctionCall | None = None


def _llm_response(*, tool_calls=None, function_call=None) -> _FakeLLMResponse:
    """Assemble a fake ``LLMResponse``-shaped object for the planner to read.

    ``tool_calls`` lands in the ``oai_tool_calls`` slot and ``function_call``
    in the legacy ``function_call`` slot; the text ``message`` is always
    empty. Both arguments are keyword-only and forwarded unchanged.
    """
    slots = {
        "message": "",
        "oai_tool_calls": tool_calls,
        "function_call": function_call,
    }
    return _FakeLLMResponse(**slots)


def _oai_tool_call(*, arguments, call_id: str = "call-1") -> _FakeOaiToolCall:
    """Create a fake ``OpenAIToolCall`` targeting the ``render_a2ui`` tool.

    ``arguments`` is stored exactly as given — a dict or a JSON string are
    both legitimate, and the tests below exercise each form. ``call_id``
    becomes the fake call's ``id``.
    """
    render_fn = _FakeFunction(name="render_a2ui", arguments=arguments)
    return _FakeOaiToolCall(id=call_id, function=render_fn)


def _function_call(*, arguments) -> _FakeFunctionCall:
    """Create a fake legacy ``LLMFunctionCall`` for the ``render_a2ui`` tool.

    ``arguments`` is attached to the fake call unchanged.
    """
    legacy_call = _FakeFunctionCall(name="render_a2ui", arguments=arguments)
    return legacy_call


@pytest.fixture(autouse=True)
def _reset_llm_cache():
    """Empty the memoized A2UI LLM cache before and after every test.

    Clearing on both sides of the ``yield`` means each test starts without a
    cached LLM (so patched factories are honored) and leaves none behind.
    """
    clear_cache = _get_a2ui_llm.cache_clear
    clear_cache()  # setup: start from an empty cache
    yield
    clear_cache()  # teardown: do not leak this test's LLM into the next


@pytest.fixture(autouse=True)
def _clean_env(monkeypatch):
    """Remove model-selection and provider-credential env vars for each test.

    Unsets ``A2UI_MODEL`` / ``LANGROID_MODEL`` / ``OPENAI_*`` and every
    provider API key so one test's environment cannot leak into another. The
    key set matches what ``_expected_key_for_model`` handles across sibling
    adapters; keeping them all unset means a provider-routing regression
    cannot be hidden by a stray key in the developer's shell.
    """
    model_selectors = ("A2UI_MODEL", "LANGROID_MODEL")
    openai_settings = ("OPENAI_API_KEY", "OPENAI_BASE_URL")
    provider_keys = (
        "ANTHROPIC_API_KEY",
        "GEMINI_API_KEY",
        "OPENROUTER_API_KEY",
        "GROQ_API_KEY",
        "DEEPSEEK_API_KEY",
        "CEREBRAS_API_KEY",
        "GLHF_API_KEY",
        "MINIMAX_API_KEY",
        "PORTKEY_API_KEY",
    )
    for env_name in model_selectors + openai_settings + provider_keys:
        # raising=False: a variable that is already absent is not an error.
        monkeypatch.delenv(env_name, raising=False)
    yield


# ---------------------------------------------------------------------------
# _A2uiErrorKind enum identity — pins the error-code contract
# ---------------------------------------------------------------------------


def test_a2ui_error_kind_values_pinned():
    """Pin every ``_A2uiErrorKind`` member to its wire string.

    The ``.value`` strings are the contract shared with the frontend renderer
    and the sibling ``google-adk`` / ``strands`` adapters, so a rename is a
    cross-sibling breaking change. Checking both the individual members and
    the complete value set catches renames, additions and removals at
    unit-test time instead of in production.
    """
    pinned = {
        "LLM_ERROR": "a2ui_llm_error",
        "NO_TOOL_CALL": "a2ui_no_tool_call",
        "INVALID_ARGUMENTS": "a2ui_invalid_arguments",
    }
    for member_name, wire_value in pinned.items():
        assert getattr(_A2uiErrorKind, member_name).value == wire_value
    # No extra or missing members beyond the pinned ones.
    assert {kind.value for kind in _A2uiErrorKind} == set(pinned.values())


# ---------------------------------------------------------------------------
# _ToolErrorKind enum identity — pins the backend-tool error-code contract
# ---------------------------------------------------------------------------


def test_tool_error_kind_values_pinned():
    """Pin every ``_ToolErrorKind`` member to its ``<tool>_failed`` string.

    These values are the ``{"error": "<tool>_failed"}`` codes the outer LLM
    consumes when a backend tool handler wraps an impl exception. They match
    the historical bare-string codes, so a rename is a cross-language
    breaking change (the strings appear in prompt-engineered retry logic
    elsewhere in the product). The complete set is pinned so a typo
    regression (``"get_wether_failed"``) or an accidental addition / removal
    fails here.
    """
    pinned = {
        "GET_WEATHER_FAILED": "get_weather_failed",
        "QUERY_DATA_FAILED": "query_data_failed",
        "MANAGE_SALES_TODOS_FAILED": "manage_sales_todos_failed",
        "GET_SALES_TODOS_FAILED": "get_sales_todos_failed",
        "SCHEDULE_MEETING_FAILED": "schedule_meeting_failed",
        "SEARCH_FLIGHTS_FAILED": "search_flights_failed",
    }
    for member_name, wire_value in pinned.items():
        assert getattr(_ToolErrorKind, member_name).value == wire_value
    # No extra or missing members beyond the pinned ones.
    assert {kind.value for kind in _ToolErrorKind} == set(pinned.values())


# ---------------------------------------------------------------------------
# _a2ui_error contract
# ---------------------------------------------------------------------------


def test_a2ui_error_accepts_full_shape():
    """A fully populated call yields exactly the three-key error dict, with
    the enum member rendered as its string value."""
    expected = {
        "error": "a2ui_llm_error",
        "message": "m",
        "remediation": "r",
    }
    built = _a2ui_error(
        error=_A2uiErrorKind.LLM_ERROR,
        message="m",
        remediation="r",
    )
    assert built == expected


def test_a2ui_error_rejects_empty_values():
    """An empty ``message`` or ``remediation`` must raise ``ValueError`` at
    construction time instead of silently yielding a malformed error surface.

    ``error`` is an enum member, so an empty ``error`` value could only be
    produced by subverting the enum — not a supported use case and not
    covered here.
    """
    empty_field_cases = (
        {"message": "", "remediation": "r"},
        {"message": "m", "remediation": ""},
    )
    for text_fields in empty_field_cases:
        with pytest.raises(ValueError):
            _a2ui_error(error=_A2uiErrorKind.LLM_ERROR, **text_fields)


def test_a2ui_error_rejects_non_string_message():
    """Non-``str`` values for ``message`` or ``remediation`` raise ``ValueError``.

    The TypedDict annotation says ``str``; the factory must enforce that at
    runtime. A caller accidentally slipping a list/dict/int into one of the
    text fields would break the frontend's error renderer.
    """
    # Each ``type: ignore`` sits on the line of the deliberately ill-typed
    # argument, which is where the type checker reports ``arg-type``.
    with pytest.raises(ValueError):
        _a2ui_error(
            error=_A2uiErrorKind.LLM_ERROR,
            message=123,  # type: ignore[arg-type]
            remediation="r",
        )
    with pytest.raises(ValueError):
        _a2ui_error(
            error=_A2uiErrorKind.LLM_ERROR,
            message="m",
            remediation=["r"],  # type: ignore[arg-type]
        )


def _assert_full_error_shape(result: dict) -> None:
    """Assert that ``result`` is a well-formed structured error dict.

    A well-formed error is a ``dict`` whose keys are exactly ``error``,
    ``message`` and ``remediation``, each mapped to a non-empty ``str``.
    Extra keys are rejected so regressions that leak tracebacks, stderr or
    secret material into the error surface are caught.

    Raises:
        AssertionError: if any of the above does not hold.
    """
    required = ("error", "message", "remediation")
    assert isinstance(result, dict), f"expected dict, got {type(result).__name__}"
    # Exact key-set match: nothing missing and nothing extra.
    present = set(result.keys())
    assert present == set(required), (
        f"unexpected keys in error result: {sorted(result.keys())}"
    )
    for field_name in required:
        value = result[field_name]
        assert isinstance(value, str) and value, (
            f"'{field_name}' must be non-empty str; got {value!r}"
        )


# ---------------------------------------------------------------------------
# Happy path
# ---------------------------------------------------------------------------


def test_generate_a2ui_happy_path_returns_operations():
    """A valid ``oai_tool_calls`` response is converted into A2UI operations.

    The response must flow through ``build_a2ui_operations_from_tool_call``
    and come back as an operations dict. The full op shape is pinned —
    surfaceId / catalogId / components / data — so swapped args or a dropped
    data-update op is caught. The forced-function-call kwargs and the message
    wiring sent to the LLM are pinned as well.
    """
    planner_args = {
        "surfaceId": "dynamic-surface",
        "catalogId": "copilotkit://app-dashboard-catalog",
        "components": [{"id": "root", "type": "Container"}],
        "data": {"greeting": "hi"},
    }
    planner_llm = MagicMock()
    planner_llm.chat.return_value = _llm_response(
        tool_calls=[_oai_tool_call(arguments=planner_args)]
    )
    with patch("agents.agent._get_a2ui_llm", return_value=planner_llm):
        outcome = generate_a2ui_via_llm(context="test context")

    # The planner must force the render_a2ui function call, exactly once.
    planner_llm.chat.assert_called_once()
    sent_kwargs = planner_llm.chat.call_args.kwargs
    assert sent_kwargs["functions"] == [_RENDER_A2UI_FUNCTION_SPEC]
    assert sent_kwargs["function_call"] == {"name": "render_a2ui"}

    # Two-message priming (system + user); the system message carries the
    # caller's ``context`` verbatim. Dropping either message would still
    # forward the kwargs above but produce a broken prompt.
    sent_messages = sent_kwargs["messages"]
    assert len(sent_messages) == 2, (
        f"expected system+user messages, got {len(sent_messages)}"
    )
    system_message = sent_messages[0]
    assert system_message.content == "test context", (
        f"system message must carry caller's context verbatim; got "
        f"{system_message.content!r}"
    )

    # Operation list: create surface, then components, then data model.
    assert "a2ui_operations" in outcome, f"unexpected shape: {outcome!r}"
    ops = outcome["a2ui_operations"]
    assert len(ops) == 3
    create_op, components_op, data_op = ops

    assert create_op["type"] == "create_surface"
    assert create_op["surfaceId"] == "dynamic-surface"
    assert create_op["catalogId"] == "copilotkit://app-dashboard-catalog"

    assert components_op["type"] == "update_components"
    assert components_op["components"] == [{"id": "root", "type": "Container"}]

    assert data_op["type"] == "update_data_model"
    assert data_op["data"] == {"greeting": "hi"}


def test_generate_a2ui_happy_path_json_string_arguments_also_work():
    """Tool-call ``arguments`` delivered as a JSON string are parsed and used.

    Some langroid backends return ``arguments`` as a JSON string rather than
    a pre-parsed dict. The round trip is asserted — ``surfaceId`` from the
    JSON text must reach ``a2ui_operations[0].surfaceId`` — so a regression
    that parses the string but then discards the result is caught. Also pins
    the default system prompt used when ``context`` is empty.
    """
    encoded_args = json.dumps(
        {
            "surfaceId": "s1",
            "catalogId": "copilotkit://app-dashboard-catalog",
            "components": [{"id": "root", "type": "Container"}],
        }
    )
    planner_llm = MagicMock()
    planner_llm.chat.return_value = _llm_response(
        tool_calls=[_oai_tool_call(arguments=encoded_args)]
    )
    with patch("agents.agent._get_a2ui_llm", return_value=planner_llm):
        outcome = generate_a2ui_via_llm(context="")

    assert "a2ui_operations" in outcome
    ops = outcome["a2ui_operations"]
    assert ops[0]["surfaceId"] == "s1"
    # The args carry no ``data`` key, so no update_data_model op: 2 ops total.
    assert len(ops) == 2, (
        f"expected 2 ops when args has no 'data' key; got "
        f"{len(ops)}"
    )
    # An empty context must fall back to the default system prompt.
    system_message = planner_llm.chat.call_args.kwargs["messages"][0]
    assert system_message.content == "Generate a useful dashboard UI.", (
        f"empty context must trigger the default system prompt; got "
        f"{system_message.content!r}"
    )


def test_generate_a2ui_legacy_function_call_path():
    """The legacy ``function_call`` slot is honored when ``oai_tool_calls`` is unset.

    Older / alternate providers surface the forced tool call via
    ``function_call`` instead of ``oai_tool_calls``; both shapes must work.
    ``surfaceId`` is pinned to prove the LEGACY slot's args were the ones
    consumed: reading the empty modern slot would fall through to
    ``a2ui_no_tool_call``, and a subtler regression could read the wrong
    slot's args.
    """
    legacy_args = {
        "surfaceId": "legacy-surface",
        "catalogId": "copilotkit://app-dashboard-catalog",
        "components": [{"id": "root", "type": "Container"}],
    }
    planner_llm = MagicMock()
    planner_llm.chat.return_value = _llm_response(
        tool_calls=None,
        function_call=_function_call(arguments=legacy_args),
    )
    with patch("agents.agent._get_a2ui_llm", return_value=planner_llm):
        outcome = generate_a2ui_via_llm(context="")

    assert "a2ui_operations" in outcome
    first_op = outcome["a2ui_operations"][0]
    assert first_op["surfaceId"] == "legacy-surface"


# ---------------------------------------------------------------------------
# Error branches
# ---------------------------------------------------------------------------


def test_generate_a2ui_llm_exception_returns_full_error_shape(caplog):
    """A runtime failure in ``llm.chat(...)`` becomes a structured ``a2ui_llm_error``.

    All three keys must be populated and the module logger must emit an
    ERROR-level record. The remediation text is pinned too: the point of the
    provider-agnostic fix is that it names ``LANGROID_MODEL`` /
    ``A2UI_MODEL`` rather than OpenAI-specific env variables.
    """
    failing_llm = MagicMock()
    failing_llm.chat.side_effect = ConnectionError("backend unreachable")
    with (
        patch("agents.agent._get_a2ui_llm", return_value=failing_llm),
        caplog.at_level(logging.ERROR, logger="agents.agent"),
    ):
        outcome = generate_a2ui_via_llm(context="")

    _assert_full_error_shape(outcome)
    assert outcome["error"] == "a2ui_llm_error"

    # The message is formatted as "...: ClassName: detail"; BOTH parts are
    # required so that a class-name-only message does not pass.
    message = outcome["message"]
    assert "ConnectionError" in message
    assert "backend unreachable" in message

    # Provider-agnostic env vars must be named; guards against a return of
    # "set OPENAI_API_KEY" phrasing.
    remediation = outcome["remediation"]
    assert "LANGROID_MODEL" in remediation
    assert "A2UI_MODEL" in remediation

    # An ERROR record on the module logger must contain the substring from
    # the source's ``logger.exception(...)`` call.
    failure_logged = any(
        record.levelno >= logging.ERROR
        and record.name == "agents.agent"
        and "LLM call failed" in record.getMessage()
        for record in caplog.records
    )
    assert failure_logged, (
        f"expected ERROR-level log mentioning 'LLM call failed'; got "
        f"{[(r.name, r.levelname, r.getMessage()) for r in caplog.records]}"
    )


def test_generate_a2ui_llm_exception_message_is_truncated_to_200_chars():
    """Exception text embedded in the error message is capped at 200 chars.

    The source truncates ``str(exc)`` to bound the blast radius should a
    future provider SDK regression embed credentials or huge stack state in
    exception text. This guards that truncation against regressions.
    """
    oversized_detail = "X" * 5000
    failing_llm = MagicMock()
    failing_llm.chat.side_effect = ConnectionError(oversized_detail)
    with patch("agents.agent._get_a2ui_llm", return_value=failing_llm):
        outcome = generate_a2ui_via_llm(context="")

    _assert_full_error_shape(outcome)
    message = outcome["message"]
    # Expected layout: "Secondary A2UI LLM call failed: ConnectionError: <detail>"
    expected_prefix = "Secondary A2UI LLM call failed: ConnectionError: "
    assert message.startswith(expected_prefix)
    detail = message.removeprefix(expected_prefix)
    # Exact match on the first 200 chars, not merely "<= 200": a regression
    # that cut the detail to, say, 50 chars must fail as well.
    assert detail == "X" * 200, (
        f"detail must be exactly 200 'X's from str(exc)[:200]; got {len(detail)} chars"
    )
    # The whole message must be nowhere near the original 5000 chars.
    assert len(message) < 500


def test_generate_a2ui_llm_construction_failure_returns_full_error_shape():
    """A failure inside ``_get_a2ui_llm`` surfaces as a structured tool result.

    Example trigger: a provider-specific API key missing at construction
    time. The error must not propagate as an uncaught exception.
    """
    construction_error = ValueError("no API key for provider X")
    # An exception instance as ``side_effect`` is raised when the mock is called.
    with patch("agents.agent._get_a2ui_llm", side_effect=construction_error):
        outcome = generate_a2ui_via_llm(context="")

    _assert_full_error_shape(outcome)
    assert outcome["error"] == "a2ui_llm_error"
    # Both the class name and the original detail must appear, so a
    # regression that drops ``str(exc)`` and keeps only the class name fails.
    message = outcome["message"]
    assert "ValueError" in message
    assert "no API key" in message


def test_generate_a2ui_no_tool_call_returns_full_error_shape():
    """A response with neither tool-call slot populated → ``a2ui_no_tool_call``."""
    silent_response = _llm_response(tool_calls=None, function_call=None)
    planner_llm = MagicMock()
    planner_llm.chat.return_value = silent_response
    with patch("agents.agent._get_a2ui_llm", return_value=planner_llm):
        outcome = generate_a2ui_via_llm(context="")

    _assert_full_error_shape(outcome)
    assert outcome["error"] == "a2ui_no_tool_call"


def test_generate_a2ui_empty_tool_calls_returns_full_error_shape():
    """An empty ``oai_tool_calls`` list is treated as no tool call at all
    → ``a2ui_no_tool_call``."""
    empty_response = _llm_response(tool_calls=[], function_call=None)
    planner_llm = MagicMock()
    planner_llm.chat.return_value = empty_response
    with patch("agents.agent._get_a2ui_llm", return_value=planner_llm):
        outcome = generate_a2ui_via_llm(context="")

    _assert_full_error_shape(outcome)
    assert outcome["error"] == "a2ui_no_tool_call"


def test_generate_a2ui_invalid_arguments_returns_full_error_shape():
    """String arguments that do not parse as JSON → ``a2ui_invalid_arguments``."""
    malformed_call = _oai_tool_call(arguments="not json {{{")
    planner_llm = MagicMock()
    planner_llm.chat.return_value = _llm_response(tool_calls=[malformed_call])
    with patch("agents.agent._get_a2ui_llm", return_value=planner_llm):
        outcome = generate_a2ui_via_llm(context="")

    _assert_full_error_shape(outcome)
    assert outcome["error"] == "a2ui_invalid_arguments"


def test_generate_a2ui_non_dict_arguments_returns_full_error_shape():
    """Arguments that are not a dict (here, a list) → ``a2ui_invalid_arguments``.

    ``build_a2ui_operations_from_tool_call`` expects a dict, and the
    resulting ``TypeError`` must not be allowed to escape.
    """
    list_valued_call = _oai_tool_call(arguments=[1, 2, 3])
    planner_llm = MagicMock()
    planner_llm.chat.return_value = _llm_response(tool_calls=[list_valued_call])
    with patch("agents.agent._get_a2ui_llm", return_value=planner_llm):
        outcome = generate_a2ui_via_llm(context="")

    _assert_full_error_shape(outcome)
    assert outcome["error"] == "a2ui_invalid_arguments"


@pytest.mark.parametrize(
    "exc_cls,exc_args",
    [
        (KeyError, ("missing surfaceId",)),
        (TypeError, ("nope",)),
        (ValueError, ("bad value",)),
    ],
)
def test_build_a2ui_operations_wrapper_catches_expected_errors(exc_cls, exc_args):
    """Builder failures on malformed args wrap into ``a2ui_invalid_arguments``.

    When ``build_a2ui_operations_from_tool_call`` raises one of the three
    expected classes (``KeyError`` / ``TypeError`` / ``ValueError``), the
    planner must return the structured error instead of propagating. The
    cases are parametrized so the three near-identical bodies cannot drift
    apart on a refactor.
    """
    wellformed_args = {
        "surfaceId": "s",
        "catalogId": "copilotkit://app-dashboard-catalog",
        "components": [{"id": "root", "type": "Container"}],
    }
    planner_llm = MagicMock()
    planner_llm.chat.return_value = _llm_response(
        tool_calls=[_oai_tool_call(arguments=wellformed_args)]
    )
    builder_failure = exc_cls(*exc_args)
    with patch("agents.agent._get_a2ui_llm", return_value=planner_llm):
        # The LLM response is fine; only the builder is made to fail.
        with patch(
            "agents.agent.build_a2ui_operations_from_tool_call",
            side_effect=builder_failure,
        ):
            outcome = generate_a2ui_via_llm(context="")

    _assert_full_error_shape(outcome)
    assert outcome["error"] == "a2ui_invalid_arguments"


def test_build_a2ui_operations_boundary_validates_return_shape():
    """A builder result lacking ``a2ui_operations`` is rejected at the boundary.

    ``build_a2ui_operations_from_tool_call`` could drift (e.g. an upstream
    schema change) and return a dict without the ``a2ui_operations`` key even
    for valid input. The planner MUST validate the return shape and surface a
    structured ``a2ui_invalid_arguments`` instead of handing the malformed
    dict to the frontend.
    """
    wellformed_args = {
        "surfaceId": "s",
        "catalogId": "copilotkit://app-dashboard-catalog",
        "components": [{"id": "root", "type": "Container"}],
    }
    planner_llm = MagicMock()
    planner_llm.chat.return_value = _llm_response(
        tool_calls=[_oai_tool_call(arguments=wellformed_args)]
    )
    # Simulated drift: the builder's result has no "a2ui_operations" key.
    drifted_result = {"unexpected_key": "foo"}
    with patch("agents.agent._get_a2ui_llm", return_value=planner_llm):
        with patch(
            "agents.agent.build_a2ui_operations_from_tool_call",
            return_value=drifted_result,
        ):
            outcome = generate_a2ui_via_llm(context="")

    _assert_full_error_shape(outcome)
    assert outcome["error"] == "a2ui_invalid_arguments"


# ---------------------------------------------------------------------------
# Programmer errors MUST propagate — not be silently swallowed.
# The narrow re-raise tuple is (AttributeError, TypeError, NameError,
# ImportError, ModuleNotFoundError, AssertionError, NotImplementedError,
# pydantic.ValidationError).
# ---------------------------------------------------------------------------


@pytest.mark.parametrize(
    "exc_cls,exc_args",
    [
        (AttributeError, ("typo",)),
        (TypeError, ("bad kwargs",)),
        (NameError, ("unknown name",)),
        (ImportError, ("bad import",)),
        (ModuleNotFoundError, ("no module",)),
        (AssertionError, ("assertion",)),
        (NotImplementedError, ("todo",)),
    ],
)
def test_generate_a2ui_lets_programmer_errors_propagate(exc_cls, exc_args):
    """Programmer-error exception classes escape ``generate_a2ui_via_llm``.

    They must not be wrapped as ``a2ui_llm_error``: genuine bugs have to stay
    visible in tests and server logs rather than being silently masked.
    """
    buggy_llm = MagicMock()
    buggy_llm.chat.side_effect = exc_cls(*exc_args)
    with (
        patch("agents.agent._get_a2ui_llm", return_value=buggy_llm),
        pytest.raises(exc_cls),
    ):
        generate_a2ui_via_llm(context="")


def test_generate_a2ui_propagates_pydantic_validation_error():
    """``pydantic.ValidationError`` from the LLM call propagates unwrapped.

    A validation error signals a schema bug (the planner's response could not
    be validated against the expected model), not a transport failure. The
    remediation for a schema bug is not "verify provider credentials", so it
    must not be converted into ``a2ui_llm_error``.
    """
    from pydantic import BaseModel, ValidationError

    class _Dummy(BaseModel):
        x: int

    # Provoke a genuine ValidationError to obtain a legitimate instance;
    # building one by hand is awkward across pydantic versions.
    captured_error = None
    try:
        _Dummy(x="not-an-int")  # type: ignore[arg-type]
    except ValidationError as exc:
        captured_error = exc
    else:  # pragma: no cover - defensive
        pytest.fail("expected pydantic to raise ValidationError")

    schema_broken_llm = MagicMock()
    schema_broken_llm.chat.side_effect = captured_error
    with (
        patch("agents.agent._get_a2ui_llm", return_value=schema_broken_llm),
        pytest.raises(ValidationError),
    ):
        generate_a2ui_via_llm(context="")


@pytest.mark.parametrize(
    "exc_cls,exc_args",
    [
        (KeyError, ("missing",)),
        (IndexError, ("out of range",)),
        (RecursionError, ("too deep",)),
        (MemoryError, ()),
        (LookupError, ("lookup",)),
    ],
)
def test_generate_a2ui_wraps_recoverable_errors_into_llm_error(exc_cls, exc_args):
    """Recoverable SDK/adapter errors are wrapped as ``a2ui_llm_error``.

    ``KeyError`` / ``IndexError`` / ``LookupError`` / ``RecursionError`` /
    ``MemoryError`` are raised by SDK/adapter code as recoverable conditions
    on malformed provider payloads. They used to propagate; with the narrowed
    re-raise tuple they now take the transport-error path, giving callers the
    structured error with the "retry / verify provider" remediation instead
    of an uncaught 500.
    """
    flaky_llm = MagicMock()
    flaky_llm.chat.side_effect = exc_cls(*exc_args)
    with patch("agents.agent._get_a2ui_llm", return_value=flaky_llm):
        outcome = generate_a2ui_via_llm(context="")

    _assert_full_error_shape(outcome)
    assert outcome["error"] == "a2ui_llm_error"


def test_generate_a2ui_memory_error_without_args_produces_classname_only_message():
    """An argument-less ``MemoryError()`` stringifies to ``""``, so the
    error message must end at the class name.

    With no detail text available, the expected message is exactly
    ``"Secondary A2UI LLM call failed: MemoryError"`` — no trailing
    ``": "`` separator. The exact-match assertion catches a regression that
    unconditionally appends the separator and leaves a dangling colon in
    what the frontend shows.
    """
    expected_message = "Secondary A2UI LLM call failed: MemoryError"

    planner_llm = MagicMock()
    planner_llm.chat.side_effect = MemoryError()  # no args -> str(exc) == ""

    with patch("agents.agent._get_a2ui_llm", return_value=planner_llm):
        outcome = generate_a2ui_via_llm(context="")

    _assert_full_error_shape(outcome)
    assert outcome["error"] == "a2ui_llm_error"

    actual_message = outcome["message"]
    assert actual_message == expected_message, (
        "empty-detail path must produce classname-only message (no trailing "
        f"': '); got {actual_message!r}"
    )


# ---------------------------------------------------------------------------
# _get_a2ui_llm: model resolution + keyed memoization + provider-agnostic
# construction
# ---------------------------------------------------------------------------


def test_a2ui_model_env_overrides_langroid_model(monkeypatch):
    """``A2UI_MODEL`` takes precedence over ``LANGROID_MODEL`` when both are
    set: the planner LLM must be built with the ``A2UI_MODEL`` value."""
    for env_name, env_value in (
        ("LANGROID_MODEL", "gpt-4.1"),
        ("A2UI_MODEL", "anthropic/claude-opus-4"),
    ):
        monkeypatch.setenv(env_name, env_value)

    seen_models: list[str] = []

    class _RecordingLLM:
        """Stand-in for ``lm.OpenAIGPT`` that records the requested model."""

        def __init__(self, config):
            seen_models.append(config.chat_model)

        def chat(self, *_args, **_kwargs):
            return _llm_response(tool_calls=None)

    # Go through the public entry point so the env-based model resolution
    # is actually exercised.
    with patch("agents.agent.lm.OpenAIGPT", _RecordingLLM):
        generate_a2ui_via_llm(context="")

    expected = ["anthropic/claude-opus-4"]
    assert seen_models == expected, f"A2UI_MODEL should win; got {seen_models!r}"


def test_langroid_model_used_when_a2ui_model_unset(monkeypatch):
    """When only ``LANGROID_MODEL`` is set, the planner LLM should inherit
    it — same provider as the primary chat agent.

    ``A2UI_MODEL`` is removed explicitly rather than relying solely on an
    autouse fixture to scrub it: the "unset" half of this test's
    precondition would otherwise silently depend on fixture behavior, and
    a leaked ``A2UI_MODEL`` from the developer's shell or CI would make the
    planner resolve a different model than the one asserted below.
    """
    # Establish the full precondition locally: A2UI_MODEL absent,
    # LANGROID_MODEL present.
    monkeypatch.delenv("A2UI_MODEL", raising=False)
    monkeypatch.setenv("LANGROID_MODEL", "anthropic/claude-opus-4")

    captured_models: list[str] = []

    class _FakeLLM:
        """Stand-in for ``lm.OpenAIGPT`` that records the requested model."""

        def __init__(self, config):
            captured_models.append(config.chat_model)

        def chat(self, *_a, **_kw):
            return _llm_response(tool_calls=None)

    with patch("agents.agent.lm.OpenAIGPT", _FakeLLM):
        generate_a2ui_via_llm(context="")

    assert captured_models == ["anthropic/claude-opus-4"]


def test_default_model_when_no_env_set(monkeypatch):
    """With both ``A2UI_MODEL`` and ``LANGROID_MODEL`` absent, the planner
    must fall back to ``gpt-4.1``.

    The literal is pinned on purpose: it is expected to match the primary
    agent's default (documented in ``create_agent``), so a silent drift
    between the two defaults fails here.

    Both variables are removed explicitly in addition to whatever the
    autouse ``_clean_env`` fixture does, so a later change to that fixture
    cannot leak a stray value into this test.
    """
    for env_name in ("A2UI_MODEL", "LANGROID_MODEL"):
        monkeypatch.delenv(env_name, raising=False)

    seen_models: list[str] = []

    class _RecordingLLM:
        """Stand-in for ``lm.OpenAIGPT`` that records the requested model."""

        def __init__(self, config):
            seen_models.append(config.chat_model)

        def chat(self, *_args, **_kwargs):
            return _llm_response(tool_calls=None)

    with patch("agents.agent.lm.OpenAIGPT", _RecordingLLM):
        generate_a2ui_via_llm(context="")

    assert seen_models == ["gpt-4.1"]


def test_llm_memoization_returns_same_instance_for_same_model():
    """Asking for the same model twice must yield one shared LLM object and
    exactly one ``OpenAIGPT`` construction — a rebuild would be wasted work
    and would re-run credential resolution."""
    stub_llm = MagicMock()
    with patch("agents.agent.lm.OpenAIGPT", return_value=stub_llm) as constructor:
        lookups = [_get_a2ui_llm("gpt-4.1") for _ in range(2)]

    # Both lookups must hand back the very object the constructor produced.
    assert all(llm is stub_llm for llm in lookups)
    assert constructor.call_count == 1


def test_llm_memoization_is_keyed_per_model():
    """Different model strings must produce different instances, and each
    construction must receive the exact model string passed.

    ``captured_models`` records one entry per ``OpenAIGPT`` construction,
    so the final assertion also proves the third lookup (a repeat of the
    first model) was served from the cache without constructing again.
    """
    captured_models: list[str] = []

    class _FakeLLM:
        """Stand-in for ``lm.OpenAIGPT`` that records the requested model."""

        def __init__(self, config):
            captured_models.append(config.chat_model)

        def chat(self, *_a, **_kw):  # pragma: no cover - not used here
            return _llm_response(tool_calls=None)

    with patch("agents.agent.lm.OpenAIGPT", _FakeLLM):
        a = _get_a2ui_llm("gpt-4.1")
        b = _get_a2ui_llm("anthropic/claude-opus-4")
        a2 = _get_a2ui_llm("gpt-4.1")

    assert a is not b, "different models must produce different instances"
    assert a is a2, "repeated calls for same model must hit the cache"
    # Two constructions for three lookups: one per distinct model, in
    # first-request order.
    assert captured_models == ["gpt-4.1", "anthropic/claude-opus-4"]


@pytest.mark.skipif(
    not os.environ.get("LANGROID_INTEGRATION_TESTS"),
    reason=(
        "Integration test: constructs real lm.OpenAIGPT. "
        "Set LANGROID_INTEGRATION_TESTS=1 to enable."
    ),
)
def test_construction_succeeds_without_openai_env_real_openaigpt(monkeypatch):
    """Strong-form regression guard: building the REAL ``lm.OpenAIGPT`` for
    an ``anthropic/...`` model must not raise when no ``OPENAI_*`` variables
    are present.

    This is the behavior the provider-agnostic fix exists to protect. The
    expectation is that langroid's ``OpenAIGPT`` routes on the
    ``provider/model`` prefix and only needs that provider's credentials at
    construction time, so an Anthropic-prefixed model with just
    ``ANTHROPIC_API_KEY`` set should construct without any OpenAI-specific
    environment.

    The test is opt-in (``LANGROID_INTEGRATION_TESTS=1``) because
    ``OpenAIGPT.__init__`` has historically come close to doing network /
    provider initialization. The mocked model-string routing test in this
    module remains the always-on unit-level check.
    """
    for openai_var in ("OPENAI_API_KEY", "OPENAI_BASE_URL"):
        monkeypatch.delenv(openai_var, raising=False)
    monkeypatch.setenv("LANGROID_MODEL", "anthropic/claude-opus-4")
    monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-test")

    import langroid.language_models as lm  # noqa: WPS433 — local import by design

    # Direct construction is the thing under test; any exception raised
    # while building the config or the LLM is reported as a test failure.
    try:
        real_llm = lm.OpenAIGPT(
            lm.OpenAIGPTConfig(
                chat_model="anthropic/claude-opus-4",
                stream=False,
            )
        )
    except Exception as exc:  # pragma: no cover - explicit failure path
        pytest.fail(
            "OpenAIGPT construction must not require OpenAI env when model is "
            f"non-OpenAI; got {type(exc).__name__}: {exc}"
        )
    assert real_llm is not None


def test_construction_uses_correct_model_string_for_non_openai(monkeypatch):
    """Supplementary model-string routing test: with a non-OpenAI
    ``LANGROID_MODEL``, the planner constructs an ``OpenAIGPT`` with the
    exact model string. This is the weaker cousin of the strong-form
    (real-constructor) regression guard — it confirms the routing path even
    if the real constructor becomes impossible to unit-test (e.g. adds
    network I/O).

    ``A2UI_MODEL`` is cleared explicitly because it is expected to take
    precedence over ``LANGROID_MODEL``; a value leaked from the surrounding
    environment would otherwise change the resolved model and fail the
    assertion for reasons unrelated to provider routing.
    """
    monkeypatch.delenv("OPENAI_API_KEY", raising=False)
    monkeypatch.delenv("OPENAI_BASE_URL", raising=False)
    # Keep the override variable out of the picture so LANGROID_MODEL is
    # the only model source in play.
    monkeypatch.delenv("A2UI_MODEL", raising=False)
    monkeypatch.setenv("LANGROID_MODEL", "anthropic/claude-opus-4")
    monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-test")

    captured_models: list[str] = []

    class _FakeLLM:
        """Stand-in for ``lm.OpenAIGPT`` that records the requested model."""

        def __init__(self, config):
            captured_models.append(config.chat_model)

        def chat(self, *_a, **_kw):
            return _llm_response(tool_calls=None)

    with patch("agents.agent.lm.OpenAIGPT", _FakeLLM):
        generate_a2ui_via_llm(context="")

    assert captured_models == ["anthropic/claude-opus-4"]


def test_agent_module_imports_cleanly_without_openai_env(tmp_path):
    """Import-time regression guard: ``import agents.agent`` must succeed in
    an environment that carries no OpenAI-specific variables.

    A failure here points at module-level code that hard-requires a
    provider at import time (for example a top-level ``openai.OpenAI()`` /
    ``openai.Client()`` call).

    The import runs in a child interpreter rather than via
    ``importlib.reload`` so that module-level state in the parent process
    (such as the ``_get_a2ui_llm`` cache and other module-scope singletons)
    is left untouched. Reloading rebinds module-level function identities,
    which lets later tests that patch by name see stale references and leak
    state; a subprocess keeps this test order-independent.
    """
    # Drop OPENAI_* / LANGROID_* / A2UI_* from the inherited environment and
    # keep the rest (PATH, HOME, ...) so the child interpreter can start.
    scrubbed_prefixes = ("OPENAI_", "LANGROID_", "A2UI_")
    child_env = dict(os.environ)
    for var_name in list(child_env):
        if var_name.startswith(scrubbed_prefixes):
            del child_env[var_name]

    # Make ``agents.agent`` importable in the child: the integration root
    # (where the ``tools`` symlink lives) plus its ``src/`` directory go in
    # front of any PYTHONPATH the child would inherit — mirroring what
    # conftest.py, the CI workflow and the ``package.json`` dev script set
    # up for the parent.
    integration_root = Path(__file__).resolve().parents[2]
    search_path = [str(integration_root), str(integration_root / "src")]
    inherited_pythonpath = child_env.get("PYTHONPATH", "")
    if inherited_pythonpath:
        search_path.append(inherited_pythonpath)
    child_env["PYTHONPATH"] = os.pathsep.join(search_path)

    # Use ``tmp_path`` as the working directory so a stray ``.env`` in the
    # project root is not auto-loaded by ``dotenv.load_dotenv`` — that
    # would quietly bring OPENAI_* back and hide a regression.
    completed = subprocess.run(
        [sys.executable, "-c", "import agents.agent"],
        env=child_env,
        cwd=str(tmp_path),
        capture_output=True,
        text=True,
    )
    assert completed.returncode == 0, (
        "import agents.agent failed in clean subprocess:\n"
        f"stdout: {completed.stdout}\nstderr: {completed.stderr}"
    )


# ---------------------------------------------------------------------------
# GenerateA2UITool.handle delegates to generate_a2ui_via_llm
# ---------------------------------------------------------------------------


def test_generate_a2ui_tool_handle_returns_json_str_of_operations():
    """``GenerateA2UITool.handle`` — the method langroid invokes
    server-side — must forward its ``context`` to ``generate_a2ui_via_llm``
    and return that function's result encoded as a JSON string.

    This case covers the success payload (an ``a2ui_operations`` dict).
    """
    planner_payload = {"a2ui_operations": [{"type": "create_surface"}]}

    with patch(
        "agents.agent.generate_a2ui_via_llm", return_value=planner_payload
    ) as planner_stub:
        serialized = GenerateA2UITool(context="whatever").handle()

    planner_stub.assert_called_once_with(context="whatever")
    assert isinstance(serialized, str), (
        "handle() must return str for langroid's tool framework; got "
        f"{type(serialized).__name__}"
    )
    # Round-trip through JSON to confirm nothing was added or dropped.
    assert json.loads(serialized) == planner_payload


def test_generate_a2ui_tool_handle_surfaces_error_dicts_verbatim():
    """An error dict returned by ``generate_a2ui_via_llm`` must come out of
    ``handle()`` as JSON with every field intact, so the frontend / outer
    LLM can present the structured error."""
    error_payload = {
        "error": "a2ui_llm_error",
        "message": "x",
        "remediation": "y",
    }

    with patch("agents.agent.generate_a2ui_via_llm", return_value=error_payload):
        serialized = GenerateA2UITool(context="").handle()

    assert isinstance(serialized, str)
    assert json.loads(serialized) == error_payload


def test_generate_a2ui_tool_handle_wraps_json_dumps_failure():
    """If ``generate_a2ui_via_llm`` returns something with a non-JSON-serializable
    value (e.g. a ``set`` leaked in from an upstream bug), ``handle()``
    must NOT propagate the ``TypeError`` to the langroid tool framework.
    Instead it emits a JSON-encoded structured error string so the outer
    agent sees a recognizable failure shape.

    Uses ``{1, 2, 3}`` (a set) because it is unambiguously non-JSON-
    serializable and — unlike ``datetime.utcnow()`` — does not depend on a
    deprecated stdlib API.
    """
    unserializable = {"a2ui_operations": [{"payload": {1, 2, 3}}]}
    with patch("agents.agent.generate_a2ui_via_llm", return_value=unserializable):
        tool = GenerateA2UITool(context="")
        out = tool.handle()
    # Must still be a str that json.loads accepts.
    parsed = json.loads(out)