From 15e49d7c397d0d7ccb4845b9984d8f8b79ed98a2 Mon Sep 17 00:00:00 2001 From: Dominique Broeglin Date: Sat, 27 Jun 2026 12:02:27 +0200 Subject: [PATCH] Fail fast on unavailable Pier backend Add a Pier backend preflight so Docker setup failures stop before empty jobs are recorded. Improve analyze diagnostics for Pier trials that fail before Copilot logs are captured. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- README.md | 4 +- docs/authoring-experiments.md | 4 ++ docs/results-format.md | 3 +- src/copilot_experiments/cli.py | 37 +++++++++++ src/copilot_experiments/pier_backend.py | 62 ++++++++++++++++++ src/copilot_experiments/pier_results.py | 33 ++++++++++ tests/test_pier_backend.py | 83 +++++++++++++++++++++++++ tests/test_pier_results.py | 62 ++++++++++++++++++ 8 files changed, 286 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ff7a27e..1e440d9 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,9 @@ current local checkout into uv's cache. If you are iterating on the tool and nee rebuild from the working tree, add `--no-cache` before `--from`. Real runs require Copilot auth (`COPILOT_GITHUB_TOKEN`, `GH_TOKEN`, `GITHUB_TOKEN`, or `gh auth -login`) and a Pier-supported execution backend such as Docker. +login`) and a Pier-supported execution backend such as Docker. `run` preflights the selected +backend before creating a job; for Docker this checks the CLI, Compose plugin, and daemon +connection so missing WSL integration fails before an empty Pier job is recorded. Each `run` is a new measurement. If the configured Pier `job_name` already exists under `jobs/`, `copilot-experiments` writes the rerun to a timestamped job name instead of silently reusing the diff --git a/docs/authoring-experiments.md b/docs/authoring-experiments.md index 8d12433..24b6cb8 100644 --- a/docs/authoring-experiments.md +++ b/docs/authoring-experiments.md @@ -165,6 +165,10 @@ rebuild from the working tree, add `--no-cache` before `--from`. Python experiment path still has an ephemeral mock dry-run, but Pier is the primary authoring model. +`run` performs a lightweight backend preflight before Pier creates a job. For the default Docker +backend it verifies that `docker`, `docker compose`, and the Docker daemon are reachable; this catches +common WSL/Docker Desktop integration issues before a trial can fail without Copilot logs. + Pier itself resumes existing matching job directories and skips trials that already have `result.json`. `copilot-experiments run` treats a plain rerun as a fresh measurement instead: when `jobs//` already exists, it appends a timestamp to the Pier job name for the new run. Pass diff --git a/docs/results-format.md b/docs/results-format.md index 4a3eced..e81e98a 100644 --- a/docs/results-format.md +++ b/docs/results-format.md @@ -89,4 +89,5 @@ uv run copilot-experiments analyze --file jobs///agent/copilot-sessi If the selected Pier trial has no native Copilot `events.jsonl`, `analyze` falls back to `agent/trajectory.json` when present; otherwise it reports that no Copilot session log or -trajectory is available. +trajectory is available. When Pier recorded a trial exception before the agent ran, `analyze` +includes that harness error and points at the trial `result.json`. diff --git a/src/copilot_experiments/cli.py b/src/copilot_experiments/cli.py index 21d9ed0..fe858b5 100644 --- a/src/copilot_experiments/cli.py +++ b/src/copilot_experiments/cli.py @@ -19,12 +19,16 @@ from .index import reindex as index_reindex from .models import DryRunReport, Experiment, ExperimentRun from .pier_backend import ( + PierBackendPreflightError, discover_pier_job_configs, inject_copilot_token, + preflight_pier_backend, prepare_pier_job_for_run, run_pier_job, ) from .pier_results import ( + describe_missing_pier_analysis_source, + iter_pier_trial_summaries, resolve_pier_trial_analysis_source, write_pier_summary, ) @@ -294,6 +298,13 @@ def run( console.print("[green]Pier config validation OK[/green] [dim]— no job was run[/dim]") raise typer.Exit(0) + try: + for spec in pier_specs: + preflight_pier_backend(spec.config) + except PierBackendPreflightError as exc: + err.print(f"[red]Pier backend preflight failed:[/red] {exc}") + raise typer.Exit(1) from exc + try: auth = preflight_github_token() except AuthError as exc: @@ -322,6 +333,7 @@ def run( continue summary = write_pier_summary(run_result.job_dir) _print_run_summary(summary) + _warn_failed_pier_trials(run_result.job_dir) if summary.get("status") != "completed": any_failures = True console.print(f"[dim]results:[/dim] {run_result.job_dir}\n") @@ -633,6 +645,9 @@ def analyze( ) if source_path is None: err.print(f"[red]No Copilot session log or trajectory found in[/red] {pier_job}") + diagnostic = describe_missing_pier_analysis_source(pier_job, trial) + if diagnostic: + err.print(f"[yellow]{diagnostic}[/yellow]") raise typer.Exit(1) selected_otel = otel_file or discovered_otel analysis = ( @@ -821,6 +836,28 @@ def _warn_failed_trials(layout: Layout, experiment: Experiment, run: ExperimentR err.print(f"[yellow]{line}[/yellow]") +def _warn_failed_pier_trials(job_dir: Path) -> None: + """Point Pier harness failures at the trial result artifact.""" + + problems: list[str] = [] + for trial in iter_pier_trial_summaries(job_dir): + if trial.get("status") == "ok": + continue + trial_name = str(trial.get("trial_name") or trial.get("trial_no") or "-") + problems.append( + f" {trial_name}: harness failure — {trial.get('error') or trial.get('status')}\n" + f" -> {job_dir / trial_name / 'result.json'}" + ) + if not problems: + return + err.print( + f"[yellow]Warning:[/yellow] Pier job [bold]{job_dir.name}[/bold] had " + f"{len(problems)} harness failure(s). Inspect the captured trial result:" + ) + for line in problems: + err.print(f"[yellow]{line}[/yellow]") + + def _inspect_pier_job(job_dir: Path) -> None: summary = write_pier_summary(job_dir) console.print(f"[bold]Pier job[/bold]: {job_dir.name}") diff --git a/src/copilot_experiments/pier_backend.py b/src/copilot_experiments/pier_backend.py index b264bcd..3d6afa2 100644 --- a/src/copilot_experiments/pier_backend.py +++ b/src/copilot_experiments/pier_backend.py @@ -4,6 +4,8 @@ import asyncio import json +import shutil +import subprocess from dataclasses import dataclass from datetime import datetime from pathlib import Path @@ -49,6 +51,10 @@ def renamed(self) -> bool: return self.requested_name != self.run_name +class PierBackendPreflightError(RuntimeError): + """A Pier execution backend is not available before a job starts.""" + + def discover_pier_job_configs(root: Path, name: str | None = None) -> list[PierJobSpec]: """Load Pier JobConfig files from ``experiments/*.yaml|*.yml|*.json``.""" @@ -120,6 +126,14 @@ def normalize_pier_job_config(config: Any, *, root: Path, base_dir: Path) -> Any return config +def preflight_pier_backend(config: Any) -> None: + """Fail fast when the configured Pier backend is not usable locally.""" + + backend = _environment_type(config) + if backend == "docker": + _preflight_docker_backend() + + def inject_copilot_token(config: Any, token: str) -> None: """Inject a GitHub token into local Copilot agents without persisting it to config.""" @@ -189,3 +203,51 @@ def _resolve_path(path: Path, base: Path) -> Path: def _job_dir(config: Any) -> Path: return Path(config.jobs_dir) / str(config.job_name) + + +def _environment_type(config: Any) -> str: + environment = getattr(config, "environment", None) + value = getattr(environment, "type", None) + return str(getattr(value, "value", value) or "").lower() + + +def _preflight_docker_backend() -> None: + if shutil.which("docker") is None: + raise PierBackendPreflightError( + "Pier is configured to use the Docker backend, but the 'docker' CLI was not found. " + "Install Docker or enable Docker Desktop WSL integration, then retry." + ) + + _run_backend_probe( + ["docker", "compose", "version"], + "Docker Compose is required by Pier's Docker backend but is not available.", + ) + _run_backend_probe( + ["docker", "info"], + "Docker is installed, but the daemon is not reachable.", + ) + + +def _run_backend_probe(command: list[str], failure_message: str) -> None: + try: + proc = subprocess.run( + command, + capture_output=True, + text=True, + timeout=20, + ) + except OSError as exc: + raise PierBackendPreflightError(f"{failure_message} ({exc})") from exc + except subprocess.TimeoutExpired as exc: + raise PierBackendPreflightError( + f"{failure_message} Probe timed out after {exc.timeout} seconds: {' '.join(command)}" + ) from exc + + if proc.returncode == 0: + return + + detail = "\n".join(part for part in (proc.stderr.strip(), proc.stdout.strip()) if part) + suffix = f"\n{detail}" if detail else "" + raise PierBackendPreflightError( + f"{failure_message} Command failed: {' '.join(command)}{suffix}" + ) diff --git a/src/copilot_experiments/pier_results.py b/src/copilot_experiments/pier_results.py index 6bc4c58..6ddca31 100644 --- a/src/copilot_experiments/pier_results.py +++ b/src/copilot_experiments/pier_results.py @@ -139,6 +139,39 @@ def resolve_pier_trial_analysis_source( return None, label, None, None +def describe_missing_pier_analysis_source( + job_dir: Path, trial: int | str | None = None +) -> str | None: + """Explain why a Pier trial has no Copilot-native analysis source.""" + + trial_dir = _resolve_trial_dir(job_dir, trial) + if trial_dir is None: + return "No Pier trial directories with result.json were found." + + result_path = trial_dir / "result.json" + if not result_path.exists(): + return f"Selected Pier trial has no result.json: {result_path}" + + trial_result = read_json(result_path) + exception = trial_result.get("exception_info") or {} + if exception: + message = ( + exception.get("exception_message") + or exception.get("message") + or exception.get("exception_type") + or str(exception) + ) + return ( + "Selected Pier trial failed before a Copilot session was captured: " + f"{message}\nInspect: {result_path}" + ) + + return ( + "Selected Pier trial completed without a native Copilot session log or ATIF trajectory.\n" + f"Inspect: {trial_dir / 'agent'}" + ) + + def _resolve_trial_dir(job_dir: Path, trial: int | str | None = None) -> Path | None: trial_dirs = iter_trial_dirs(job_dir) if not trial_dirs: diff --git a/tests/test_pier_backend.py b/tests/test_pier_backend.py index 95542ba..23ad902 100644 --- a/tests/test_pier_backend.py +++ b/tests/test_pier_backend.py @@ -4,14 +4,19 @@ from datetime import datetime from pathlib import Path +from subprocess import CompletedProcess import pytest +from typer.testing import CliRunner +from copilot_experiments.cli import app from copilot_experiments.pier_backend import ( COPILOT_CLI_AGENT_IMPORT_PATH, + PierBackendPreflightError, discover_pier_job_configs, inject_copilot_token, load_pier_job_config, + preflight_pier_backend, prepare_pier_job_for_run, ) @@ -119,6 +124,84 @@ def test_prepare_pier_job_for_run_resume_keeps_existing_name(tmp_path: Path): assert not prepared.renamed +def test_preflight_pier_backend_reports_missing_docker( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +): + config_path = tmp_path / "job.yaml" + config_path.write_text("job_name: smoke\n", encoding="utf-8") + config = load_pier_job_config(config_path, root=tmp_path) + monkeypatch.setattr("copilot_experiments.pier_backend.shutil.which", lambda _name: None) + + with pytest.raises(PierBackendPreflightError, match="docker.*not found"): + preflight_pier_backend(config) + + +def test_preflight_pier_backend_checks_docker_compose_and_daemon( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +): + config_path = tmp_path / "job.yaml" + config_path.write_text("job_name: smoke\n", encoding="utf-8") + config = load_pier_job_config(config_path, root=tmp_path) + calls: list[list[str]] = [] + + def fake_run(command: list[str], **_kwargs): + calls.append(command) + return CompletedProcess(command, 0, stdout="ok", stderr="") + + monkeypatch.setattr("copilot_experiments.pier_backend.shutil.which", lambda _name: "docker") + monkeypatch.setattr("copilot_experiments.pier_backend.subprocess.run", fake_run) + + preflight_pier_backend(config) + + assert calls == [["docker", "compose", "version"], ["docker", "info"]] + + +def test_preflight_pier_backend_reports_unreachable_docker_daemon( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +): + config_path = tmp_path / "job.yaml" + config_path.write_text("job_name: smoke\n", encoding="utf-8") + config = load_pier_job_config(config_path, root=tmp_path) + + def fake_run(command: list[str], **_kwargs): + if command == ["docker", "info"]: + return CompletedProcess(command, 1, stdout="", stderr="Cannot connect to Docker daemon") + return CompletedProcess(command, 0, stdout="ok", stderr="") + + monkeypatch.setattr("copilot_experiments.pier_backend.shutil.which", lambda _name: "docker") + monkeypatch.setattr("copilot_experiments.pier_backend.subprocess.run", fake_run) + + with pytest.raises(PierBackendPreflightError, match="Cannot connect to Docker daemon"): + preflight_pier_backend(config) + + +def test_cli_run_fails_before_auth_when_pier_backend_preflight_fails( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +): + experiments = tmp_path / "experiments" + experiments.mkdir() + (experiments / "job.yaml").write_text("job_name: smoke\n", encoding="utf-8") + + def fail_preflight(_config): + raise PierBackendPreflightError("Docker is unavailable") + + def fail_if_called(): + pytest.fail("auth should not run after backend preflight failure") + + monkeypatch.setattr("copilot_experiments.cli.preflight_pier_backend", fail_preflight) + monkeypatch.setattr("copilot_experiments.cli.preflight_github_token", fail_if_called) + monkeypatch.setattr( + "copilot_experiments.cli.run_pier_job", + lambda _config: pytest.fail("Pier job should not start after backend preflight failure"), + ) + + result = CliRunner().invoke(app, ["run", "--root", str(tmp_path)]) + + assert result.exit_code == 1 + assert "Pier backend preflight failed" in result.output + assert "Docker is unavailable" in result.output + + @pytest.mark.parametrize( "example_root", [ diff --git a/tests/test_pier_results.py b/tests/test_pier_results.py index 0510481..d08b46e 100644 --- a/tests/test_pier_results.py +++ b/tests/test_pier_results.py @@ -11,6 +11,7 @@ from copilot_experiments.index import connect, index_pier_job_dir from copilot_experiments.pier_results import ( build_pier_summary, + describe_missing_pier_analysis_source, resolve_pier_trial_events, write_pier_summary, ) @@ -165,6 +166,40 @@ def _make_pier_job_with_trajectory(job_dir: Path) -> Path: return job_dir +def _make_pier_job_with_harness_error(job_dir: Path) -> Path: + _write_json( + job_dir / "config.json", + {"job_name": "demo-job"}, + ) + _write_json( + job_dir / "result.json", + { + "started_at": "2026-01-01T00:00:00Z", + "finished_at": "2026-01-01T00:00:05Z", + "stats": {"n_errored_trials": 1}, + }, + ) + trial = job_dir / "copilot-cli__textstats__1" + _write_json( + trial / "result.json", + { + "trial_name": "copilot-cli__textstats__1", + "task_name": "textstats", + "started_at": "2026-01-01T00:00:00Z", + "finished_at": "2026-01-01T00:00:05Z", + "agent_info": { + "name": "copilot-cli", + "model_info": {"name": "gpt-5-mini"}, + }, + "exception_info": { + "exception_type": "RuntimeError", + "exception_message": "Docker daemon unavailable", + }, + }, + ) + return job_dir + + def test_build_pier_summary_reads_native_copilot_events(tmp_path: Path): job_dir = _make_pier_job(tmp_path / "jobs" / "demo-job") @@ -202,6 +237,17 @@ def test_build_pier_summary_reads_trajectory_when_native_events_are_absent(tmp_p assert variant["avg_output_tokens"] == 7.0 +def test_describe_missing_pier_analysis_source_explains_harness_error(tmp_path: Path): + job_dir = _make_pier_job_with_harness_error(tmp_path / "jobs" / "demo-job") + + diagnostic = describe_missing_pier_analysis_source(job_dir) + + assert diagnostic is not None + assert "failed before a Copilot session was captured" in diagnostic + assert "Docker daemon unavailable" in diagnostic + assert "result.json" in diagnostic + + def test_cli_analyze_reads_pier_job_events(tmp_path: Path): _make_pier_job(tmp_path / "jobs" / "demo-job") runner = CliRunner() @@ -232,6 +278,22 @@ def test_cli_analyze_reads_pier_job_trajectory_when_events_are_absent(tmp_path: assert "view" in result.output +def test_cli_analyze_reports_pier_harness_error_when_logs_are_absent(tmp_path: Path): + _make_pier_job_with_harness_error(tmp_path / "jobs" / "demo-job") + runner = CliRunner() + + result = runner.invoke( + app, + ["analyze", "demo-job", "--root", str(tmp_path), "--trial", "1"], + ) + + assert result.exit_code == 1 + assert "No Copilot session log or trajectory found" in result.output + assert "failed before a Copilot session was captured" in result.output + assert "Docker daemon" in result.output + assert "unavailable" in result.output + + def test_write_pier_summary_and_index(tmp_path: Path): job_dir = _make_pier_job(tmp_path / "jobs" / "demo-job")