Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,9 @@ current local checkout into uv's cache. If you are iterating on the tool and nee
rebuild from the working tree, add `--no-cache` before `--from`.

Real runs require Copilot auth (`COPILOT_GITHUB_TOKEN`, `GH_TOKEN`, `GITHUB_TOKEN`, or `gh auth
login`) and a Pier-supported execution backend such as Docker.
login`) and a Pier-supported execution backend such as Docker. `run` preflights the selected
backend before creating a job; for Docker this checks the CLI, Compose plugin, and daemon
connection so missing WSL integration fails before an empty Pier job is recorded.

Each `run` is a new measurement. If the configured Pier `job_name` already exists under `jobs/`,
`copilot-experiments` writes the rerun to a timestamped job name instead of silently reusing the
Expand Down
4 changes: 4 additions & 0 deletions docs/authoring-experiments.md
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,10 @@ rebuild from the working tree, add `--no-cache` before `--from`.
Python experiment path still has an ephemeral mock dry-run, but Pier is the primary authoring
model.

`run` performs a lightweight backend preflight before Pier creates a job. For the default Docker
backend it verifies that `docker`, `docker compose`, and the Docker daemon are reachable; this catches
common WSL/Docker Desktop integration issues before a trial can fail without Copilot logs.

Pier itself resumes existing matching job directories and skips trials that already have
`result.json`. `copilot-experiments run` treats a plain rerun as a fresh measurement instead: when
`jobs/<job_name>/` already exists, it appends a timestamp to the Pier job name for the new run. Pass
Expand Down
3 changes: 2 additions & 1 deletion docs/results-format.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,4 +89,5 @@ uv run copilot-experiments analyze --file jobs/<job>/<trial>/agent/copilot-sessi

If the selected Pier trial has no native Copilot `events.jsonl`, `analyze` falls back to
`agent/trajectory.json` when present; otherwise it reports that no Copilot session log or
trajectory is available.
trajectory is available. When Pier recorded a trial exception before the agent ran, `analyze`
includes that harness error and points at the trial `result.json`.
37 changes: 37 additions & 0 deletions src/copilot_experiments/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,16 @@
from .index import reindex as index_reindex
from .models import DryRunReport, Experiment, ExperimentRun
from .pier_backend import (
PierBackendPreflightError,
discover_pier_job_configs,
inject_copilot_token,
preflight_pier_backend,
prepare_pier_job_for_run,
run_pier_job,
)
from .pier_results import (
describe_missing_pier_analysis_source,
iter_pier_trial_summaries,
resolve_pier_trial_analysis_source,
write_pier_summary,
)
Expand Down Expand Up @@ -294,6 +298,13 @@ def run(
console.print("[green]Pier config validation OK[/green] [dim]— no job was run[/dim]")
raise typer.Exit(0)

try:
for spec in pier_specs:
preflight_pier_backend(spec.config)
except PierBackendPreflightError as exc:
err.print(f"[red]Pier backend preflight failed:[/red] {exc}")
raise typer.Exit(1) from exc

try:
auth = preflight_github_token()
except AuthError as exc:
Expand Down Expand Up @@ -322,6 +333,7 @@ def run(
continue
summary = write_pier_summary(run_result.job_dir)
_print_run_summary(summary)
_warn_failed_pier_trials(run_result.job_dir)
if summary.get("status") != "completed":
any_failures = True
console.print(f"[dim]results:[/dim] {run_result.job_dir}\n")
Expand Down Expand Up @@ -633,6 +645,9 @@ def analyze(
)
if source_path is None:
err.print(f"[red]No Copilot session log or trajectory found in[/red] {pier_job}")
diagnostic = describe_missing_pier_analysis_source(pier_job, trial)
if diagnostic:
err.print(f"[yellow]{diagnostic}[/yellow]")
raise typer.Exit(1)
selected_otel = otel_file or discovered_otel
analysis = (
Expand Down Expand Up @@ -821,6 +836,28 @@ def _warn_failed_trials(layout: Layout, experiment: Experiment, run: ExperimentR
err.print(f"[yellow]{line}[/yellow]")


def _warn_failed_pier_trials(job_dir: Path) -> None:
"""Point Pier harness failures at the trial result artifact."""

problems: list[str] = []
for trial in iter_pier_trial_summaries(job_dir):
if trial.get("status") == "ok":
continue
trial_name = str(trial.get("trial_name") or trial.get("trial_no") or "-")
problems.append(
f" {trial_name}: harness failure — {trial.get('error') or trial.get('status')}\n"
f" -> {job_dir / trial_name / 'result.json'}"
)
if not problems:
return
err.print(
f"[yellow]Warning:[/yellow] Pier job [bold]{job_dir.name}[/bold] had "
f"{len(problems)} harness failure(s). Inspect the captured trial result:"
)
for line in problems:
err.print(f"[yellow]{line}[/yellow]")


def _inspect_pier_job(job_dir: Path) -> None:
summary = write_pier_summary(job_dir)
console.print(f"[bold]Pier job[/bold]: {job_dir.name}")
Expand Down
62 changes: 62 additions & 0 deletions src/copilot_experiments/pier_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@

import asyncio
import json
import shutil
import subprocess
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
Expand Down Expand Up @@ -49,6 +51,10 @@ def renamed(self) -> bool:
return self.requested_name != self.run_name


class PierBackendPreflightError(RuntimeError):
"""A Pier execution backend is not available before a job starts."""


def discover_pier_job_configs(root: Path, name: str | None = None) -> list[PierJobSpec]:
"""Load Pier JobConfig files from ``experiments/*.yaml|*.yml|*.json``."""

Expand Down Expand Up @@ -120,6 +126,14 @@ def normalize_pier_job_config(config: Any, *, root: Path, base_dir: Path) -> Any
return config


def preflight_pier_backend(config: Any) -> None:
"""Fail fast when the configured Pier backend is not usable locally."""

backend = _environment_type(config)
if backend == "docker":
_preflight_docker_backend()


def inject_copilot_token(config: Any, token: str) -> None:
"""Inject a GitHub token into local Copilot agents without persisting it to config."""

Expand Down Expand Up @@ -189,3 +203,51 @@ def _resolve_path(path: Path, base: Path) -> Path:

def _job_dir(config: Any) -> Path:
return Path(config.jobs_dir) / str(config.job_name)


def _environment_type(config: Any) -> str:
environment = getattr(config, "environment", None)
value = getattr(environment, "type", None)
return str(getattr(value, "value", value) or "").lower()


def _preflight_docker_backend() -> None:
if shutil.which("docker") is None:
raise PierBackendPreflightError(
"Pier is configured to use the Docker backend, but the 'docker' CLI was not found. "
"Install Docker or enable Docker Desktop WSL integration, then retry."
)

_run_backend_probe(
["docker", "compose", "version"],
"Docker Compose is required by Pier's Docker backend but is not available.",
)
_run_backend_probe(
["docker", "info"],
"Docker is installed, but the daemon is not reachable.",
)


def _run_backend_probe(command: list[str], failure_message: str) -> None:
try:
proc = subprocess.run(
command,
capture_output=True,
text=True,
timeout=20,
)
except OSError as exc:
raise PierBackendPreflightError(f"{failure_message} ({exc})") from exc
except subprocess.TimeoutExpired as exc:
raise PierBackendPreflightError(
f"{failure_message} Probe timed out after {exc.timeout} seconds: {' '.join(command)}"
) from exc

if proc.returncode == 0:
return

detail = "\n".join(part for part in (proc.stderr.strip(), proc.stdout.strip()) if part)
suffix = f"\n{detail}" if detail else ""
raise PierBackendPreflightError(
f"{failure_message} Command failed: {' '.join(command)}{suffix}"
)
33 changes: 33 additions & 0 deletions src/copilot_experiments/pier_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,39 @@ def resolve_pier_trial_analysis_source(
return None, label, None, None


def describe_missing_pier_analysis_source(
job_dir: Path, trial: int | str | None = None
) -> str | None:
"""Explain why a Pier trial has no Copilot-native analysis source."""

trial_dir = _resolve_trial_dir(job_dir, trial)
if trial_dir is None:
return "No Pier trial directories with result.json were found."

result_path = trial_dir / "result.json"
if not result_path.exists():
return f"Selected Pier trial has no result.json: {result_path}"

trial_result = read_json(result_path)
exception = trial_result.get("exception_info") or {}
if exception:
message = (
exception.get("exception_message")
or exception.get("message")
or exception.get("exception_type")
or str(exception)
)
return (
"Selected Pier trial failed before a Copilot session was captured: "
f"{message}\nInspect: {result_path}"
)

return (
"Selected Pier trial completed without a native Copilot session log or ATIF trajectory.\n"
f"Inspect: {trial_dir / 'agent'}"
)


def _resolve_trial_dir(job_dir: Path, trial: int | str | None = None) -> Path | None:
trial_dirs = iter_trial_dirs(job_dir)
if not trial_dirs:
Expand Down
83 changes: 83 additions & 0 deletions tests/test_pier_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,19 @@

from datetime import datetime
from pathlib import Path
from subprocess import CompletedProcess

import pytest
from typer.testing import CliRunner

from copilot_experiments.cli import app
from copilot_experiments.pier_backend import (
COPILOT_CLI_AGENT_IMPORT_PATH,
PierBackendPreflightError,
discover_pier_job_configs,
inject_copilot_token,
load_pier_job_config,
preflight_pier_backend,
prepare_pier_job_for_run,
)

Expand Down Expand Up @@ -119,6 +124,84 @@ def test_prepare_pier_job_for_run_resume_keeps_existing_name(tmp_path: Path):
assert not prepared.renamed


def test_preflight_pier_backend_reports_missing_docker(
tmp_path: Path, monkeypatch: pytest.MonkeyPatch
):
config_path = tmp_path / "job.yaml"
config_path.write_text("job_name: smoke\n", encoding="utf-8")
config = load_pier_job_config(config_path, root=tmp_path)
monkeypatch.setattr("copilot_experiments.pier_backend.shutil.which", lambda _name: None)

with pytest.raises(PierBackendPreflightError, match="docker.*not found"):
preflight_pier_backend(config)


def test_preflight_pier_backend_checks_docker_compose_and_daemon(
tmp_path: Path, monkeypatch: pytest.MonkeyPatch
):
config_path = tmp_path / "job.yaml"
config_path.write_text("job_name: smoke\n", encoding="utf-8")
config = load_pier_job_config(config_path, root=tmp_path)
calls: list[list[str]] = []

def fake_run(command: list[str], **_kwargs):
calls.append(command)
return CompletedProcess(command, 0, stdout="ok", stderr="")

monkeypatch.setattr("copilot_experiments.pier_backend.shutil.which", lambda _name: "docker")
monkeypatch.setattr("copilot_experiments.pier_backend.subprocess.run", fake_run)

preflight_pier_backend(config)

assert calls == [["docker", "compose", "version"], ["docker", "info"]]


def test_preflight_pier_backend_reports_unreachable_docker_daemon(
tmp_path: Path, monkeypatch: pytest.MonkeyPatch
):
config_path = tmp_path / "job.yaml"
config_path.write_text("job_name: smoke\n", encoding="utf-8")
config = load_pier_job_config(config_path, root=tmp_path)

def fake_run(command: list[str], **_kwargs):
if command == ["docker", "info"]:
return CompletedProcess(command, 1, stdout="", stderr="Cannot connect to Docker daemon")
return CompletedProcess(command, 0, stdout="ok", stderr="")

monkeypatch.setattr("copilot_experiments.pier_backend.shutil.which", lambda _name: "docker")
monkeypatch.setattr("copilot_experiments.pier_backend.subprocess.run", fake_run)

with pytest.raises(PierBackendPreflightError, match="Cannot connect to Docker daemon"):
preflight_pier_backend(config)


def test_cli_run_fails_before_auth_when_pier_backend_preflight_fails(
tmp_path: Path, monkeypatch: pytest.MonkeyPatch
):
experiments = tmp_path / "experiments"
experiments.mkdir()
(experiments / "job.yaml").write_text("job_name: smoke\n", encoding="utf-8")

def fail_preflight(_config):
raise PierBackendPreflightError("Docker is unavailable")

def fail_if_called():
pytest.fail("auth should not run after backend preflight failure")

monkeypatch.setattr("copilot_experiments.cli.preflight_pier_backend", fail_preflight)
monkeypatch.setattr("copilot_experiments.cli.preflight_github_token", fail_if_called)
monkeypatch.setattr(
"copilot_experiments.cli.run_pier_job",
lambda _config: pytest.fail("Pier job should not start after backend preflight failure"),
)

result = CliRunner().invoke(app, ["run", "--root", str(tmp_path)])

assert result.exit_code == 1
assert "Pier backend preflight failed" in result.output
assert "Docker is unavailable" in result.output


@pytest.mark.parametrize(
"example_root",
[
Expand Down
Loading
Loading