From 2543699f316e2f722d939f9a6bc96ba805fd50f5 Mon Sep 17 00:00:00 2001 From: Dominique Broeglin Date: Sun, 28 Jun 2026 02:43:40 +0200 Subject: [PATCH 1/3] Nest Pier runs and make run selectors discoverable Replace the awkward flat-then-suffixed Pier rerun scheme with a clean nested jobs/<job-name>/<run-id>/ layout that separates stable job identity from per-run timestamp ids. Expose copyable job-name/run-id selectors via list, and let show, inspect, and analyze resolve an exact run from a selector (legacy run id, Pier job, or job/run). Add actionable not-found hints pointing back at list. Persist stable job identity in a per-run manifest, derive selectors for the index, and update docs, experiment-repo templates, and ADR-0019. Commit a didactic project-scoped canvas that visualizes the verified structure. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../extension.mjs | 892 ++++++++++++++++++ README.md | 26 +- .../0019-use-nested-pier-run-directories.md | 51 + docs/adr/README.md | 1 + docs/analysis.md | 8 +- docs/architecture.md | 4 +- docs/authoring-experiments.md | 11 +- docs/collecting-run-data.md | 38 +- docs/deepswe.md | 1 + docs/results-format.md | 59 +- src/copilot_experiments/cli.py | 250 +++-- src/copilot_experiments/index.py | 43 +- src/copilot_experiments/pier_backend.py | 65 +- src/copilot_experiments/pier_results.py | 58 +- src/copilot_experiments/storage.py | 87 +- .../.apm/skills/analyzing-results/SKILL.md | 6 +- .../templates/experiment_repo/AGENTS.md.tmpl | 4 +- .../templates/experiment_repo/README.md.tmpl | 17 +- tests/test_pier_backend.py | 53 +- tests/test_pier_results.py | 75 +- tests/test_storage.py | 24 +- 21 files changed, 1548 insertions(+), 225 deletions(-) create mode 100644 .github/extensions/experiment-repository-structure/extension.mjs create mode 100644 docs/adr/0019-use-nested-pier-run-directories.md diff --git a/.github/extensions/experiment-repository-structure/extension.mjs 
b/.github/extensions/experiment-repository-structure/extension.mjs new file mode 100644 index 0000000..d942782 --- /dev/null +++ b/.github/extensions/experiment-repository-structure/extension.mjs @@ -0,0 +1,892 @@ +import { createServer } from "node:http"; +import { createCanvas, joinSession } from "@github/copilot-sdk/extension"; + +const servers = new Map(); + +const structure = [ + { + id: "repo", + parent: null, + label: "Experiment repository", + path: ".", + kind: "root", + badge: "workspace", + owner: "human + harness", + source: "The git checkout that contains experiment definitions and generated outputs.", + why: "Separates experiment authoring from the copilot-experiments harness repository.", + commands: ["copilot-experiments list", "copilot-experiments run --dry-run"], + }, + { + id: "experiments", + parent: "repo", + label: "experiments/", + path: "experiments/", + kind: "source", + badge: "committed", + owner: "experiment author", + source: "Pier JobConfig YAML files.", + why: "Defines what to run: tasks, agents, model settings, attempts, concurrency, and job_name.", + commands: ["copilot-experiments run --dry-run", "copilot-experiments run [job-name]"], + }, + { + id: "job-yaml", + parent: "experiments", + label: ".yaml", + path: "experiments/.yaml", + kind: "source", + badge: "committed", + owner: "experiment author", + source: "Stable Pier job configuration.", + why: "The job_name is the durable experiment identity. 
Repeated executions keep this identity but get new run ids.", + commands: ["copilot-experiments run ", "copilot-experiments run --resume "], + }, + { + id: "tasks", + parent: "repo", + label: "tasks/", + path: "tasks/", + kind: "task", + badge: "committed", + owner: "experiment author", + source: "Harbor/Pier task directories or imported task corpora.", + why: "Keeps task instructions, environment setup, and verifiers close to the experiment repo.", + commands: ["copilot-experiments deepswe-import ", "copilot-experiments run --dry-run"], + }, + { + id: "task-dir", + parent: "tasks", + label: "/", + path: "tasks//", + kind: "task", + badge: "committed", + owner: "experiment author", + source: "One task's prompt, environment, and verifier.", + why: "A Pier job can point to individual tasks or datasets of many tasks.", + commands: ["copilot-experiments run --dry-run"], + }, + { + id: "task-instruction", + parent: "task-dir", + label: "instruction.md", + path: "tasks//instruction.md", + kind: "task", + badge: "committed", + owner: "experiment author", + source: "Prompt text presented to the evaluated agent.", + why: "This is the human-readable task objective.", + commands: [], + }, + { + id: "task-toml", + parent: "task-dir", + label: "task.toml", + path: "tasks//task.toml", + kind: "task", + badge: "committed", + owner: "experiment author", + source: "Pier task metadata.", + why: "Connects instructions, environment, and verifier into a runnable task.", + commands: [], + }, + { + id: "task-env", + parent: "task-dir", + label: "environment/", + path: "tasks//environment/", + kind: "task", + badge: "committed", + owner: "experiment author", + source: "Sandbox setup for the task.", + why: "Gives Pier a reproducible workspace for each trial.", + commands: [], + }, + { + id: "task-tests", + parent: "task-dir", + label: "tests/", + path: "tasks//tests/", + kind: "task", + badge: "committed", + owner: "experiment author", + source: "Verifier inputs or grading scripts.", + 
why: "Turns an agent patch into an objective success signal.", + commands: [], + }, + { + id: "jobs", + parent: "repo", + label: "jobs/", + path: "jobs/", + kind: "run", + badge: "gitignored", + owner: "Pier + harness", + source: "Generated run outputs. This is now the primary execution tree.", + why: "Keeps measured executions out of git while preserving all data needed to inspect a run.", + commands: ["copilot-experiments list", "copilot-experiments reindex"], + }, + { + id: "job-group", + parent: "jobs", + label: "/", + path: "jobs//", + kind: "run", + badge: "stable identity", + owner: "copilot-experiments", + source: "Grouping directory named from the configured Pier job_name.", + why: "A stable identity can contain many repeated measurements without inventing new job names.", + commands: ["copilot-experiments show ", "copilot-experiments inspect "], + }, + { + id: "run-dir", + parent: "job-group", + label: "/", + path: "jobs///", + kind: "run", + badge: "generated", + owner: "Pier + harness", + source: "One concrete execution, usually timestamped like 20260620-153000.", + why: "This is the copyable run selector: /.", + commands: [ + "copilot-experiments show /", + "copilot-experiments inspect /", + "copilot-experiments analyze / --trial 1", + ], + }, + { + id: "run-manifest", + parent: "run-dir", + label: "copilot-experiments-run.json", + path: "jobs///copilot-experiments-run.json", + kind: "run", + badge: "harness", + owner: "copilot-experiments", + source: "Small manifest with job_name, run_id, and job/run id.", + why: "Pier's config sees the concrete run id as job_name; this manifest preserves the stable job identity.", + commands: ["copilot-experiments reindex"], + }, + { + id: "run-config", + parent: "run-dir", + label: "config.json", + path: "jobs///config.json", + kind: "run", + badge: "Pier", + owner: "Pier", + source: "Resolved Pier job config for this concrete execution.", + why: "Captures exactly what Pier ran after path normalization and agent 
setup.", + commands: [], + }, + { + id: "run-result", + parent: "run-dir", + label: "result.json", + path: "jobs///result.json", + kind: "run", + badge: "Pier", + owner: "Pier", + source: "Job-level status, timings, and aggregate Pier stats.", + why: "Primary job status signal for show/list/reindex.", + commands: ["copilot-experiments show /"], + }, + { + id: "trial-dir", + parent: "run-dir", + label: "/", + path: "jobs////", + kind: "trial", + badge: "generated", + owner: "Pier", + source: "One agent/task/attempt cell.", + why: "Contains the raw evidence for whether a task was solved and how the agent behaved.", + commands: ["copilot-experiments inspect / --trial 1"], + }, + { + id: "trial-config", + parent: "trial-dir", + label: "config.json", + path: "jobs////config.json", + kind: "trial", + badge: "Pier", + owner: "Pier", + source: "Resolved trial configuration.", + why: "Useful when comparing why two trial cells differ.", + commands: [], + }, + { + id: "trial-result", + parent: "trial-dir", + label: "result.json", + path: "jobs////result.json", + kind: "trial", + badge: "Pier", + owner: "Pier", + source: "Trial status, verifier reward, exceptions, agent info, and timings.", + why: "This is where harness failures and grading results are diagnosed.", + commands: ["copilot-experiments inspect / --trial 1"], + }, + { + id: "agent", + parent: "trial-dir", + label: "agent/", + path: "jobs////agent/", + kind: "analysis", + badge: "agent output", + owner: "copilot-cli agent", + source: "Outputs captured from the evaluated agent.", + why: "Raw agent evidence lives here; summaries are derived from these files.", + commands: ["copilot-experiments analyze / --trial 1"], + }, + { + id: "trajectory", + parent: "agent", + label: "trajectory.json", + path: ".../agent/trajectory.json", + kind: "analysis", + badge: "ATIF", + owner: "copilot-cli agent", + source: "ATIF trajectory emitted by the installed agent.", + why: "Fallback analysis source when native Copilot session 
events are absent.", + commands: ["copilot-experiments analyze / --trial 1"], + }, + { + id: "cli-jsonl", + parent: "agent", + label: "copilot-cli.jsonl / .txt", + path: ".../agent/copilot-cli.jsonl", + kind: "analysis", + badge: "diagnostic", + owner: "copilot-cli agent", + source: "Raw Copilot CLI output streams.", + why: "Useful for auth, invocation, or startup failures before a session log exists.", + commands: [], + }, + { + id: "otel", + parent: "agent", + label: "copilot-otel.jsonl", + path: ".../agent/copilot-otel.jsonl", + kind: "analysis", + badge: "diagnostic", + owner: "copilot-cli agent", + source: "OpenTelemetry file exporter output for Copilot calls.", + why: "Enriches analysis with per-LLM-call metrics and AIU details.", + commands: ["copilot-experiments analyze --file --otel-file "], + }, + { + id: "session-events", + parent: "agent", + label: "copilot-session/**/events.jsonl", + path: ".../agent/copilot-session//events.jsonl", + kind: "analysis", + badge: "source of truth", + owner: "GitHub Copilot CLI", + source: "Native Copilot CLI session log.", + why: "Primary source for turns, tool calls, tokens, AIU, and rich analysis.", + commands: [ + "copilot-experiments analyze / --trial 1", + "copilot-experiments analyze --file ", + ], + }, + { + id: "verifier", + parent: "trial-dir", + label: "verifier/", + path: "...//verifier/", + kind: "trial", + badge: "Pier", + owner: "Pier verifier", + source: "Verifier outputs, rewards, and grading artifacts.", + why: "Connects agent behavior to the solved/unsolved measurement.", + commands: [], + }, + { + id: "artifacts", + parent: "trial-dir", + label: "artifacts/", + path: "...//artifacts/", + kind: "trial", + badge: "Pier", + owner: "Pier", + source: "Downloaded artifacts requested by the job config.", + why: "Keeps extra run evidence beside the trial that produced it.", + commands: [], + }, + { + id: "summary", + parent: "run-dir", + label: "summary.json / summary.md", + path: "jobs///summary.json", + kind: 
"derived", + badge: "derived", + owner: "copilot-experiments", + source: "Generated from Pier result files and Copilot-native logs.", + why: "Gives the familiar variant/task aggregate shape for show and reports.", + commands: ["copilot-experiments show /"], + }, + { + id: "results", + parent: "repo", + label: "results/", + path: "results/", + kind: "derived", + badge: "derived", + owner: "copilot-experiments", + source: "Derived index plus legacy Python experiment runs.", + why: "The SQLite index is rebuildable. Legacy run data remains readable during migration.", + commands: ["copilot-experiments reindex"], + }, + { + id: "index-db", + parent: "results", + label: "index.db", + path: "results/index.db", + kind: "derived", + badge: "cache", + owner: "copilot-experiments", + source: "SQLite cache derived from jobs/ and legacy results/.", + why: "Speeds up cross-run queries; never the source of truth.", + commands: ["copilot-experiments reindex"], + }, + { + id: "legacy-results", + parent: "results", + label: "//...", + path: "results///", + kind: "legacy", + badge: "legacy", + owner: "legacy harness", + source: "Older Python Experiment/Task/Variant runs.", + why: "Kept for migration and historical data; new Pier runs use jobs/.", + commands: ["copilot-experiments show ", "copilot-experiments analyze "], + }, + { + id: "guidance", + parent: "repo", + label: "README.md / AGENTS.md / .apm/", + path: "README.md, AGENTS.md, .apm/", + kind: "guidance", + badge: "committed", + owner: "experiment author", + source: "Human and agent guidance for the experiment repo.", + why: "Makes the repo self-explanatory for people and for Copilot agents working inside it.", + commands: ["copilot-experiments list"], + }, +]; + +const flow = [ + ["Author task", "tasks//instruction.md"], + ["Define job", "experiments/.yaml"], + ["Run", "copilot-experiments run"], + ["Concrete output", "jobs///"], + ["Inspect/analyze", "show | inspect | analyze /"], + ["Derived cache", "results/index.db"], 
+]; + +function htmlEscape(value) { + return String(value) + .replaceAll("&", "&") + .replaceAll("<", "<") + .replaceAll(">", ">") + .replaceAll('"', """); +} + +function renderHtml() { + const data = JSON.stringify(structure).replaceAll("<", "\\u003c"); + const flowData = JSON.stringify(flow).replaceAll("<", "\\u003c"); + return ` + + + + +Experiment repository structure + + + +
+
+

Experiment repository structure

+

A didactic map of a Pier-first copilot-experiments repo. New runs use jobs/<job-name>/<run-id>/, and copilot-experiments list prints the selectors accepted by show, inspect, and analyze.

+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+ + +`; +} + +async function startServer(instanceId) { + const server = createServer((req, res) => { + if (req.url === "/structure.json") { + res.setHeader("Content-Type", "application/json; charset=utf-8"); + res.end(JSON.stringify({ structure, flow })); + return; + } + + res.setHeader("Content-Type", "text/html; charset=utf-8"); + res.end(renderHtml(instanceId)); + }); + await new Promise((resolve) => server.listen(0, "127.0.0.1", resolve)); + const address = server.address(); + const port = typeof address === "object" && address ? address.port : 0; + return { server, url: `http://127.0.0.1:${port}/` }; +} + +await joinSession({ + canvases: [ + createCanvas({ + id: "experiment-repository-structure", + displayName: "Experiment repository structure", + description: "Interactive didactic map of a Pier-first copilot-experiments repository layout.", + actions: [ + { + name: "summarize", + description: "Return a concise summary of the experiment repository structure.", + handler: async () => ({ + layout: "Pier runs live at jobs///.", + selector: "Use copilot-experiments list, then pass job-name/run-id to show, inspect, or analyze.", + sourceOfTruth: "jobs/ and legacy results/ on disk; results/index.db is derived.", + nodes: structure.length, + }), + }, + ], + open: async (ctx) => { + let entry = servers.get(ctx.instanceId); + if (!entry) { + entry = await startServer(ctx.instanceId); + servers.set(ctx.instanceId, entry); + } + return { + title: "Experiment repository structure", + status: "Pier-first layout with copyable run selectors", + url: entry.url, + }; + }, + onClose: async (ctx) => { + const entry = servers.get(ctx.instanceId); + if (entry) { + servers.delete(ctx.instanceId); + await new Promise((resolve) => entry.server.close(() => resolve())); + } + }, + }), + ], +}); diff --git a/README.md b/README.md index 1e440d9..e602e70 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ flowchart LR J --> P["Pier backend\nsandbox + verifier + artifacts"] P --> 
A["copilot-cli installed agent\nreal copilot binary"] A --> S["native Copilot\ncopilot-session/**/events.jsonl"] - P --> O["jobs//\nPier result.json + trials"] + P --> O["jobs///\nPier result.json + trials"] S --> C["Copilot-native analysis\nAIU, tokens, tools, turns"] O --> R["summary.json / summary.md\nshow / inspect / analyze"] O --> I["results/index.db\nderived SQLite index"] @@ -46,6 +46,7 @@ uv run copilot-experiments run --dry-run # run for real through Pier uv run copilot-experiments run +uv run copilot-experiments list uv run copilot-experiments show --last uv run copilot-experiments analyze --last ``` @@ -60,6 +61,7 @@ export COPILOT_EXPERIMENTS_REPO=/path/to/github-copilot-lab uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments init my-experiments cd my-experiments uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments run --dry-run +uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments list uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments show --last ``` @@ -74,10 +76,14 @@ login`) and a Pier-supported execution backend such as Docker. `run` preflights backend before creating a job; for Docker this checks the CLI, Compose plugin, and daemon connection so missing WSL integration fails before an empty Pier job is recorded. -Each `run` is a new measurement. If the configured Pier `job_name` already exists under `jobs/`, -`copilot-experiments` writes the rerun to a timestamped job name instead of silently reusing the -completed job. Pass `--resume` only when you intentionally want Pier's native resume behavior for an -interrupted job. +Each `run` is a new measurement. The configured Pier `job_name` remains the stable experiment +identity, while each execution gets a timestamped run directory under +`jobs///`. Pass `--resume` only when you intentionally want to reuse the latest +run directory for that job and let Pier skip already-completed trials. 
+ +Use `copilot-experiments list` after a run to copy the selector for a concrete execution. Pier +selectors use `job-name/run-id`; passing just `job-name` selects that job's latest run, while +`--last` selects the most recent stored run overall. ## Bundled examples @@ -96,13 +102,13 @@ uv run copilot-experiments analyze --root examples/tracer_bullet --last | --- | --- | | `init ` | Scaffold a standalone Pier experiment repository. | | `deepswe-import ` | Generate a Pier job config for a cloned DeepSWE checkout, `tasks/` corpus, or single task. | -| `run [name]` | Discover Pier job configs in `experiments/` and run them. Reruns create a fresh timestamped Pier job when the configured name already exists. Falls back to legacy Python experiments when no Pier configs exist. | +| `run [name]` | Discover Pier job configs in `experiments/` and run them. Each run writes to a fresh `jobs///` directory. Falls back to legacy Python experiments when no Pier configs exist. | | `run --dry-run` | Validate Pier job configs, or run the legacy ephemeral mock dry-run for legacy experiments. | | `run --resume` | Resume an existing Pier job directory and skip already-completed matching trials. | -| `list` | List Pier job configs, legacy experiments, and stored jobs/runs. | -| `show ` / `show --last` | Print a summary for a Pier job or legacy run. | -| `analyze ` / `analyze --last` / `analyze --file ` | Render a rich overview of a native Copilot session log. | -| `inspect ` | Drill into stored trials and status. | +| `list` | List Pier job configs, legacy experiments, and copyable run selectors. | +| `show ` / `show --last` | Print a summary for a Pier run (`job` or `job/run`) or legacy run id. | +| `analyze ` / `analyze --last` / `analyze --file ` | Render a rich overview of a native Copilot session log. | +| `inspect ` | Drill into stored trials and status for a Pier run (`job` or `job/run`) or legacy run id. 
| | `reindex` | Rebuild the derived SQLite index from `jobs/` and legacy `results/`. | ## Documentation diff --git a/docs/adr/0019-use-nested-pier-run-directories.md b/docs/adr/0019-use-nested-pier-run-directories.md new file mode 100644 index 0000000..1f49d28 --- /dev/null +++ b/docs/adr/0019-use-nested-pier-run-directories.md @@ -0,0 +1,51 @@ +# 0019. Use nested Pier run directories + +- **Status:** Accepted +- **Date:** 2026-06-27 +- **Deciders:** Project maintainers + +## Context + +Pier names each job output directory from `job_name`. Re-running the same experiment with the same +`job_name` would naturally target the same directory, while the previous harness behavior created +the first run at `jobs//` and later reruns at timestamp-suffixed sibling directories such +as `jobs/-20260620-153000/`. + +That mixed stable identity and concrete execution identity in one string. It also made command-line +lookup unclear: users could pass `--last`, but it was not obvious how to discover a run id, how to +select an earlier run, or whether a suffixed directory was a new job or a rerun of the same job. + +The filesystem remains the source of truth, and `results/index.db` remains a derived cache. Existing +flat Pier job directories must remain readable during migration. + +## Decision + +We will store new Pier executions under `jobs///`. + +The configured `job_name` is the stable experiment identity. Each concrete execution gets a +timestamp run id, with numeric collision suffixes when needed. The harness runs Pier by setting +Pier's `jobs_dir` to `jobs/` and Pier's concrete `job_name` to the run id, then writes a +`copilot-experiments-run.json` manifest into the job output so summaries, indexing, and lookup can +recover the stable job name and concrete run id. + +The CLI will expose copyable selectors through `copilot-experiments list`: + +- `job-name/run-id` selects one exact Pier run. +- `job-name` selects the latest run for that Pier job. 
+- `--last` selects the most recent stored run overall. + +Legacy flat Pier jobs at `jobs//` remain discoverable and resumable. + +## Consequences + +The output tree now separates stable job identity from repeated measurements, so reruns are easier +to compare and explain. `show`, `inspect`, and `analyze` can address exact runs without adding a +parallel command family. + +The harness owns a small manifest file in each new Pier run directory because Pier's native +`config.json` only knows the concrete run id once the job is launched. Discovery must avoid +mistaking legacy flat job trial directories for nested runs; nested child directories under a legacy +flat job are treated as runs only when they contain the harness manifest. + +Older flat jobs remain supported, but new documentation and generated experiment repos should teach +the nested layout and `list`-driven selector workflow. diff --git a/docs/adr/README.md b/docs/adr/README.md index d3b3897..d2d933c 100644 --- a/docs/adr/README.md +++ b/docs/adr/README.md @@ -35,3 +35,4 @@ We follow the lightweight format popularized by | [0016](0016-use-deepswe-for-large-benchmark-protocols.md) | Use DeepSWE for large benchmark protocols | Accepted | | [0017](0017-import-deepswe-as-pier-dataset.md) | Import DeepSWE as a Pier dataset config | Accepted | | [0018](0018-adopt-pytest-cov-for-local-coverage-analysis.md) | Adopt pytest-cov for local coverage analysis | Accepted | +| [0019](0019-use-nested-pier-run-directories.md) | Use nested Pier run directories | Accepted | diff --git a/docs/analysis.md b/docs/analysis.md index 5fe3c34..3e4b366 100644 --- a/docs/analysis.md +++ b/docs/analysis.md @@ -26,9 +26,15 @@ This page covers the second one and the `analyze` command that renders it. 
# Most recent Pier job (first trial by default) uv run copilot-experiments analyze --last -# A specific Pier job / trial +# Discover copyable selectors +uv run copilot-experiments list + +# A specific Pier job's latest run / trial uv run copilot-experiments analyze tracer-bullet-textstats --trial 1 +# A specific Pier run / trial +uv run copilot-experiments analyze tracer-bullet-textstats/20260620-153000 --trial 1 + # Any events.jsonl on disk — a stored trial log, or a live session under # ~/.copilot/session-state//events.jsonl uv run copilot-experiments analyze --file path/to/events.jsonl diff --git a/docs/architecture.md b/docs/architecture.md index 4a93272..938b656 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -19,7 +19,7 @@ flowchart TD AGENT --> ATIF["/logs/agent/trajectory.json"] OTEL --> ATIF JOB --> VERIFY["Pier verifier\ntests/test.sh -> reward.txt/json"] - JOB --> OUT["jobs///"] + JOB --> OUT["jobs////"] EVENTS --> ANALYSIS["sessionlog.py + analysis.py"] OTEL --> ANALYSIS ATIF --> FALLBACK["ATIF fallback metrics"] @@ -75,7 +75,7 @@ During normalization, `name: copilot-cli` becomes ## Design invariants -1. **Pier jobs are canonical.** `jobs//` is the primary source of truth for new runs. +1. **Pier jobs are canonical.** `jobs///` is the primary source of truth for new runs. 2. **SQLite is derived.** `results/index.db` can be rebuilt from `jobs/` and legacy `results/`. 3. **Copilot logs are primary for Copilot metrics.** ATIF is a fallback and cross-agent view. 4. **Copilot CLI is not reimplemented.** The installed agent shells out to the real CLI. 
diff --git a/docs/authoring-experiments.md b/docs/authoring-experiments.md index 24b6cb8..3f749f6 100644 --- a/docs/authoring-experiments.md +++ b/docs/authoring-experiments.md @@ -139,6 +139,7 @@ The generated config uses `datasets:` for a corpus and `tasks:` for a single tas ```bash uv run copilot-experiments run --dry-run uv run copilot-experiments run +uv run copilot-experiments list uv run copilot-experiments show --last uv run copilot-experiments analyze --last --trial 1 ``` @@ -152,6 +153,7 @@ export COPILOT_EXPERIMENTS_REPO=/path/to/github-copilot-lab uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments run --dry-run uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments run +uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments list uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments show --last uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments analyze --last --trial 1 ``` @@ -171,8 +173,13 @@ common WSL/Docker Desktop integration issues before a trial can fail without Cop Pier itself resumes existing matching job directories and skips trials that already have `result.json`. `copilot-experiments run` treats a plain rerun as a fresh measurement instead: when -`jobs//` already exists, it appends a timestamp to the Pier job name for the new run. Pass -`--resume` to opt into Pier's native resume behavior for interrupted jobs. +you run a job again, the configured `job_name` is used as a stable grouping directory and each execution gets a +timestamped run id under `jobs/<job-name>/<run-id>/`. Pass `--resume` to reuse the latest existing +run directory for that job and opt into Pier's native skip-completed-trials behavior. + +After a run, `copilot-experiments list` prints copyable selectors. Use `job-name/run-id` to inspect +or analyze an exact Pier execution, `job-name` for that job's latest run, or `--last` for the most +recent stored run across all jobs. 
## Legacy Python experiments diff --git a/docs/collecting-run-data.md b/docs/collecting-run-data.md index 43a42a8..c990dc4 100644 --- a/docs/collecting-run-data.md +++ b/docs/collecting-run-data.md @@ -3,7 +3,7 @@ This page is the field guide for collecting the most complete record of what happened during a GitHub Copilot CLI session. The primary artifact is Copilot's native per-session JSONL event stream, `events.jsonl`. In the current Pier-first harness, that log is captured inside each Pier -trial under `jobs///agent/copilot-session//events.jsonl`. +trial under `jobs////agent/copilot-session//events.jsonl`. OpenTelemetry (OTel) export is a complementary live-observability path. ATIF `trajectory.json` is a cross-agent compatibility artifact and fallback, not a replacement for @@ -80,25 +80,27 @@ After Pier downloads trial logs, the canonical experiment-repo layout is: ```text jobs/ / - config.json - result.json - summary.json # derived by copilot-experiments - summary.md # derived by copilot-experiments - / + / config.json result.json - agent/ - copilot-cli.jsonl - copilot-cli.txt - trajectory.json - copilot-otel.jsonl - copilot-session/ - / - events.jsonl - verifier/ - reward.txt - reward.json - artifacts/ + copilot-experiments-run.json + summary.json # derived by copilot-experiments + summary.md # derived by copilot-experiments + / + config.json + result.json + agent/ + copilot-cli.jsonl + copilot-cli.txt + trajectory.json + copilot-otel.jsonl + copilot-session/ + / + events.jsonl + verifier/ + reward.txt + reward.json + artifacts/ ``` `show`, `analyze`, `inspect`, and `reindex` read this tree. 
For Copilot trials they prefer native diff --git a/docs/deepswe.md b/docs/deepswe.md index 5aa5bcf..5b36fac 100644 --- a/docs/deepswe.md +++ b/docs/deepswe.md @@ -61,6 +61,7 @@ Validate and run it like any other Pier experiment: ```bash uv run copilot-experiments run --dry-run uv run copilot-experiments run deepswe-smoke +uv run copilot-experiments list uv run copilot-experiments show --last uv run copilot-experiments analyze --last --trial 1 ``` diff --git a/docs/results-format.md b/docs/results-format.md index e81e98a..06e533a 100644 --- a/docs/results-format.md +++ b/docs/results-format.md @@ -11,36 +11,40 @@ For a source-by-source explanation of what can be captured around a Copilot CLI ``` jobs/ / - config.json - result.json - summary.json # written by copilot-experiments - summary.md # written by copilot-experiments - / + / config.json result.json - agent/ - copilot-cli.jsonl - copilot-cli.txt - trajectory.json - copilot-otel.jsonl # Copilot OTel file export, when no custom OTLP destination overrides it - copilot-session/ - / - events.jsonl - verifier/ - reward.txt - reward.json - artifacts/ + copilot-experiments-run.json + summary.json # written by copilot-experiments + summary.md # written by copilot-experiments + / + config.json + result.json + agent/ + copilot-cli.jsonl + copilot-cli.txt + trajectory.json + copilot-otel.jsonl # Copilot OTel file export, when no custom OTLP destination overrides it + copilot-session/ + / + events.jsonl + verifier/ + reward.txt + reward.json + artifacts/ ``` Pier owns `config.json`, `result.json`, trial directories, logs, verifier outputs, and artifact -download. `copilot-experiments` derives summaries and indexes from that tree. +download. `copilot-experiments` adds `copilot-experiments-run.json` to preserve the stable +`job_name` plus concrete `run_id`, then derives summaries and indexes from that tree. ## Key files | File | Meaning | | --- | --- | -| `jobs//result.json` | Pier job-level status and stats. 
| -| `jobs///result.json` | Pier trial status, agent info, verifier result, exceptions, timings. | +| `jobs///result.json` | Pier job-level status and stats for one execution. | +| `jobs///copilot-experiments-run.json` | Stable job name and concrete run id used by summaries, lookup, and indexing. | +| `jobs////result.json` | Pier trial status, agent info, verifier result, exceptions, timings. | | `agent/trajectory.json` | ATIF trajectory emitted by the installed agent. Copilot agent steps include OTel per-LLM-call metrics when `copilot-otel.jsonl` is available; the file is also used as a fallback for non-Copilot agents. | | `agent/copilot-cli.jsonl` / `.txt` | Raw Copilot CLI output streams. Useful for auth or CLI failures. | | `agent/copilot-session/**/events.jsonl` | Native Copilot session log. Primary source for Copilot turns, tool calls, tokens, AIU, and analysis. | @@ -72,9 +76,10 @@ their `results///.../trials//` layout. New Pier tables: ```sql -pier_jobs(job_name PK, job_dir, started_at, finished_at, n_trials, success_rate, status) -pier_trials(id PK, job_name, variant_slug, task_slug, trial_name, success, status, - n_turns, n_tool_calls, total_tokens, aiu, model, error) +pier_jobs(id PK, job_name, run_id, job_dir, started_at, finished_at, n_trials, + success_rate, status) +pier_trials(id PK, job_id, job_name, run_id, variant_slug, task_slug, trial_name, + success, status, n_turns, n_tool_calls, total_tokens, aiu, model, error) ``` Legacy tables (`experiments`, `runs`, `variants`, `tasks`, `trials`) remain for old Python runs. 
@@ -82,11 +87,17 @@ Legacy tables (`experiments`, `runs`, `variants`, `tasks`, `trials`) remain for ## Analyzing a trial ```bash +uv run copilot-experiments list uv run copilot-experiments analyze --last --trial 1 uv run copilot-experiments analyze <job-name> --trial 1 -uv run copilot-experiments analyze --file jobs/<job-name>/<trial-name>/agent/copilot-session/.../events.jsonl +uv run copilot-experiments analyze <job-name>/<run-id> --trial 1 +uv run copilot-experiments analyze --file jobs/<job-name>/<run-id>/<trial-name>/agent/copilot-session/.../events.jsonl ``` +`list` is the discovery command for run ids. For Pier outputs, its `selector (job/run)` column is +the exact string accepted by `show`, `inspect`, and `analyze`. Passing only `<job-name>` selects +that job's latest run; passing `<job-name>/<run-id>` selects one concrete execution. + If the selected Pier trial has no native Copilot `events.jsonl`, `analyze` falls back to `agent/trajectory.json` when present; otherwise it reports that no Copilot session log or trajectory is available. When Pier recorded a trial exception before the agent ran, `analyze` diff --git a/src/copilot_experiments/cli.py b/src/copilot_experiments/cli.py index fe858b5..4bd797d 100644 --- a/src/copilot_experiments/cli.py +++ b/src/copilot_experiments/cli.py @@ -5,7 +5,9 @@ import importlib.util import sys from collections.abc import Callable +from dataclasses import dataclass from pathlib import Path +from typing import Literal import typer from rich.console import Console @@ -29,7 +31,9 @@ from .pier_results import ( describe_missing_pier_analysis_source, iter_pier_trial_summaries, + pier_job_label, resolve_pier_trial_analysis_source, + write_pier_run_manifest, write_pier_summary, ) from .render import render_session_analysis @@ -67,6 +71,13 @@ def _force_utf8_streams() -> None: err = Console(stderr=True) +@dataclass(frozen=True) +class ResolvedRun: + kind: Literal["legacy", "pier"] + path: Path + selector: str + + # --------------------------------------------------------------------------- # # Experiment discovery #
--------------------------------------------------------------------------- # @@ -318,12 +329,13 @@ def run( if verbose: prepared.config.debug = True inject_copilot_token(prepared.config, auth.token) - console.print(f"[bold]Running Pier job[/bold] {prepared.run_name}") - if prepared.renamed: + console.print(f"[bold]Running Pier job[/bold] {prepared.label}") + if prepared.resumed: + console.print(f"[dim]resume:[/dim] reusing existing Pier run {prepared.label}") + else: console.print( - f"[dim]existing job[/dim] {prepared.requested_name} " - f"[dim]found; writing fresh rerun to[/dim] {prepared.run_name} " - "[dim](use --resume to reuse the existing job)[/dim]" + f"[dim]run:[/dim] writing fresh run to " + f"{Path(prepared.config.jobs_dir) / prepared.run_name}" ) try: run_result = run_pier_job(prepared.config) @@ -331,6 +343,11 @@ def run( err.print(f"[red]Pier job failed:[/red] {type(exc).__name__}: {exc}") any_failures = True continue + write_pier_run_manifest( + run_result.job_dir, + job_name=prepared.requested_name, + run_id=prepared.run_name, + ) summary = write_pier_summary(run_result.job_dir) _print_run_summary(summary) _warn_failed_pier_trials(run_result.job_dir) @@ -407,7 +424,7 @@ def run( def list_cmd( root: Path | None = typer.Option(None, "--root", help="Experiment repository root."), ) -> None: - """List experiments and past runs.""" + """List experiment definitions and concrete run selectors.""" root = Path(root or Path.cwd()) layout = Layout(root) pier_specs = discover_pier_job_configs(root) @@ -437,11 +454,10 @@ def list_cmd( console.print(table) runs = index_list_runs(layout) - if not runs: - console.print("[dim]No runs yet.[/dim]") - else: - table = Table(title="Runs") - table.add_column("run id") + pier_jobs = layout.iter_pier_jobs() + if runs: + table = Table(title="Experiment runs") + table.add_column("selector") table.add_column("experiment") table.add_column("started") table.add_column("trials", justify="right") @@ -457,20 +473,24 @@ def 
list_cmd( ) console.print(table) - pier_jobs = layout.iter_pier_jobs() if not pier_jobs: + if not runs: + console.print("[dim]No runs yet.[/dim]") return - table = Table(title="Runs") - table.add_column("pier job") + table = Table(title="Pier runs") + table.add_column("selector (job/run)", no_wrap=True) table.add_column("started") table.add_column("trials", justify="right") + table.add_column("success", justify="right") table.add_column("status") for job_dir in pier_jobs: summary = write_pier_summary(job_dir) + sr = summary.get("overall_success_rate") table.add_row( - job_dir.name, + str(summary.get("pier_job_id") or pier_job_label(job_dir)), (summary.get("started_at") or "")[:19], str(summary.get("n_trials") or 0), + "-" if sr is None else f"{sr * 100:.0f}%", str(summary.get("status") or "-"), ) console.print(table) @@ -478,58 +498,59 @@ def list_cmd( @app.command() def show( - run_id: str | None = typer.Argument(None, help="Run id or unique prefix."), - last: bool = typer.Option(False, "--last", help="Show the most recent run."), + selector: str | None = typer.Argument( + None, + help=( + "Run selector from `list`: run id/prefix for legacy runs, Pier job for that " + "job's latest run, or Pier job/run id." 
+ ), + ), + last: bool = typer.Option(False, "--last", help="Show the most recent stored run."), root: Path | None = typer.Option(None, "--root", help="Experiment repository root."), ) -> None: """Print a run summary and per-variant comparison.""" root = Path(root or Path.cwd()) layout = Layout(root) - pier_job = _resolve_pier_job(layout, last=last, run_id=run_id) - run_dir = ( - None - if last and pier_job is not None - else (layout.latest_run() if last else (layout.find_run(run_id) if run_id else None)) - ) - if run_dir is None: - if pier_job is not None: - summary = write_pier_summary(pier_job) - _print_run_summary(summary) - console.print(f"\n[dim]{pier_job / 'summary.md'}[/dim]") - return - if run_dir is None: - err.print("[red]Run not found.[/red] Pass a run id or --last.") + resolved = _resolve_run(layout, last=last, selector=selector) + if resolved is None: + _print_run_not_found(selector) raise typer.Exit(1) - _print_run_summary(read_json(run_dir / "summary.json")) - console.print(f"\n[dim]{run_dir / 'summary.md'}[/dim]") + if resolved.kind == "pier": + summary = write_pier_summary(resolved.path) + _print_run_summary(summary) + console.print(f"\n[dim]{resolved.path / 'summary.md'}[/dim]") + return + _print_run_summary(read_json(resolved.path / "summary.json")) + console.print(f"\n[dim]{resolved.path / 'summary.md'}[/dim]") @app.command() def inspect( - run_id: str | None = typer.Argument(None, help="Run id or unique prefix."), + selector: str | None = typer.Argument( + None, + help=( + "Run selector from `list`: run id/prefix for legacy runs, Pier job for that " + "job's latest run, or Pier job/run id." 
+ ), + ), variant: str | None = typer.Option(None, "--variant", help="Variant slug."), task: str | None = typer.Option(None, "--task", help="Task slug."), trial: int | None = typer.Option(None, "--trial", help="Trial number."), events: int = typer.Option(20, "--events", help="Number of session events to show."), - last: bool = typer.Option(False, "--last", help="Inspect the most recent run."), + last: bool = typer.Option(False, "--last", help="Inspect the most recent stored run."), root: Path | None = typer.Option(None, "--root", help="Experiment repository root."), ) -> None: """Drill into a run's variants, tasks, trials, and session events.""" root = Path(root or Path.cwd()) layout = Layout(root) - pier_job = _resolve_pier_job(layout, last=last, run_id=run_id) - run_dir = ( - None - if last and pier_job is not None - else (layout.latest_run() if last else (layout.find_run(run_id) if run_id else None)) - ) - if run_dir is None: - if pier_job is not None: - _inspect_pier_job(pier_job) - return - if run_dir is None: - err.print("[red]Run not found.[/red] Pass a run id or --last.") + resolved = _resolve_run(layout, last=last, selector=selector) + if resolved is None: + _print_run_not_found(selector) raise typer.Exit(1) + if resolved.kind == "pier": + _inspect_pier_job(resolved.path) + return + run_dir = resolved.path variants_dir = run_dir / "variants" if variant is None: @@ -604,7 +625,13 @@ def inspect( @app.command() def analyze( - run_id: str | None = typer.Argument(None, help="Run id or unique prefix."), + selector: str | None = typer.Argument( + None, + help=( + "Run selector from `list`: run id/prefix for legacy runs, Pier job for that " + "job's latest run, or Pier job/run id." 
+ ), + ), variant: str | None = typer.Option(None, "--variant", help="Variant slug (default: first)."), task: str | None = typer.Option(None, "--task", help="Task slug (default: first)."), trial: int | None = typer.Option(None, "--trial", help="Trial number (default: first)."), @@ -614,7 +641,7 @@ def analyze( otel_file: Path | None = typer.Option( None, "--otel-file", help="Optional Copilot OTel JSONL file to enrich analysis." ), - last: bool = typer.Option(False, "--last", help="Analyze the most recent run."), + last: bool = typer.Option(False, "--last", help="Analyze the most recent stored run."), max_turns: int = typer.Option(0, "--max-turns", help="Limit timeline rows (0 = all)."), root: Path | None = typer.Option(None, "--root", help="Experiment repository root."), ) -> None: @@ -632,37 +659,32 @@ def analyze( root = Path(root or Path.cwd()) layout = Layout(root) - pier_job = _resolve_pier_job(layout, last=last, run_id=run_id) - run_dir = ( - None - if last and pier_job is not None - else (layout.latest_run() if last else (layout.find_run(run_id) if run_id else None)) - ) - if run_dir is None: - if pier_job is not None: - source_path, label, source_kind, discovered_otel = resolve_pier_trial_analysis_source( - pier_job, trial - ) - if source_path is None: - err.print(f"[red]No Copilot session log or trajectory found in[/red] {pier_job}") - diagnostic = describe_missing_pier_analysis_source(pier_job, trial) - if diagnostic: - err.print(f"[yellow]{diagnostic}[/yellow]") - raise typer.Exit(1) - selected_otel = otel_file or discovered_otel - analysis = ( - analyze_events( - load_events(source_path), - load_events(selected_otel) if selected_otel is not None else None, - ) - if source_kind == "events" - else analyze_trajectory(read_json(source_path)) - ) - render_session_analysis(analysis, console, title=label, max_turns=max_turns) - return - if run_dir is None: - err.print("[red]Run not found.[/red] Pass a run id, --last, or --file.") + resolved = 
_resolve_run(layout, last=last, selector=selector) + if resolved is None: + _print_run_not_found(selector, file_hint=True) raise typer.Exit(1) + if resolved.kind == "pier": + source_path, label, source_kind, discovered_otel = resolve_pier_trial_analysis_source( + resolved.path, trial + ) + if source_path is None: + err.print(f"[red]No Copilot session log or trajectory found in[/red] {resolved.path}") + diagnostic = describe_missing_pier_analysis_source(resolved.path, trial) + if diagnostic: + err.print(f"[yellow]{diagnostic}[/yellow]") + raise typer.Exit(1) + selected_otel = otel_file or discovered_otel + analysis = ( + analyze_events( + load_events(source_path), + load_events(selected_otel) if selected_otel is not None else None, + ) + if source_kind == "events" + else analyze_trajectory(read_json(source_path)) + ) + render_session_analysis(analysis, console, title=label, max_turns=max_turns) + return + run_dir = resolved.path events_path, label, discovered_otel = _resolve_trial_events(run_dir, variant, task, trial) if events_path is None: @@ -740,14 +762,70 @@ def _resolve_trial_events( ) -def _resolve_pier_job(layout: Layout, *, last: bool, run_id: str | None) -> Path | None: +def _resolve_run(layout: Layout, *, last: bool, selector: str | None) -> ResolvedRun | None: if last: - return layout.latest_pier_job() - if run_id: - return layout.find_pier_job(run_id) + return _latest_resolved_run(layout) + if selector is None: + return None + + legacy = layout.find_run(selector) + pier = layout.find_pier_job(selector) + if pier is not None and (legacy is None or "/" in selector): + return ResolvedRun("pier", pier, pier_job_label(pier)) + if legacy is not None: + return ResolvedRun("legacy", legacy, legacy.name) + if pier is not None: + return ResolvedRun("pier", pier, pier_job_label(pier)) return None +def _latest_resolved_run(layout: Layout) -> ResolvedRun | None: + candidates: list[tuple[str, str, ResolvedRun]] = [] + for _experiment_slug, run_id, run_dir in 
layout.iter_runs(): + candidates.append( + (_legacy_run_started_at(run_dir), run_id, ResolvedRun("legacy", run_dir, run_id)) + ) + for job_dir in layout.iter_pier_jobs(): + selector = pier_job_label(job_dir) + candidates.append( + (_pier_run_started_at(job_dir), selector, ResolvedRun("pier", job_dir, selector)) + ) + if not candidates: + return None + return max(candidates, key=lambda candidate: (candidate[0], candidate[1]))[2] + + +def _legacy_run_started_at(run_dir: Path) -> str: + summary_path = run_dir / "summary.json" + run_path = run_dir / "run.json" + if summary_path.exists(): + return str(read_json(summary_path).get("started_at") or "") + if run_path.exists(): + return str(read_json(run_path).get("started_at") or "") + return "" + + +def _pier_run_started_at(job_dir: Path) -> str: + result_path = job_dir / "result.json" + if result_path.exists(): + return str(read_json(result_path).get("started_at") or "") + return "" + + +def _print_run_not_found(selector: str | None, *, file_hint: bool = False) -> None: + if selector: + err.print(f"[red]Run not found:[/red] {selector!r}") + else: + err.print("[red]Run not found.[/red] Pass a run selector or --last.") + hints = [ + "Use `copilot-experiments list` to copy a selector.", + "Pier selectors look like `job-name/run-id`; `job-name` selects that job's latest run.", + ] + if file_hint: + hints.append("Use `--file path/to/events.jsonl` to analyze a session log directly.") + err.print("[dim]" + " ".join(hints) + "[/dim]") + + def _print_dry_run_report(report: DryRunReport) -> None: table = Table(title=f"Dry-run · {report.experiment}", show_lines=False) table.add_column("", justify="center", width=3) @@ -851,7 +929,7 @@ def _warn_failed_pier_trials(job_dir: Path) -> None: if not problems: return err.print( - f"[yellow]Warning:[/yellow] Pier job [bold]{job_dir.name}[/bold] had " + f"[yellow]Warning:[/yellow] Pier job [bold]{pier_job_label(job_dir)}[/bold] had " f"{len(problems)} harness failure(s). 
Inspect the captured trial result:" ) for line in problems: @@ -860,11 +938,11 @@ def _warn_failed_pier_trials(job_dir: Path) -> None: def _inspect_pier_job(job_dir: Path) -> None: summary = write_pier_summary(job_dir) - console.print(f"[bold]Pier job[/bold]: {job_dir.name}") + console.print(f"[bold]Pier job[/bold]: {pier_job_label(job_dir)}") console.print(f"[bold]summary[/bold]: {job_dir / 'summary.json'}") _print_run_summary(summary) - table = Table(title=f"Trials in {job_dir.name}") + table = Table(title=f"Trials in {pier_job_label(job_dir)}") table.add_column("trial") table.add_column("status") table.add_column("success") diff --git a/src/copilot_experiments/index.py b/src/copilot_experiments/index.py index 1dfe6d1..16417b2 100644 --- a/src/copilot_experiments/index.py +++ b/src/copilot_experiments/index.py @@ -11,7 +11,7 @@ from pathlib import Path from ._util import read_json -from .pier_results import build_pier_summary, iter_pier_trial_summaries +from .pier_results import build_pier_summary, iter_pier_trial_summaries, pier_job_identity from .storage import Layout SCHEMA = """ @@ -85,7 +85,9 @@ error TEXT ); CREATE TABLE IF NOT EXISTS pier_jobs ( - job_name TEXT PRIMARY KEY, + id TEXT PRIMARY KEY, + job_name TEXT, + run_id TEXT, job_dir TEXT, started_at TEXT, finished_at TEXT, @@ -95,7 +97,9 @@ ); CREATE TABLE IF NOT EXISTS pier_trials ( id INTEGER PRIMARY KEY AUTOINCREMENT, + job_id TEXT, job_name TEXT, + run_id TEXT, variant_slug TEXT, task_slug TEXT, trial_name TEXT, @@ -124,6 +128,18 @@ def _migrate(conn: sqlite3.Connection) -> None: if column not in existing: conn.execute(ddl) + pier_job_columns = {row["name"] for row in conn.execute("PRAGMA table_info(pier_jobs)")} + pier_trial_columns = {row["name"] for row in conn.execute("PRAGMA table_info(pier_trials)")} + if ( + pier_job_columns + and {"id", "run_id"} - pier_job_columns + or pier_trial_columns + and {"job_id", "run_id"} - pier_trial_columns + ): + conn.execute("DROP TABLE IF EXISTS pier_trials") 
+ conn.execute("DROP TABLE IF EXISTS pier_jobs") + conn.executescript(SCHEMA) + def connect(db_path: Path) -> sqlite3.Connection: db_path.parent.mkdir(parents=True, exist_ok=True) @@ -256,15 +272,20 @@ def index_pier_job_dir(conn: sqlite3.Connection, job_dir: Path) -> None: """Insert (or replace) one Pier job into the derived index.""" summary = build_pier_summary(job_dir) - job_name = job_dir.name - conn.execute("DELETE FROM pier_jobs WHERE job_name=?", (job_name,)) - conn.execute("DELETE FROM pier_trials WHERE job_name=?", (job_name,)) + identity = pier_job_identity(job_dir) + job_id = identity["id"] + job_name = identity["job_name"] + run_id = identity["run_id"] + conn.execute("DELETE FROM pier_jobs WHERE id=?", (job_id,)) + conn.execute("DELETE FROM pier_trials WHERE job_id=?", (job_id,)) conn.execute( - "INSERT INTO pier_jobs(job_name, job_dir, started_at, finished_at, n_trials, " - "success_rate, status) VALUES (?,?,?,?,?,?,?)", + "INSERT INTO pier_jobs(id, job_name, run_id, job_dir, started_at, finished_at, " + "n_trials, success_rate, status) VALUES (?,?,?,?,?,?,?,?,?)", ( + job_id, job_name, + run_id, str(job_dir), summary.get("started_at"), summary.get("finished_at"), @@ -277,11 +298,13 @@ def index_pier_job_dir(conn: sqlite3.Connection, job_dir: Path) -> None: for trial in iter_pier_trial_summaries(job_dir): metrics = trial.get("metrics") or {} conn.execute( - "INSERT INTO pier_trials(job_name, variant_slug, task_slug, trial_name, " - "success, status, n_turns, n_tool_calls, total_tokens, aiu, model, error) " - "VALUES (?,?,?,?,?,?,?,?,?,?,?,?)", + "INSERT INTO pier_trials(job_id, job_name, run_id, variant_slug, task_slug, " + "trial_name, success, status, n_turns, n_tool_calls, total_tokens, aiu, model, " + "error) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)", ( + job_id, job_name, + run_id, trial.get("variant"), trial.get("task"), trial.get("trial_name"), diff --git a/src/copilot_experiments/pier_backend.py b/src/copilot_experiments/pier_backend.py index 
3d6afa2..672238e 100644 --- a/src/copilot_experiments/pier_backend.py +++ b/src/copilot_experiments/pier_backend.py @@ -14,6 +14,7 @@ import yaml from .pier_agents.copilot_cli import COPILOT_CLI_AGENT_NAME, CopilotCli +from .pier_results import PIER_RUN_MANIFEST COPILOT_CLI_AGENT_IMPORT_PATH = CopilotCli.import_path() @@ -40,16 +41,21 @@ class PierRunResult: @dataclass(frozen=True) class PreparedPierJob: - """A Pier config ready to run, plus any job-name adjustment made for freshness.""" + """A Pier config ready to run, plus the stable job and concrete run identity.""" config: Any requested_name: str run_name: str + resumed: bool = False @property def renamed(self) -> bool: return self.requested_name != self.run_name + @property + def label(self) -> str: + return f"{self.requested_name}/{self.run_name}" + class PierBackendPreflightError(RuntimeError): """A Pier execution backend is not available before a job starts.""" @@ -157,28 +163,31 @@ def prepare_pier_job_for_run( ) -> PreparedPierJob: """Return a run-ready config. - Pier resumes an existing matching ``jobs/`` directory and skips completed trials. - For an experiment harness, a plain ``run`` should create a new measurement instead, while - explicit ``--resume`` should preserve Pier's native behavior. + Pier treats ``jobs_dir / job_name`` as the job directory and resumes any completed + trials found there. The harness keeps the configured ``job_name`` as the stable + experiment identity, but points Pier at ``jobs//`` so every + execution has a uniform run directory. Explicit ``--resume`` reuses the latest + known run for that stable job when one exists. 
""" prepared = config.model_copy(deep=True) requested_name = str(prepared.job_name) if resume: - return PreparedPierJob(prepared, requested_name, requested_name) - - requested_dir = _job_dir(prepared) - if not requested_dir.exists(): - return PreparedPierJob(prepared, requested_name, requested_name) - - stamp = (now or datetime.now()).strftime("%Y%m%d-%H%M%S") - base = f"{requested_name}-{stamp}" - run_name = base + existing = _latest_existing_run_dir(prepared) + if existing is not None: + prepared.jobs_dir = existing.parent + prepared.job_name = existing.name + return PreparedPierJob(prepared, requested_name, existing.name, resumed=True) + + base_run_name = (now or datetime.now()).strftime("%Y%m%d-%H%M%S") + run_name = base_run_name + job_group_dir = Path(prepared.jobs_dir) / requested_name index = 2 - while (Path(prepared.jobs_dir) / run_name).exists(): - run_name = f"{base}-{index}" + while (job_group_dir / run_name).exists(): + run_name = f"{base_run_name}-{index}" index += 1 + prepared.jobs_dir = job_group_dir prepared.job_name = run_name return PreparedPierJob(prepared, requested_name, run_name) @@ -205,6 +214,32 @@ def _job_dir(config: Any) -> Path: return Path(config.jobs_dir) / str(config.job_name) +def _latest_existing_run_dir(config: Any) -> Path | None: + """Return the latest resumable run directory for a stable job config. + + New runs live at ``jobs//``. A pre-migration flat + ``jobs/`` directory may also exist, so keep it resumable when no + nested run has been created yet. 
+ """ + + flat_dir = _job_dir(config) + nested_root = flat_dir + nested = [] + if nested_root.is_dir(): + nested = sorted( + path + for path in nested_root.iterdir() + if path.is_dir() + and (path / "config.json").exists() + and (path / PIER_RUN_MANIFEST).exists() + ) + if nested: + return nested[-1] + if flat_dir.is_dir() and (flat_dir / "config.json").exists(): + return flat_dir + return None + + def _environment_type(config: Any) -> str: environment = getattr(config, "environment", None) value = getattr(environment, "type", None) diff --git a/src/copilot_experiments/pier_results.py b/src/copilot_experiments/pier_results.py index 6ddca31..2dd6be3 100644 --- a/src/copilot_experiments/pier_results.py +++ b/src/copilot_experiments/pier_results.py @@ -14,6 +14,7 @@ from .sessionlog import load_events, parse_metrics AnalysisSource = Literal["events", "trajectory"] +PIER_RUN_MANIFEST = "copilot-experiments-run.json" def iter_trial_dirs(job_dir: Path) -> list[Path]: @@ -39,6 +40,7 @@ def build_pier_summary(job_dir: Path) -> dict[str, Any]: job_dir = Path(job_dir) job_result = read_json(job_dir / "result.json") job_config = read_json(job_dir / "config.json") if (job_dir / "config.json").exists() else {} + identity = pier_job_identity(job_dir, job_config) variant_cells: dict[str, dict[str, Any]] = {} for row in iter_pier_trial_summaries(job_dir): @@ -78,9 +80,10 @@ def build_pier_summary(job_dir: Path) -> dict[str, Any]: total_aiu = sum((trial.get("metrics") or {}).get("aiu") or 0 for trial in all_trials) summary = { - "run_id": job_dir.name, - "experiment": job_config.get("job_name") or job_dir.name, - "experiment_slug": job_dir.name, + "run_id": identity["run_id"], + "experiment": identity["job_name"], + "experiment_slug": identity["job_name"], + "pier_job_id": identity["id"], "started_at": job_result.get("started_at"), "finished_at": job_result.get("finished_at"), "status": _job_status(job_result), @@ -110,14 +113,55 @@ def write_pier_summary(job_dir: Path) -> 
dict[str, Any]: return summary +def write_pier_run_manifest(job_dir: Path, *, job_name: str, run_id: str) -> None: + """Persist the stable job identity beside Pier's run artifacts.""" + + write_json( + Path(job_dir) / PIER_RUN_MANIFEST, + { + "schema_version": 1, + "job_name": job_name, + "run_id": run_id, + "id": f"{job_name}/{run_id}", + }, + ) + + +def pier_job_identity(job_dir: Path, job_config: dict[str, Any] | None = None) -> dict[str, str]: + """Return stable job identity and concrete run id for a Pier output directory.""" + + job_dir = Path(job_dir) + manifest_path = job_dir / PIER_RUN_MANIFEST + if manifest_path.exists(): + manifest = read_json(manifest_path) + job_name = str(manifest.get("job_name") or job_dir.parent.name) + run_id = str(manifest.get("run_id") or job_dir.name) + return {"job_name": job_name, "run_id": run_id, "id": f"{job_name}/{run_id}"} + + if job_dir.parent.parent.name == "jobs": + job_name = job_dir.parent.name + run_id = job_dir.name + return {"job_name": job_name, "run_id": run_id, "id": f"{job_name}/{run_id}"} + + config = job_config or ( + read_json(job_dir / "config.json") if (job_dir / "config.json").exists() else {} + ) + job_name = str(config.get("job_name") or job_dir.name) + return {"job_name": job_name, "run_id": job_dir.name, "id": job_dir.name} + + +def pier_job_label(job_dir: Path) -> str: + return pier_job_identity(job_dir)["id"] + + def resolve_pier_trial_events( job_dir: Path, trial: int | str | None = None ) -> tuple[Path | None, str]: trial_dir = _resolve_trial_dir(job_dir, trial) if trial_dir is None: - return None, Path(job_dir).name + return None, pier_job_label(job_dir) events = find_copilot_session_events(trial_dir / "agent") - return events, f"{Path(job_dir).name} · {trial_dir.name}" + return events, f"{pier_job_label(job_dir)} · {trial_dir.name}" def resolve_pier_trial_analysis_source( @@ -125,9 +169,9 @@ def resolve_pier_trial_analysis_source( ) -> tuple[Path | None, str, AnalysisSource | None, Path | None]: 
trial_dir = _resolve_trial_dir(job_dir, trial) if trial_dir is None: - return None, Path(job_dir).name, None, None + return None, pier_job_label(job_dir), None, None - label = f"{Path(job_dir).name} · {trial_dir.name}" + label = f"{pier_job_label(job_dir)} · {trial_dir.name}" agent_dir = trial_dir / "agent" events = find_copilot_session_events(agent_dir) if events is not None: diff --git a/src/copilot_experiments/storage.py b/src/copilot_experiments/storage.py index db8799d..9b95785 100644 --- a/src/copilot_experiments/storage.py +++ b/src/copilot_experiments/storage.py @@ -8,18 +8,19 @@ jobs/ / - config.json - result.json - / + / config.json result.json - agent/ - trajectory.json - copilot-cli.jsonl - copilot-otel.jsonl - copilot-session/**/events.jsonl - verifier/ - artifacts/ + / + config.json + result.json + agent/ + trajectory.json + copilot-cli.jsonl + copilot-otel.jsonl + copilot-session/**/events.jsonl + verifier/ + artifacts/ Legacy layout (inside an experiment repository):: @@ -55,6 +56,8 @@ from pathlib import Path +from .pier_results import PIER_RUN_MANIFEST + class Layout: """Resolves the standard result paths for an experiment repository. @@ -139,30 +142,68 @@ def latest_run(self) -> Path | None: # --- Pier discovery helpers ------------------------------------------- # def iter_pier_jobs(self) -> list[Path]: - """Yield Pier job directories under ``jobs/``. + """Yield Pier run directories under ``jobs/``. - A Pier job directory is identified by the stable pair ``config.json`` and - ``result.json``. The SQLite index remains under ``results/`` because it is - a derived cache owned by this project, not by Pier. + New runs live at ``jobs///``. Pre-migration flat + ``jobs//`` directories are still recognized for existing data. + A Pier run directory is identified by the stable pair ``config.json`` and + ``result.json``. The SQLite index remains under ``results/`` because it + is a derived cache owned by this project, not by Pier. 
""" if not self.jobs_dir.exists(): return [] - return sorted( - path - for path in self.jobs_dir.iterdir() - if path.is_dir() and (path / "config.json").exists() and (path / "result.json").exists() - ) + found: list[Path] = [] + for path in sorted(p for p in self.jobs_dir.iterdir() if p.is_dir()): + is_flat_job = self._is_pier_job_dir(path) + if is_flat_job: + found.append(path) + found.extend( + child + for child in sorted(p for p in path.iterdir() if p.is_dir()) + if self._is_pier_job_dir(child) + and (not is_flat_job or (child / PIER_RUN_MANIFEST).exists()) + ) + return sorted(found, key=self._pier_job_sort_key) def find_pier_job(self, job_name: str) -> Path | None: - """Locate a Pier job by exact name or unique prefix.""" + """Locate a Pier run by job name, run id, ``job/run`` id, or unique prefix.""" - matches = [path for path in self.iter_pier_jobs() if path.name == job_name] - if matches: + jobs = self.iter_pier_jobs() + group = self.jobs_dir / job_name + group_runs = [path for path in jobs if path.parent == group] + if group_runs: + return group_runs[-1] + + matches = [ + path for path in jobs if path.name == job_name or self.pier_job_key(path) == job_name + ] + if len(matches) == 1: return matches[0] - prefix = [path for path in self.iter_pier_jobs() if path.name.startswith(job_name)] + prefix = [ + path + for path in jobs + if path.name.startswith(job_name) or self.pier_job_key(path).startswith(job_name) + ] return prefix[0] if len(prefix) == 1 else None def latest_pier_job(self) -> Path | None: jobs = self.iter_pier_jobs() return jobs[-1] if jobs else None + + def pier_job_key(self, job_dir: Path) -> str: + """Return ``job/run`` for nested runs and the directory name for legacy flat jobs.""" + + job_dir = Path(job_dir) + if job_dir.parent.parent == self.jobs_dir: + return f"{job_dir.parent.name}/{job_dir.name}" + return job_dir.name + + @staticmethod + def _is_pier_job_dir(path: Path) -> bool: + return (path / "config.json").exists() and (path / 
"result.json").exists() + + def _pier_job_sort_key(self, path: Path) -> tuple[int, str, str]: + if path.parent.parent == self.jobs_dir: + return (1, path.name, path.parent.name) + return (0, path.name, path.name) diff --git a/src/copilot_experiments/templates/experiment_repo/.apm/skills/analyzing-results/SKILL.md b/src/copilot_experiments/templates/experiment_repo/.apm/skills/analyzing-results/SKILL.md index 58e7ce7..171c241 100644 --- a/src/copilot_experiments/templates/experiment_repo/.apm/skills/analyzing-results/SKILL.md +++ b/src/copilot_experiments/templates/experiment_repo/.apm/skills/analyzing-results/SKILL.md @@ -10,9 +10,10 @@ description: >- ## Filesystem layout ``` -jobs// +jobs/// config.json # resolved Pier job config result.json # Pier job result + copilot-experiments-run.json summary.json # derived copilot-experiments summary summary.md # human-readable report / @@ -31,7 +32,8 @@ tokens, and AIU economics. ```bash copilot-experiments list # runs + success rates copilot-experiments show --last # per-variant comparison table -copilot-experiments inspect # list Pier trials +copilot-experiments inspect # latest run for that Pier job +copilot-experiments inspect / # exact run selector from list copilot-experiments analyze --last # render native Copilot events copilot-experiments reindex # rebuild results/index.db ``` diff --git a/src/copilot_experiments/templates/experiment_repo/AGENTS.md.tmpl b/src/copilot_experiments/templates/experiment_repo/AGENTS.md.tmpl index 09eefa8..87db7cc 100644 --- a/src/copilot_experiments/templates/experiment_repo/AGENTS.md.tmpl +++ b/src/copilot_experiments/templates/experiment_repo/AGENTS.md.tmpl @@ -37,6 +37,6 @@ uv sync uv run copilot-experiments run [--dry-run] uv run copilot-experiments list uv run copilot-experiments show --last -uv run copilot-experiments inspect --trial -uv run copilot-experiments analyze --trial +uv run copilot-experiments inspect / --trial +uv run copilot-experiments analyze / --trial ``` diff 
--git a/src/copilot_experiments/templates/experiment_repo/README.md.tmpl b/src/copilot_experiments/templates/experiment_repo/README.md.tmpl index 5c3d9ef..424cc7a 100644 --- a/src/copilot_experiments/templates/experiment_repo/README.md.tmpl +++ b/src/copilot_experiments/templates/experiment_repo/README.md.tmpl @@ -8,7 +8,7 @@ GitHub Copilot research experiments, powered by ``` experiments/ # Pier JobConfig YAML files tasks/ # Harbor/Pier task directories -jobs/ # Pier job outputs (gitignored) +jobs/ # Pier job/run outputs (gitignored) results/ # derived SQLite index for queries (gitignored) .apm/ # APM agent context (instructions, skills, prompts) ``` @@ -27,12 +27,13 @@ uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments run --dry-run # run for real through Pier (requires Copilot auth and a supported Pier backend) uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments run +uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments list uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments show --last # explore results uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments list -uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments inspect --trial 1 -uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments analyze --trial 1 +uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments inspect / --trial 1 +uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments analyze / --trial 1 ``` In PowerShell, use @@ -55,12 +56,13 @@ uv run copilot-experiments run --dry-run # run for real through Pier (requires Copilot auth and a supported Pier backend) uv run copilot-experiments run +uv run copilot-experiments list uv run copilot-experiments show --last # explore results uv run copilot-experiments list -uv run copilot-experiments inspect --trial 1 -uv run copilot-experiments analyze --trial 1 +uv run copilot-experiments inspect / --trial 1 +uv run copilot-experiments analyze / --trial 1 ``` ## Writing experiments @@ -73,6 +75,11 @@ attempts, 
concurrency, and artifacts. The bundled `copilot-cli` agent runs the r Copilot CLI inside the Pier environment and captures both ATIF and native Copilot `events.jsonl` logs. +Runs are written under `jobs/<job-name>/<run-id>/`, where `<job-name>` comes from the stable +`job_name` in `experiments/*.yaml` and `<run-id>` is a timestamp for one execution. +Use `copilot-experiments list` to copy selectors. `<job-name>` selects that job's latest run; +`<job-name>/<run-id>` selects one exact run. + ## Agent context (APM) This repo uses [APM](https://github.com/microsoft/apm) to manage Copilot context. diff --git a/tests/test_pier_backend.py b/tests/test_pier_backend.py index 23ad902..4623ddc 100644 --- a/tests/test_pier_backend.py +++ b/tests/test_pier_backend.py @@ -81,24 +81,28 @@ def test_inject_copilot_token_only_updates_copilot_agents(tmp_path: Path): assert config.agents[1].env == {} -def test_prepare_pier_job_for_run_keeps_first_run_name(tmp_path: Path): +def test_prepare_pier_job_for_run_creates_timestamped_run_under_job_group(tmp_path: Path): config_path = tmp_path / "job.yaml" config_path.write_text("job_name: smoke\njobs_dir: jobs\n", encoding="utf-8") config = load_pier_job_config(config_path, root=tmp_path) - prepared = prepare_pier_job_for_run(config) + prepared = prepare_pier_job_for_run(config, now=datetime(2026, 6, 20, 15, 30, 0)) assert prepared.requested_name == "smoke" - assert prepared.run_name == "smoke" - assert not prepared.renamed + assert prepared.run_name == "20260620-153000" + assert prepared.label == "smoke/20260620-153000" + assert prepared.config.jobs_dir == tmp_path / "jobs" / "smoke" + assert prepared.config.job_name == "20260620-153000" + assert prepared.renamed + assert not prepared.resumed assert config.job_name == "smoke" -def test_prepare_pier_job_for_run_uses_fresh_name_when_job_exists(tmp_path: Path): +def test_prepare_pier_job_for_run_uses_collision_suffix_when_run_exists(tmp_path: Path): config_path = tmp_path / "job.yaml" config_path.write_text("job_name: smoke\njobs_dir: jobs\n", 
encoding="utf-8") config = load_pier_job_config(config_path, root=tmp_path) - (tmp_path / "jobs" / "smoke").mkdir(parents=True) + (tmp_path / "jobs" / "smoke" / "20260620-153000").mkdir(parents=True) prepared = prepare_pier_job_for_run( config, @@ -106,22 +110,49 @@ def test_prepare_pier_job_for_run_uses_fresh_name_when_job_exists(tmp_path: Path ) assert prepared.requested_name == "smoke" - assert prepared.run_name == "smoke-20260620-153000" - assert prepared.config.job_name == "smoke-20260620-153000" + assert prepared.run_name == "20260620-153000-2" + assert prepared.config.jobs_dir == tmp_path / "jobs" / "smoke" + assert prepared.config.job_name == "20260620-153000-2" assert prepared.renamed assert config.job_name == "smoke" -def test_prepare_pier_job_for_run_resume_keeps_existing_name(tmp_path: Path): +def test_prepare_pier_job_for_run_resume_uses_latest_nested_run(tmp_path: Path): + config_path = tmp_path / "job.yaml" + config_path.write_text("job_name: smoke\njobs_dir: jobs\n", encoding="utf-8") + config = load_pier_job_config(config_path, root=tmp_path) + old_run = tmp_path / "jobs" / "smoke" / "20260620-153000" + latest_run = tmp_path / "jobs" / "smoke" / "20260620-160000" + old_run.mkdir(parents=True) + latest_run.mkdir() + (old_run / "config.json").write_text("{}", encoding="utf-8") + (old_run / "copilot-experiments-run.json").write_text("{}", encoding="utf-8") + (latest_run / "config.json").write_text("{}", encoding="utf-8") + (latest_run / "copilot-experiments-run.json").write_text("{}", encoding="utf-8") + + prepared = prepare_pier_job_for_run(config, resume=True) + + assert prepared.requested_name == "smoke" + assert prepared.run_name == "20260620-160000" + assert prepared.config.jobs_dir == tmp_path / "jobs" / "smoke" + assert prepared.config.job_name == "20260620-160000" + assert prepared.resumed + + +def test_prepare_pier_job_for_run_resume_supports_legacy_flat_job(tmp_path: Path): config_path = tmp_path / "job.yaml" 
config_path.write_text("job_name: smoke\njobs_dir: jobs\n", encoding="utf-8") config = load_pier_job_config(config_path, root=tmp_path) - (tmp_path / "jobs" / "smoke").mkdir(parents=True) + legacy_job = tmp_path / "jobs" / "smoke" + legacy_job.mkdir(parents=True) + (legacy_job / "config.json").write_text("{}", encoding="utf-8") prepared = prepare_pier_job_for_run(config, resume=True) assert prepared.run_name == "smoke" - assert not prepared.renamed + assert prepared.config.jobs_dir == tmp_path / "jobs" + assert prepared.config.job_name == "smoke" + assert prepared.resumed def test_preflight_pier_backend_reports_missing_docker( diff --git a/tests/test_pier_results.py b/tests/test_pier_results.py index d08b46e..bd30cdb 100644 --- a/tests/test_pier_results.py +++ b/tests/test_pier_results.py @@ -12,7 +12,9 @@ from copilot_experiments.pier_results import ( build_pier_summary, describe_missing_pier_analysis_source, + pier_job_identity, resolve_pier_trial_events, + write_pier_run_manifest, write_pier_summary, ) @@ -216,6 +218,23 @@ def test_build_pier_summary_reads_native_copilot_events(tmp_path: Path): assert variant["tasks"][0]["task"] == "textstats" +def test_build_pier_summary_reads_nested_run_identity(tmp_path: Path): + job_dir = _make_pier_job(tmp_path / "jobs" / "demo-job" / "20260620-153000") + write_pier_run_manifest(job_dir, job_name="demo-job", run_id="20260620-153000") + + summary = build_pier_summary(job_dir) + + assert summary["experiment"] == "demo-job" + assert summary["experiment_slug"] == "demo-job" + assert summary["run_id"] == "20260620-153000" + assert summary["pier_job_id"] == "demo-job/20260620-153000" + assert pier_job_identity(job_dir) == { + "job_name": "demo-job", + "run_id": "20260620-153000", + "id": "demo-job/20260620-153000", + } + + def test_resolve_pier_trial_events(tmp_path: Path): job_dir = _make_pier_job(tmp_path / "jobs" / "demo-job") @@ -294,8 +313,52 @@ def test_cli_analyze_reports_pier_harness_error_when_logs_are_absent(tmp_path: P 
assert "unavailable" in result.output +def test_cli_list_displays_pier_run_selectors(tmp_path: Path): + job_dir = _make_pier_job(tmp_path / "jobs" / "demo-job" / "20260620-153000") + write_pier_run_manifest(job_dir, job_name="demo-job", run_id="20260620-153000") + runner = CliRunner() + + result = runner.invoke(app, ["list", "--root", str(tmp_path)]) + + assert result.exit_code == 0, result.output + assert "Pier runs" in result.output + assert "selector" in result.output + assert "demo-job/20260620-153000" in result.output + assert "demo-job" in result.output + assert "20260620-153000" in result.output + assert "No runs yet" not in result.output + + +def test_cli_show_accepts_pier_job_run_selector(tmp_path: Path): + job_dir = _make_pier_job(tmp_path / "jobs" / "demo-job" / "20260620-153000") + write_pier_run_manifest(job_dir, job_name="demo-job", run_id="20260620-153000") + runner = CliRunner() + + result = runner.invoke( + app, + ["show", "demo-job/20260620-153000", "--root", str(tmp_path)], + ) + + assert result.exit_code == 0, result.output + assert "demo-job" in result.output + assert "20260620-153000" in result.output + assert "summary.md" in result.output + + +def test_cli_show_missing_run_points_to_list(tmp_path: Path): + runner = CliRunner() + + result = runner.invoke(app, ["show", "missing", "--root", str(tmp_path)]) + + assert result.exit_code == 1 + assert "Run not found" in result.output + assert "copilot-experiments list" in result.output + assert "job-name/run-id" in result.output + + def test_write_pier_summary_and_index(tmp_path: Path): - job_dir = _make_pier_job(tmp_path / "jobs" / "demo-job") + job_dir = _make_pier_job(tmp_path / "jobs" / "demo-job" / "20260620-153000") + write_pier_run_manifest(job_dir, job_name="demo-job", run_id="20260620-153000") summary = write_pier_summary(job_dir) @@ -306,12 +369,18 @@ def test_write_pier_summary_and_index(tmp_path: Path): conn = connect(tmp_path / "results" / "index.db") try: index_pier_job_dir(conn, 
job_dir) - job = conn.execute("SELECT * FROM pier_jobs WHERE job_name='demo-job'").fetchone() - trial = conn.execute("SELECT * FROM pier_trials WHERE job_name='demo-job'").fetchone() + job = conn.execute("SELECT * FROM pier_jobs WHERE id='demo-job/20260620-153000'").fetchone() + trial = conn.execute( + "SELECT * FROM pier_trials WHERE job_id='demo-job/20260620-153000'" + ).fetchone() finally: conn.close() + assert job["job_name"] == "demo-job" + assert job["run_id"] == "20260620-153000" assert job["success_rate"] == 1.0 + assert trial["job_name"] == "demo-job" + assert trial["run_id"] == "20260620-153000" assert trial["trial_name"] == "copilot-cli__textstats__1" assert trial["success"] == 1 assert trial["total_tokens"] == 15.0 diff --git a/tests/test_storage.py b/tests/test_storage.py index 62f12bc..3cb3c6a 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -50,16 +50,32 @@ def test_iter_runs_skips_incomplete(tmp_path: Path): def test_pier_job_helpers(tmp_path: Path): jobs = tmp_path / "jobs" - good = jobs / "20260102T000000Z_beta" + good = jobs / "smoke" / "20260102-000000" good.mkdir(parents=True) (good / "config.json").write_text("{}", encoding="utf-8") (good / "result.json").write_text("{}", encoding="utf-8") - incomplete = jobs / "20260103T000000Z_incomplete" + latest = jobs / "smoke" / "20260103-000000" + latest.mkdir() + (latest / "config.json").write_text("{}", encoding="utf-8") + (latest / "result.json").write_text("{}", encoding="utf-8") + incomplete = jobs / "smoke" / "20260104-000000" incomplete.mkdir() + legacy = jobs / "legacy-job" + legacy.mkdir() + (legacy / "config.json").write_text("{}", encoding="utf-8") + (legacy / "result.json").write_text("{}", encoding="utf-8") + legacy_trial = legacy / "copilot-cli__task__1" + legacy_trial.mkdir() + (legacy_trial / "config.json").write_text("{}", encoding="utf-8") + (legacy_trial / "result.json").write_text("{}", encoding="utf-8") layout = Layout(tmp_path) - assert layout.iter_pier_jobs() == 
[good] - assert layout.latest_pier_job() == good + assert layout.iter_pier_jobs() == [legacy, good, latest] + assert layout.pier_job_key(good) == "smoke/20260102-000000" + assert layout.latest_pier_job() == latest + assert layout.find_pier_job("smoke") == latest + assert layout.find_pier_job("smoke/20260102") == good assert layout.find_pier_job("20260102") == good + assert layout.find_pier_job("legacy-job") == legacy assert layout.find_pier_job("missing") is None From 50e47d9e380904f0f5b5404300f8b5e64a8b72a9 Mon Sep 17 00:00:00 2001 From: Dominique Broeglin Date: Sun, 28 Jun 2026 03:46:27 +0200 Subject: [PATCH 2/3] Simplify CLI around Pier jobs Remove the legacy native runner, dry-run command, SQLite index, and legacy result layout. Standardize storage, summaries, docs, templates, and the structure canvas around Pier jobs, agents, tasks, trials, and runs. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .apm/instructions/development.instructions.md | 15 +- .apm/prompts/library-change.prompt.md | 6 +- .apm/skills/developing-the-library/SKILL.md | 37 +- .../extension.mjs | 61 +- AGENTS.md | 46 +- README.md | 24 +- docs/adr/0003-sqlite-derived-index.md | 2 +- .../0019-use-nested-pier-run-directories.md | 10 +- docs/adr/0020-remove-legacy-native-harness.md | 44 + docs/adr/README.md | 3 +- docs/analysis.md | 17 +- docs/architecture.md | 26 +- docs/authoring-experiments.md | 24 +- docs/collecting-run-data.md | 21 +- docs/deepswe.md | 5 +- docs/results-format.md | 47 +- examples/task_suite/README.md | 11 +- examples/tracer_bullet/README.md | 14 +- sandbox/README.md | 5 +- src/copilot_experiments/__init__.py | 49 +- src/copilot_experiments/_util.py | 2 +- src/copilot_experiments/auth.py | 36 +- src/copilot_experiments/cli.py | 1052 ++++++----------- src/copilot_experiments/index.py | 356 ------ src/copilot_experiments/invoker.py | 482 -------- src/copilot_experiments/models.py | 358 +----- src/copilot_experiments/pier_backend.py | 23 +- 
src/copilot_experiments/pier_results.py | 60 +- src/copilot_experiments/report.py | 246 +--- src/copilot_experiments/runner.py | 591 --------- src/copilot_experiments/storage.py | 184 +-- .../instructions/experiments.instructions.md | 3 +- .../.apm/prompts/new-experiment.prompt.md | 2 +- .../.apm/skills/analyzing-results/SKILL.md | 17 +- .../skills/authoring-experiments/SKILL.md | 2 +- .../templates/experiment_repo/AGENTS.md.tmpl | 12 +- .../templates/experiment_repo/README.md.tmpl | 17 +- src/copilot_experiments/workspace.py | 116 -- tests/conftest.py | 68 -- tests/test_auth.py | 25 +- tests/test_economics.py | 71 +- tests/test_index.py | 61 - tests/test_invoker.py | 216 ---- tests/test_models.py | 60 - tests/test_pier_backend.py | 19 +- tests/test_pier_results.py | 142 ++- tests/test_runner.py | 301 ----- tests/test_storage.py | 86 +- tests/test_workspace.py | 151 --- 49 files changed, 864 insertions(+), 4362 deletions(-) create mode 100644 docs/adr/0020-remove-legacy-native-harness.md delete mode 100644 src/copilot_experiments/index.py delete mode 100644 src/copilot_experiments/invoker.py delete mode 100644 src/copilot_experiments/runner.py delete mode 100644 src/copilot_experiments/workspace.py delete mode 100644 tests/conftest.py delete mode 100644 tests/test_index.py delete mode 100644 tests/test_invoker.py delete mode 100644 tests/test_models.py delete mode 100644 tests/test_runner.py delete mode 100644 tests/test_workspace.py diff --git a/.apm/instructions/development.instructions.md b/.apm/instructions/development.instructions.md index c5f5f25..1e8bab8 100644 --- a/.apm/instructions/development.instructions.md +++ b/.apm/instructions/development.instructions.md @@ -16,11 +16,10 @@ experiment repo — experiment-authoring context is a template under formatting, and CI/pre-commit enforce it. - Maintain good test coverage for every behavior change with focused offline tests, not just broad smoke coverage. 
-- Keep tests offline: exercise the runner with `MockInvoker` (and a `solver` for the success - path) plus a temp `--root`. Never invoke the real `copilot` binary or the network in tests. -- Preserve invariants: filesystem is source of truth (`reindex` rebuilds `index.db`); secrets are - redacted on disk (`Variant.stored()` / `ProviderConfig.redacted()`); `--dry-run` is ephemeral — - it runs in a temp dir, validates each stage, and persists nothing (`dry_run_experiment`). +- Keep tests offline: use Pier config/job-output fixtures and mocks plus a temp `--root`. Never + invoke the real `copilot` binary, Docker, or the network in tests. +- Preserve invariants: `jobs///` is the filesystem source of truth; summaries are + derived; secrets are injected at run time and redacted from persisted configs. ## When changing public behavior - Update `docs/` (architecture, authoring, results-format, BYOK) and `README.md`. @@ -28,6 +27,6 @@ experiment repo — experiment-authoring context is a template under - Bump `__version__` in `src/copilot_experiments/__init__.py` and `version` in `pyproject.toml`. ## Module responsibilities -`models` (schemas) · `invoker` (build/run copilot) · `workspace` (provision + diff) · -`sessionlog` (parse events → metrics) · `runner` (orchestrate) · `storage` (layout) · -`index` (sqlite) · `report` (summaries) · `scaffold` (init) · `cli` (Typer). +`models` (analysis/economics schemas) · `pier_backend` (Pier config/run integration) · +`pier_results` (job/run/agent/task summaries) · `sessionlog` (parse events → metrics) · +`storage` (Pier jobs layout) · `report` (summaries) · `scaffold` (init) · `cli` (Typer). diff --git a/.apm/prompts/library-change.prompt.md b/.apm/prompts/library-change.prompt.md index b2f7594..3183b16 100644 --- a/.apm/prompts/library-change.prompt.md +++ b/.apm/prompts/library-change.prompt.md @@ -9,9 +9,9 @@ Make a change to the `copilot_experiments` package (the harness, not an experime Steps: 1. 
Identify the right module (see `AGENTS.md` repository map and the `developing-the-library` skill). -2. Implement the change, keeping the architecture invariants intact (filesystem is source of - truth; secrets redacted on disk; tests/dry-runs stay offline). -3. Add or update tests in `tests/` using `MockInvoker` and a temp `--root`. +2. Implement the change, keeping the architecture invariants intact (`jobs///` is the + filesystem source of truth; secrets are redacted on disk; tests stay offline). +3. Add or update tests in `tests/` using fixtures/mocks and a temp `--root`. 4. Run `uv run ruff check --fix .`, `uv run ruff format .`, `uv run ruff check .`, and `uv run pytest -q`; fix until all are green. 5. Update `docs/`, `README.md`, and the `templates/experiment_repo/` template if public diff --git a/.apm/skills/developing-the-library/SKILL.md b/.apm/skills/developing-the-library/SKILL.md index 2241591..53f9cbf 100644 --- a/.apm/skills/developing-the-library/SKILL.md +++ b/.apm/skills/developing-the-library/SKILL.md @@ -2,39 +2,38 @@ name: developing-the-library description: >- Use when modifying the copilot-experiments library or CLI itself — adding or - changing modules (models, invoker, runner, sessionlog, storage, index, report, - scaffold, cli), writing tests, or updating the scaffolded experiment-repo - template. Not for authoring experiments. + changing modules (models, pier_backend, pier_results, sessionlog, storage, + report, scaffold, cli), writing tests, or updating the scaffolded + experiment-repo template. Not for authoring experiments. --- # Developing the copilot-experiments library ## Mental model -A **run** executes an `Experiment` (a `Task` + a list of `Variant`s). For each variant, for each -trial, the runner: provisions a workspace → invokes Copilot → copies & parses the session log → -captures a workspace diff → runs `verify` → writes artifacts → updates the SQLite index. +A **run** executes a Pier `JobConfig`. 
For each agent/task/attempt trial, Pier provisions the +environment, invokes the installed agent, runs the verifier, and downloads logs/artifacts. +`copilot-experiments` contributes the `copilot-cli` Pier agent and derives summaries/analysis from +the resulting `jobs///` tree. ``` -Experiment ─┬─ Task (prompt, fixture/repo, setup, verify) - └─ Variant[] (model, effort, agent, mode, provider/BYOK, env, trials) -run_experiment() → results/// + results/index.db +Pier JobConfig ─┬─ tasks/datasets + └─ agents[] (copilot-cli model, effort, mode, kwargs) +copilot-experiments run → jobs/// ``` ## Where to make a change -- New experiment-definition field → `models.py` (+ thread through `invoker.build_args`/`build_env` - if it affects the command, + `index.py` columns if you want it queryable). +- New Pier config/run behavior → `pier_backend.py`. - New CLI command/flag → `cli.py` (Typer). `B008` is ignored project-wide for Typer defaults. -- New metric → `sessionlog.parse_metrics` (+ `Metrics` in `models.py`, + `index.py`, + `report.py`). -- New result artifact → write it in `runner._run_trial`, document it in `storage.py`'s docstring - and `docs/results-format.md`. +- New metric → `sessionlog.parse_metrics` (+ `Metrics` in `models.py`, + `pier_results.py` / + `report.py` if summaries should expose it). +- New result artifact → emit or collect it through the Pier agent/backend, then document it in + `docs/results-format.md`. - Experiment-authoring change → edit `templates/experiment_repo/` (it is package data). ## Testing recipe - Unit-test pure functions directly (models, sessionlog, storage, scaffold). -- For the runner, call `run_experiment(exp, root=tmp, invoker=MockInvoker())` for a persisted - mock path, `run_experiment(exp, root=tmp, invoker=MockInvoker(solver=...))` for a success - path, and `dry_run_experiment(exp, root=tmp)` to exercise the ephemeral validating dry-run - (returns a `DryRunReport`, persists nothing). 
+- Use Pier config and job-output fixtures for CLI/storage/result tests; mock backend/auth preflights + instead of invoking Docker or Copilot. - Build synthetic `events.jsonl` dicts to test `parse_metrics` without any Copilot run. - Add or update focused offline tests for each behavior change. Good coverage is expected, especially around Pier config loading, result adaptation, CLI behavior, and session parsing. @@ -47,5 +46,5 @@ uv run ruff check . uv run pytest -q # optional end-to-end smoke test: uv run copilot-experiments init sandbox/demo --force -uv run copilot-experiments run --root sandbox/demo --dry-run +uv run copilot-experiments validate --root sandbox/demo ``` diff --git a/.github/extensions/experiment-repository-structure/extension.mjs b/.github/extensions/experiment-repository-structure/extension.mjs index d942782..cb72250 100644 --- a/.github/extensions/experiment-repository-structure/extension.mjs +++ b/.github/extensions/experiment-repository-structure/extension.mjs @@ -14,7 +14,7 @@ const structure = [ owner: "human + harness", source: "The git checkout that contains experiment definitions and generated outputs.", why: "Separates experiment authoring from the copilot-experiments harness repository.", - commands: ["copilot-experiments list", "copilot-experiments run --dry-run"], + commands: ["copilot-experiments list", "copilot-experiments validate"], }, { id: "experiments", @@ -26,7 +26,7 @@ const structure = [ owner: "experiment author", source: "Pier JobConfig YAML files.", why: "Defines what to run: tasks, agents, model settings, attempts, concurrency, and job_name.", - commands: ["copilot-experiments run --dry-run", "copilot-experiments run [job-name]"], + commands: ["copilot-experiments validate", "copilot-experiments run [job-name]"], }, { id: "job-yaml", @@ -50,7 +50,7 @@ const structure = [ owner: "experiment author", source: "Harbor/Pier task directories or imported task corpora.", why: "Keeps task instructions, environment setup, and 
verifiers close to the experiment repo.", - commands: ["copilot-experiments deepswe-import ", "copilot-experiments run --dry-run"], + commands: ["copilot-experiments deepswe-import ", "copilot-experiments validate"], }, { id: "task-dir", @@ -62,7 +62,7 @@ const structure = [ owner: "experiment author", source: "One task's prompt, environment, and verifier.", why: "A Pier job can point to individual tasks or datasets of many tasks.", - commands: ["copilot-experiments run --dry-run"], + commands: ["copilot-experiments validate"], }, { id: "task-instruction", @@ -122,7 +122,7 @@ const structure = [ owner: "Pier + harness", source: "Generated run outputs. This is now the primary execution tree.", why: "Keeps measured executions out of git while preserving all data needed to inspect a run.", - commands: ["copilot-experiments list", "copilot-experiments reindex"], + commands: ["copilot-experiments list"], }, { id: "job-group", @@ -162,7 +162,7 @@ const structure = [ owner: "copilot-experiments", source: "Small manifest with job_name, run_id, and job/run id.", why: "Pier's config sees the concrete run id as job_name; this manifest preserves the stable job identity.", - commands: ["copilot-experiments reindex"], + commands: [], }, { id: "run-config", @@ -185,7 +185,7 @@ const structure = [ badge: "Pier", owner: "Pier", source: "Job-level status, timings, and aggregate Pier stats.", - why: "Primary job status signal for show/list/reindex.", + why: "Primary job status signal for show and list.", commands: ["copilot-experiments show /"], }, { @@ -320,45 +320,9 @@ const structure = [ badge: "derived", owner: "copilot-experiments", source: "Generated from Pier result files and Copilot-native logs.", - why: "Gives the familiar variant/task aggregate shape for show and reports.", + why: "Gives the agent/task aggregate shape for show and reports.", commands: ["copilot-experiments show /"], }, - { - id: "results", - parent: "repo", - label: "results/", - path: "results/", - kind: 
"derived", - badge: "derived", - owner: "copilot-experiments", - source: "Derived index plus legacy Python experiment runs.", - why: "The SQLite index is rebuildable. Legacy run data remains readable during migration.", - commands: ["copilot-experiments reindex"], - }, - { - id: "index-db", - parent: "results", - label: "index.db", - path: "results/index.db", - kind: "derived", - badge: "cache", - owner: "copilot-experiments", - source: "SQLite cache derived from jobs/ and legacy results/.", - why: "Speeds up cross-run queries; never the source of truth.", - commands: ["copilot-experiments reindex"], - }, - { - id: "legacy-results", - parent: "results", - label: "//...", - path: "results///", - kind: "legacy", - badge: "legacy", - owner: "legacy harness", - source: "Older Python Experiment/Task/Variant runs.", - why: "Kept for migration and historical data; new Pier runs use jobs/.", - commands: ["copilot-experiments show ", "copilot-experiments analyze "], - }, { id: "guidance", parent: "repo", @@ -379,7 +343,7 @@ const flow = [ ["Run", "copilot-experiments run"], ["Concrete output", "jobs///"], ["Inspect/analyze", "show | inspect | analyze /"], - ["Derived cache", "results/index.db"], + ["Summarize", "summary.json / summary.md"], ]; function htmlEscape(value) { @@ -662,7 +626,6 @@ button { .analysis { background: var(--true-color-red, #cf222e); } .derived { background: var(--true-color-yellow, #9a6700); } .guidance { background: var(--text-color-muted, #57606a); } -.legacy { background: #8c959f; } .root { background: var(--text-color-default, #1f2328); } @media (max-width: 980px) { .flow, @@ -707,7 +670,7 @@ button {