From 2543699f316e2f722d939f9a6bc96ba805fd50f5 Mon Sep 17 00:00:00 2001
From: Dominique Broeglin <dominique.broeglin@microsoft.com>
Date: Sun, 28 Jun 2026 02:43:40 +0200
Subject: [PATCH 1/3] Nest Pier runs and make run selectors discoverable

Replace the awkward flat-then-suffixed Pier rerun scheme with a clean nested jobs/<job-name>/<run-id>/ layout that separates stable job identity from per-run timestamp ids. Expose copyable job-name/run-id selectors via list, and let show, inspect, and analyze resolve an exact run from a selector (legacy run id, Pier job, or job/run). Add actionable not-found hints pointing back at list.

Persist stable job identity in a per-run manifest, derive selectors for the index, and update docs, experiment-repo templates, and ADR-0019. Commit a didactic project-scoped canvas that visualizes the verified structure.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../extension.mjs                             | 892 ++++++++++++++++++
 README.md                                     |  26 +-
 .../0019-use-nested-pier-run-directories.md   |  51 +
 docs/adr/README.md                            |   1 +
 docs/analysis.md                              |   8 +-
 docs/architecture.md                          |   4 +-
 docs/authoring-experiments.md                 |  11 +-
 docs/collecting-run-data.md                   |  38 +-
 docs/deepswe.md                               |   1 +
 docs/results-format.md                        |  59 +-
 src/copilot_experiments/cli.py                | 250 +++--
 src/copilot_experiments/index.py              |  43 +-
 src/copilot_experiments/pier_backend.py       |  65 +-
 src/copilot_experiments/pier_results.py       |  58 +-
 src/copilot_experiments/storage.py            |  87 +-
 .../.apm/skills/analyzing-results/SKILL.md    |   6 +-
 .../templates/experiment_repo/AGENTS.md.tmpl  |   4 +-
 .../templates/experiment_repo/README.md.tmpl  |  17 +-
 tests/test_pier_backend.py                    |  53 +-
 tests/test_pier_results.py                    |  75 +-
 tests/test_storage.py                         |  24 +-
 21 files changed, 1548 insertions(+), 225 deletions(-)
 create mode 100644 .github/extensions/experiment-repository-structure/extension.mjs
 create mode 100644 docs/adr/0019-use-nested-pier-run-directories.md
diff --git a/.github/extensions/experiment-repository-structure/extension.mjs b/.github/extensions/experiment-repository-structure/extension.mjs
new file mode 100644
index 0000000..d942782
--- /dev/null
+++ b/.github/extensions/experiment-repository-structure/extension.mjs
@@ -0,0 +1,892 @@
+import { createServer } from "node:http";
+import { createCanvas, joinSession } from "@github/copilot-sdk/extension";
+
+const servers = new Map(); // ctx.instanceId -> { server, url } returned by startServer for that open canvas
+
+const structure = [ // Flat node list; the page rebuilds the tree by grouping nodes under their `parent` id.
+    {
+        id: "repo", // unique key that other nodes reference through `parent`
+        parent: null, // null marks a top-level node
+        label: "Experiment repository", // text of the tree row and heading of the detail pane
+        path: ".", // shown under the heading in the detail pane
+        kind: "root", // used as the dot's CSS class and matched by the kind filter chips
+        badge: "workspace", // pill in the tree row; shown as "Status" in the detail pane
+        owner: "human + harness", // shown as "Owner" in the detail pane
+        source: "The git checkout that contains experiment definitions and generated outputs.", // "What it is" callout
+        why: "Separates experiment authoring from the copilot-experiments harness repository.", // "Why it exists" callout
+        commands: ["copilot-experiments list", "copilot-experiments run --dry-run"], // listed in the detail pane; may be empty
+    },
+    {
+        id: "experiments",
+        parent: "repo",
+        label: "experiments/",
+        path: "experiments/",
+        kind: "source",
+        badge: "committed",
+        owner: "experiment author",
+        source: "Pier JobConfig YAML files.",
+        why: "Defines what to run: tasks, agents, model settings, attempts, concurrency, and job_name.",
+        commands: ["copilot-experiments run --dry-run", "copilot-experiments run [job-name]"],
+    },
+    {
+        id: "job-yaml",
+        parent: "experiments",
+        label: "<job>.yaml",
+        path: "experiments/<job>.yaml",
+        kind: "source",
+        badge: "committed",
+        owner: "experiment author",
+        source: "Stable Pier job configuration.",
+        why: "The job_name is the durable experiment identity. Repeated executions keep this identity but get new run ids.",
+        commands: ["copilot-experiments run <job-name>", "copilot-experiments run --resume <job-name>"],
+    },
+    {
+        id: "tasks",
+        parent: "repo",
+        label: "tasks/",
+        path: "tasks/",
+        kind: "task",
+        badge: "committed",
+        owner: "experiment author",
+        source: "Harbor/Pier task directories or imported task corpora.",
+        why: "Keeps task instructions, environment setup, and verifiers close to the experiment repo.",
+        commands: ["copilot-experiments deepswe-import <source>", "copilot-experiments run --dry-run"],
+    },
+    {
+        id: "task-dir",
+        parent: "tasks",
+        label: "<task>/",
+        path: "tasks/<task>/",
+        kind: "task",
+        badge: "committed",
+        owner: "experiment author",
+        source: "One task's prompt, environment, and verifier.",
+        why: "A Pier job can point to individual tasks or datasets of many tasks.",
+        commands: ["copilot-experiments run --dry-run"],
+    },
+    {
+        id: "task-instruction",
+        parent: "task-dir",
+        label: "instruction.md",
+        path: "tasks/<task>/instruction.md",
+        kind: "task",
+        badge: "committed",
+        owner: "experiment author",
+        source: "Prompt text presented to the evaluated agent.",
+        why: "This is the human-readable task objective.",
+        commands: [],
+    },
+    {
+        id: "task-toml",
+        parent: "task-dir",
+        label: "task.toml",
+        path: "tasks/<task>/task.toml",
+        kind: "task",
+        badge: "committed",
+        owner: "experiment author",
+        source: "Pier task metadata.",
+        why: "Connects instructions, environment, and verifier into a runnable task.",
+        commands: [],
+    },
+    {
+        id: "task-env",
+        parent: "task-dir",
+        label: "environment/",
+        path: "tasks/<task>/environment/",
+        kind: "task",
+        badge: "committed",
+        owner: "experiment author",
+        source: "Sandbox setup for the task.",
+        why: "Gives Pier a reproducible workspace for each trial.",
+        commands: [],
+    },
+    {
+        id: "task-tests",
+        parent: "task-dir",
+        label: "tests/",
+        path: "tasks/<task>/tests/",
+        kind: "task",
+        badge: "committed",
+        owner: "experiment author",
+        source: "Verifier inputs or grading scripts.",
+        why: "Turns an agent patch into an objective success signal.",
+        commands: [],
+    },
+    {
+        id: "jobs",
+        parent: "repo",
+        label: "jobs/",
+        path: "jobs/",
+        kind: "run",
+        badge: "gitignored",
+        owner: "Pier + harness",
+        source: "Generated run outputs. This is now the primary execution tree.",
+        why: "Keeps measured executions out of git while preserving all data needed to inspect a run.",
+        commands: ["copilot-experiments list", "copilot-experiments reindex"],
+    },
+    {
+        id: "job-group",
+        parent: "jobs",
+        label: "<job-name>/",
+        path: "jobs/<job-name>/",
+        kind: "run",
+        badge: "stable identity",
+        owner: "copilot-experiments",
+        source: "Grouping directory named from the configured Pier job_name.",
+        why: "A stable identity can contain many repeated measurements without inventing new job names.",
+        commands: ["copilot-experiments show <job-name>", "copilot-experiments inspect <job-name>"],
+    },
+    {
+        id: "run-dir",
+        parent: "job-group",
+        label: "<run-id>/",
+        path: "jobs/<job-name>/<run-id>/",
+        kind: "run",
+        badge: "generated",
+        owner: "Pier + harness",
+        source: "One concrete execution, usually timestamped like 20260620-153000.",
+        why: "This is the copyable run selector: <job-name>/<run-id>.",
+        commands: [
+            "copilot-experiments show <job-name>/<run-id>",
+            "copilot-experiments inspect <job-name>/<run-id>",
+            "copilot-experiments analyze <job-name>/<run-id> --trial 1",
+        ],
+    },
+    {
+        id: "run-manifest",
+        parent: "run-dir",
+        label: "copilot-experiments-run.json",
+        path: "jobs/<job-name>/<run-id>/copilot-experiments-run.json",
+        kind: "run",
+        badge: "harness",
+        owner: "copilot-experiments",
+        source: "Small manifest with job_name, run_id, and job/run id.",
+        why: "Pier's config sees the concrete run id as job_name; this manifest preserves the stable job identity.",
+        commands: ["copilot-experiments reindex"],
+    },
+    {
+        id: "run-config",
+        parent: "run-dir",
+        label: "config.json",
+        path: "jobs/<job-name>/<run-id>/config.json",
+        kind: "run",
+        badge: "Pier",
+        owner: "Pier",
+        source: "Resolved Pier job config for this concrete execution.",
+        why: "Captures exactly what Pier ran after path normalization and agent setup.",
+        commands: [],
+    },
+    {
+        id: "run-result",
+        parent: "run-dir",
+        label: "result.json",
+        path: "jobs/<job-name>/<run-id>/result.json",
+        kind: "run",
+        badge: "Pier",
+        owner: "Pier",
+        source: "Job-level status, timings, and aggregate Pier stats.",
+        why: "Primary job status signal for show/list/reindex.",
+        commands: ["copilot-experiments show <job-name>/<run-id>"],
+    },
+    {
+        id: "trial-dir",
+        parent: "run-dir",
+        label: "<trial-name>/",
+        path: "jobs/<job-name>/<run-id>/<trial-name>/",
+        kind: "trial",
+        badge: "generated",
+        owner: "Pier",
+        source: "One agent/task/attempt cell.",
+        why: "Contains the raw evidence for whether a task was solved and how the agent behaved.",
+        commands: ["copilot-experiments inspect <job-name>/<run-id> --trial 1"],
+    },
+    {
+        id: "trial-config",
+        parent: "trial-dir",
+        label: "config.json",
+        path: "jobs/<job-name>/<run-id>/<trial-name>/config.json",
+        kind: "trial",
+        badge: "Pier",
+        owner: "Pier",
+        source: "Resolved trial configuration.",
+        why: "Useful when comparing why two trial cells differ.",
+        commands: [],
+    },
+    {
+        id: "trial-result",
+        parent: "trial-dir",
+        label: "result.json",
+        path: "jobs/<job-name>/<run-id>/<trial-name>/result.json",
+        kind: "trial",
+        badge: "Pier",
+        owner: "Pier",
+        source: "Trial status, verifier reward, exceptions, agent info, and timings.",
+        why: "This is where harness failures and grading results are diagnosed.",
+        commands: ["copilot-experiments inspect <job-name>/<run-id> --trial 1"],
+    },
+    {
+        id: "agent",
+        parent: "trial-dir",
+        label: "agent/",
+        path: "jobs/<job-name>/<run-id>/<trial-name>/agent/",
+        kind: "analysis",
+        badge: "agent output",
+        owner: "copilot-cli agent",
+        source: "Outputs captured from the evaluated agent.",
+        why: "Raw agent evidence lives here; summaries are derived from these files.",
+        commands: ["copilot-experiments analyze <job-name>/<run-id> --trial 1"],
+    },
+    {
+        id: "trajectory",
+        parent: "agent",
+        label: "trajectory.json",
+        path: ".../agent/trajectory.json",
+        kind: "analysis",
+        badge: "ATIF",
+        owner: "copilot-cli agent",
+        source: "ATIF trajectory emitted by the installed agent.",
+        why: "Fallback analysis source when native Copilot session events are absent.",
+        commands: ["copilot-experiments analyze <job-name>/<run-id> --trial 1"],
+    },
+    {
+        id: "cli-jsonl",
+        parent: "agent",
+        label: "copilot-cli.jsonl / .txt",
+        path: ".../agent/copilot-cli.jsonl",
+        kind: "analysis",
+        badge: "diagnostic",
+        owner: "copilot-cli agent",
+        source: "Raw Copilot CLI output streams.",
+        why: "Useful for auth, invocation, or startup failures before a session log exists.",
+        commands: [],
+    },
+    {
+        id: "otel",
+        parent: "agent",
+        label: "copilot-otel.jsonl",
+        path: ".../agent/copilot-otel.jsonl",
+        kind: "analysis",
+        badge: "diagnostic",
+        owner: "copilot-cli agent",
+        source: "OpenTelemetry file exporter output for Copilot calls.",
+        why: "Enriches analysis with per-LLM-call metrics and AIU details.",
+        commands: ["copilot-experiments analyze --file <events.jsonl> --otel-file <copilot-otel.jsonl>"],
+    },
+    {
+        id: "session-events",
+        parent: "agent",
+        label: "copilot-session/**/events.jsonl",
+        path: ".../agent/copilot-session/<session-id>/events.jsonl",
+        kind: "analysis",
+        badge: "source of truth",
+        owner: "GitHub Copilot CLI",
+        source: "Native Copilot CLI session log.",
+        why: "Primary source for turns, tool calls, tokens, AIU, and rich analysis.",
+        commands: [
+            "copilot-experiments analyze <job-name>/<run-id> --trial 1",
+            "copilot-experiments analyze --file <events.jsonl>",
+        ],
+    },
+    {
+        id: "verifier",
+        parent: "trial-dir",
+        label: "verifier/",
+        path: ".../<trial-name>/verifier/",
+        kind: "trial",
+        badge: "Pier",
+        owner: "Pier verifier",
+        source: "Verifier outputs, rewards, and grading artifacts.",
+        why: "Connects agent behavior to the solved/unsolved measurement.",
+        commands: [],
+    },
+    {
+        id: "artifacts",
+        parent: "trial-dir",
+        label: "artifacts/",
+        path: ".../<trial-name>/artifacts/",
+        kind: "trial",
+        badge: "Pier",
+        owner: "Pier",
+        source: "Downloaded artifacts requested by the job config.",
+        why: "Keeps extra run evidence beside the trial that produced it.",
+        commands: [],
+    },
+    {
+        id: "summary",
+        parent: "run-dir",
+        label: "summary.json / summary.md",
+        path: "jobs/<job-name>/<run-id>/summary.json",
+        kind: "derived",
+        badge: "derived",
+        owner: "copilot-experiments",
+        source: "Generated from Pier result files and Copilot-native logs.",
+        why: "Gives the familiar variant/task aggregate shape for show and reports.",
+        commands: ["copilot-experiments show <job-name>/<run-id>"],
+    },
+    {
+        id: "results",
+        parent: "repo",
+        label: "results/",
+        path: "results/",
+        kind: "derived",
+        badge: "derived",
+        owner: "copilot-experiments",
+        source: "Derived index plus legacy Python experiment runs.",
+        why: "The SQLite index is rebuildable. Legacy run data remains readable during migration.",
+        commands: ["copilot-experiments reindex"],
+    },
+    {
+        id: "index-db",
+        parent: "results",
+        label: "index.db",
+        path: "results/index.db",
+        kind: "derived",
+        badge: "cache",
+        owner: "copilot-experiments",
+        source: "SQLite cache derived from jobs/ and legacy results/.",
+        why: "Speeds up cross-run queries; never the source of truth.",
+        commands: ["copilot-experiments reindex"],
+    },
+    {
+        id: "legacy-results",
+        parent: "results",
+        label: "<experiment>/<run-id>/...",
+        path: "results/<experiment>/<run-id>/",
+        kind: "legacy",
+        badge: "legacy",
+        owner: "legacy harness",
+        source: "Older Python Experiment/Task/Variant runs.",
+        why: "Kept for migration and historical data; new Pier runs use jobs/.",
+        commands: ["copilot-experiments show <run-id>", "copilot-experiments analyze <run-id>"],
+    },
+    {
+        id: "guidance",
+        parent: "repo",
+        label: "README.md / AGENTS.md / .apm/",
+        path: "README.md, AGENTS.md, .apm/",
+        kind: "guidance",
+        badge: "committed",
+        owner: "experiment author",
+        source: "Human and agent guidance for the experiment repo.",
+        why: "Makes the repo self-explanatory for people and for Copilot agents working inside it.",
+        commands: ["copilot-experiments list"],
+    },
+];
+
+const flow = [ // Ordered [title, path-or-command] pairs; the page renders one flow-strip card per pair.
+    ["Author task", "tasks/<task>/instruction.md"],
+    ["Define job", "experiments/<job>.yaml"],
+    ["Run", "copilot-experiments run"],
+    ["Concrete output", "jobs/<job-name>/<run-id>/"],
+    ["Inspect/analyze", "show | inspect | analyze <job-name>/<run-id>"],
+    ["Derived cache", "results/index.db"],
+];
+
+// Escape &, <, >, and " so a value can be dropped into HTML text or a double-quoted attribute.
+function htmlEscape(value) {
+    const entities = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;" };
+    // One pass over the stringified value; equivalent to replacing "&" first and the rest afterwards.
+    const text = String(value);
+    return text.replace(/[&<>"]/g, (ch) => entities[ch]);
+}
+
+function renderHtml() { // Returns the whole canvas page (inline CSS + script) as one HTML string; takes no parameters.
+    const data = JSON.stringify(structure).replaceAll("<", "\\u003c"); // "<" becomes \u003c so the JSON cannot emit a literal "</script>"
+    const flowData = JSON.stringify(flow).replaceAll("<", "\\u003c"); // same escaping; both are interpolated into the inline <script> below
+    return `<!doctype html>
+<html lang="en">
+<head>
+<meta charset="utf-8" />
+<meta name="viewport" content="width=device-width, initial-scale=1" />
+<title>Experiment repository structure</title>
+<style>
+:root {
+    color-scheme: light dark;
+}
+* {
+    box-sizing: border-box;
+}
+body {
+    margin: 0;
+    background: var(--background-color-default, #ffffff);
+    color: var(--text-color-default, #1f2328);
+    font-family: var(--font-sans, -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif);
+    font-size: var(--text-body-medium, 14px);
+    line-height: var(--leading-body-medium, 20px);
+}
+main {
+    min-height: 100vh;
+    display: grid;
+    grid-template-rows: auto auto 1fr;
+}
+header {
+    padding: 20px 24px 14px;
+    border-bottom: 1px solid var(--border-color-default, #d0d7de);
+    background: linear-gradient(135deg, var(--background-color-default, #fff), rgba(47, 129, 247, 0.08));
+}
+h1 {
+    margin: 0 0 8px;
+    font-family: var(--font-sans-display, var(--font-sans, inherit));
+    font-size: var(--text-title-large, 26px);
+    line-height: var(--leading-title-large, 32px);
+    font-weight: var(--font-weight-semibold, 600);
+}
+.lede {
+    max-width: 980px;
+    color: var(--text-color-muted, #57606a);
+    margin: 0;
+}
+.flow {
+    display: grid;
+    grid-template-columns: repeat(6, minmax(120px, 1fr));
+    gap: 8px;
+    padding: 14px 24px;
+    border-bottom: 1px solid var(--border-color-default, #d0d7de);
+    background: color-mix(in srgb, var(--background-color-default, #fff) 94%, var(--true-color-blue, #0969da));
+}
+.flow-step {
+    border: 1px solid var(--border-color-default, #d0d7de);
+    border-radius: 12px;
+    padding: 10px;
+    background: var(--background-color-default, #ffffff);
+    position: relative;
+    min-height: 82px;
+}
+.flow-step:not(:last-child)::after {
+    content: ">";
+    position: absolute;
+    right: -12px;
+    top: 31px;
+    color: var(--text-color-muted, #57606a);
+    font-weight: var(--font-weight-semibold, 600);
+}
+.flow-title {
+    display: block;
+    font-weight: var(--font-weight-semibold, 600);
+    margin-bottom: 6px;
+}
+.flow-path {
+    color: var(--text-color-muted, #57606a);
+    font-family: var(--font-mono, ui-monospace, SFMono-Regular, Consolas, monospace);
+    font-size: var(--text-code-inline, 12px);
+    overflow-wrap: anywhere;
+}
+.workspace {
+    display: grid;
+    grid-template-columns: minmax(360px, 48%) minmax(320px, 1fr);
+    min-height: 0;
+}
+.left {
+    border-right: 1px solid var(--border-color-default, #d0d7de);
+    min-width: 0;
+    overflow: auto;
+}
+.right {
+    min-width: 0;
+    overflow: auto;
+    background: color-mix(in srgb, var(--background-color-default, #fff) 97%, var(--true-color-blue, #0969da));
+}
+.controls {
+    position: sticky;
+    top: 0;
+    z-index: 1;
+    padding: 12px 16px;
+    border-bottom: 1px solid var(--border-color-default, #d0d7de);
+    background: var(--background-color-default, #ffffff);
+}
+.search {
+    width: 100%;
+    border: 1px solid var(--border-color-default, #d0d7de);
+    border-radius: 8px;
+    padding: 8px 10px;
+    color: var(--text-color-default, #1f2328);
+    background: var(--background-color-default, #ffffff);
+    font: inherit;
+}
+.chips {
+    display: flex;
+    flex-wrap: wrap;
+    gap: 6px;
+    margin-top: 10px;
+}
+button {
+    font: inherit;
+}
+.chip {
+    border: 1px solid var(--border-color-default, #d0d7de);
+    color: var(--text-color-default, #1f2328);
+    background: var(--background-color-default, #ffffff);
+    border-radius: 999px;
+    padding: 4px 9px;
+    cursor: pointer;
+}
+.chip.active {
+    border-color: var(--true-color-blue, #0969da);
+    background: var(--true-color-blue-muted, rgba(47, 129, 247, 0.12));
+}
+.tree {
+    padding: 12px 12px 28px;
+}
+.node {
+    display: grid;
+    grid-template-columns: 20px 1fr auto;
+    align-items: center;
+    gap: 7px;
+    width: 100%;
+    border: 0;
+    border-radius: 8px;
+    background: transparent;
+    color: var(--text-color-default, #1f2328);
+    padding: 7px 8px;
+    text-align: left;
+    cursor: pointer;
+}
+.node:hover,
+.node.selected {
+    background: var(--true-color-blue-muted, rgba(47, 129, 247, 0.12));
+}
+.twisty {
+    color: var(--text-color-muted, #57606a);
+    width: 16px;
+    display: inline-block;
+}
+.label {
+    min-width: 0;
+}
+.label code,
+.path,
+.command {
+    font-family: var(--font-mono, ui-monospace, SFMono-Regular, Consolas, monospace);
+}
+.badge {
+    border-radius: 999px;
+    border: 1px solid var(--border-color-default, #d0d7de);
+    color: var(--text-color-muted, #57606a);
+    padding: 1px 7px;
+    font-size: 12px;
+}
+.children {
+    margin-left: 20px;
+    padding-left: 9px;
+    border-left: 1px solid var(--border-color-default, #d0d7de);
+}
+.detail {
+    padding: 22px 24px 36px;
+    max-width: 920px;
+}
+.detail h2 {
+    margin: 0 0 4px;
+    font-size: 22px;
+    line-height: 28px;
+}
+.detail .path {
+    display: inline-block;
+    margin: 6px 0 14px;
+    color: var(--text-color-muted, #57606a);
+    background: color-mix(in srgb, var(--background-color-default, #fff) 90%, var(--true-color-blue, #0969da));
+    border: 1px solid var(--border-color-default, #d0d7de);
+    border-radius: 8px;
+    padding: 5px 8px;
+    overflow-wrap: anywhere;
+}
+.meta {
+    display: grid;
+    grid-template-columns: repeat(3, minmax(0, 1fr));
+    gap: 8px;
+    margin: 10px 0 16px;
+}
+.meta-card,
+.callout,
+.commands {
+    border: 1px solid var(--border-color-default, #d0d7de);
+    border-radius: 12px;
+    background: var(--background-color-default, #ffffff);
+}
+.meta-card {
+    padding: 10px;
+}
+.meta-card strong {
+    display: block;
+    margin-bottom: 3px;
+}
+.meta-card span {
+    color: var(--text-color-muted, #57606a);
+}
+.callout {
+    padding: 13px 14px;
+    margin: 12px 0;
+}
+.callout strong {
+    display: block;
+    margin-bottom: 5px;
+}
+.commands {
+    margin-top: 12px;
+    padding: 12px;
+}
+.commands h3 {
+    margin: 0 0 8px;
+    font-size: 15px;
+}
+.command {
+    display: block;
+    padding: 7px 8px;
+    border-radius: 7px;
+    background: color-mix(in srgb, var(--background-color-default, #fff) 92%, var(--true-color-blue, #0969da));
+    margin: 6px 0;
+    overflow-wrap: anywhere;
+}
+.legend {
+    display: flex;
+    flex-wrap: wrap;
+    gap: 8px;
+    margin-top: 10px;
+}
+.legend span {
+    display: inline-flex;
+    align-items: center;
+    gap: 5px;
+    color: var(--text-color-muted, #57606a);
+    font-size: 12px;
+}
+.dot {
+    width: 9px;
+    height: 9px;
+    border-radius: 999px;
+    display: inline-block;
+}
+.source { background: var(--true-color-blue, #0969da); }
+.task { background: var(--true-color-green, #1a7f37); }
+.run { background: var(--true-color-purple, #8250df); }
+.trial { background: var(--true-color-orange, #bc4c00); }
+.analysis { background: var(--true-color-red, #cf222e); }
+.derived { background: var(--true-color-yellow, #9a6700); }
+.guidance { background: var(--text-color-muted, #57606a); }
+.legacy { background: #8c959f; }
+.root { background: var(--text-color-default, #1f2328); }
+@media (max-width: 980px) {
+    .flow,
+    .workspace {
+        grid-template-columns: 1fr;
+    }
+    .flow-step:not(:last-child)::after {
+        display: none;
+    }
+    .left {
+        border-right: 0;
+        border-bottom: 1px solid var(--border-color-default, #d0d7de);
+        max-height: 58vh;
+    }
+    .meta {
+        grid-template-columns: 1fr;
+    }
+}
+</style>
+</head>
+<body>
+<main>
+<header>
+    <h1>Experiment repository structure</h1>
+    <p class="lede">A didactic map of a Pier-first copilot-experiments repo. New runs use <code>jobs/&lt;job-name&gt;/&lt;run-id&gt;/</code>, and <code>copilot-experiments list</code> prints the selectors accepted by <code>show</code>, <code>inspect</code>, and <code>analyze</code>.</p>
+    <div class="legend" id="legend"></div>
+</header>
+<section class="flow" id="flow"></section>
+<section class="workspace">
+    <div class="left">
+        <div class="controls">
+            <input class="search" id="search" placeholder="Search paths, purpose, commands..." aria-label="Search structure" />
+            <div class="chips" id="chips"></div>
+        </div>
+        <div class="tree" id="tree"></div>
+    </div>
+    <div class="right">
+        <article class="detail" id="detail"></article>
+    </div>
+</section>
+</main>
+<script>
+const STRUCTURE = ${data};
+const FLOW = ${flowData};
+const KINDS = ["all", "source", "task", "run", "trial", "analysis", "derived", "guidance", "legacy"];
+let selectedKind = "all";
+let selectedId = "run-dir";
+let query = "";
+
+const byId = new Map(STRUCTURE.map(function(node) { return [node.id, node]; }));
+const children = new Map();
+for (const node of STRUCTURE) {
+    const key = node.parent || "__root__";
+    if (!children.has(key)) children.set(key, []);
+    children.get(key).push(node);
+}
+
+function esc(value) {
+    return String(value)
+        .replaceAll("&", "&amp;")
+        .replaceAll("<", "&lt;")
+        .replaceAll(">", "&gt;")
+        .replaceAll('"', "&quot;");
+}
+
+function matches(node) {
+    const haystack = [node.label, node.path, node.kind, node.badge, node.owner, node.source, node.why].concat(node.commands || []).join(" ").toLowerCase();
+    const kindOk = selectedKind === "all" || node.kind === selectedKind;
+    const queryOk = !query || haystack.includes(query.toLowerCase());
+    return kindOk && queryOk;
+}
+
+function branchHasMatch(node) {
+    if (matches(node)) return true;
+    return (children.get(node.id) || []).some(branchHasMatch);
+}
+
+function renderFlow() {
+    document.getElementById("flow").innerHTML = FLOW.map(function(step) {
+        return '<div class="flow-step"><span class="flow-title">' + esc(step[0]) + '</span><span class="flow-path">' + esc(step[1]) + '</span></div>';
+    }).join("");
+}
+
+function renderLegend() {
+    const kinds = ["source", "task", "run", "trial", "analysis", "derived", "guidance", "legacy"];
+    document.getElementById("legend").innerHTML = kinds.map(function(kind) {
+        return '<span><i class="dot ' + kind + '"></i>' + esc(kind) + '</span>';
+    }).join("");
+}
+
+function renderChips() {
+    document.getElementById("chips").innerHTML = KINDS.map(function(kind) {
+        const active = kind === selectedKind ? " active" : "";
+        return '<button class="chip' + active + '" data-kind="' + esc(kind) + '">' + esc(kind) + '</button>';
+    }).join("");
+    for (const button of document.querySelectorAll(".chip")) {
+        button.addEventListener("click", function() {
+            selectedKind = button.dataset.kind;
+            render();
+        });
+    }
+}
+
+function renderTreeNode(node) {
+    if (!branchHasMatch(node)) return "";
+    const kids = children.get(node.id) || [];
+    const selected = node.id === selectedId ? " selected" : "";
+    const visibleKids = kids.map(renderTreeNode).join("");
+    const twisty = kids.length ? "▾" : "";
+    return '<div class="branch">' +
+        '<button class="node' + selected + '" data-id="' + esc(node.id) + '">' +
+        '<span class="twisty">' + twisty + '</span>' +
+        '<span class="label"><span class="dot ' + esc(node.kind) + '"></span> <code>' + esc(node.label) + '</code></span>' +
+        '<span class="badge">' + esc(node.badge) + '</span>' +
+        '</button>' +
+        (visibleKids ? '<div class="children">' + visibleKids + '</div>' : '') +
+        '</div>';
+}
+
+function renderTree() {
+    const roots = children.get("__root__") || [];
+    document.getElementById("tree").innerHTML = roots.map(renderTreeNode).join("") || '<p class="lede">No matching nodes.</p>';
+    for (const button of document.querySelectorAll(".node")) {
+        button.addEventListener("click", function() {
+            selectedId = button.dataset.id;
+            renderDetail();
+            renderTree();
+        });
+    }
+}
+
+function renderDetail() {
+    const node = byId.get(selectedId) || STRUCTURE[0];
+    const commands = (node.commands || []).length
+        ? '<div class="commands"><h3>Useful command selectors</h3>' + node.commands.map(function(command) {
+            return '<code class="command">' + esc(command) + '</code>';
+        }).join("") + '</div>'
+        : '<div class="commands"><h3>Useful command selectors</h3><span class="lede">No direct command; this node supports nearby run or analysis commands.</span></div>';
+    document.getElementById("detail").innerHTML =
+        '<h2>' + esc(node.label) + '</h2>' +
+        '<code class="path">' + esc(node.path) + '</code>' +
+        '<div class="meta">' +
+        '<div class="meta-card"><strong>Category</strong><span>' + esc(node.kind) + '</span></div>' +
+        '<div class="meta-card"><strong>Owner</strong><span>' + esc(node.owner) + '</span></div>' +
+        '<div class="meta-card"><strong>Status</strong><span>' + esc(node.badge) + '</span></div>' +
+        '</div>' +
+        '<div class="callout"><strong>What it is</strong>' + esc(node.source) + '</div>' +
+        '<div class="callout"><strong>Why it exists</strong>' + esc(node.why) + '</div>' +
+        commands;
+}
+
+function render() {
+    renderChips();
+    renderTree();
+    renderDetail();
+}
+
+document.getElementById("search").addEventListener("input", function(event) {
+    query = event.target.value;
+    renderTree();
+});
+
+renderFlow();
+renderLegend();
+render();
+</script>
+</body>
+</html>`;
+}
+
+// Serve the canvas page (and its raw data at /structure.json) on an ephemeral loopback port.
+// Resolves to { server, url }; rejects if the port cannot be bound.
+async function startServer(instanceId) {
+    const server = createServer((req, res) => {
+        const wantsJson = req.url === "/structure.json";
+        res.setHeader("Content-Type", `${wantsJson ? "application/json" : "text/html"}; charset=utf-8`);
+        res.end(wantsJson ? JSON.stringify({ structure, flow }) : renderHtml(instanceId));
+    });
+    await new Promise((resolve, reject) => {
+        server.once("error", reject); // a failed bind emits "error" and never calls the listen callback
+        server.listen(0, "127.0.0.1", () => { server.off("error", reject); resolve(); });
+    });
+    const address = server.address();
+    const port = typeof address === "object" && address ? address.port : 0;
+    return { server, url: `http://127.0.0.1:${port}/` };
+}
+
+// Register the canvas with the session: a "summarize" action plus open/close hooks
+// that start and stop one loopback HTTP server per canvas instance (tracked in `servers`).
+await joinSession({
+    canvases: [
+        createCanvas({
+            id: "experiment-repository-structure",
+            displayName: "Experiment repository structure",
+            description: "Interactive didactic map of a Pier-first copilot-experiments repository layout.",
+            actions: [
+                {
+                    name: "summarize",
+                    description: "Return a concise summary of the experiment repository structure.",
+                    // Fixed summary strings plus the current number of structure nodes.
+                    handler: async () => {
+                        const layout = "Pier runs live at jobs/<job-name>/<run-id>/.";
+                        const selector = "Use copilot-experiments list, then pass job-name/run-id to show, inspect, or analyze.";
+                        const sourceOfTruth = "jobs/ and legacy results/ on disk; results/index.db is derived.";
+                        return { layout, selector, sourceOfTruth, nodes: structure.length };
+                    },
+                },
+            ],
+            open: async ({ instanceId }) => {
+                // Reuse the server if this instance already has one; otherwise start it.
+                if (!servers.has(instanceId)) {
+                    servers.set(instanceId, await startServer(instanceId));
+                }
+                const { url } = servers.get(instanceId);
+                const title = "Experiment repository structure";
+                return { title, status: "Pier-first layout with copyable run selectors", url };
+            },
+            onClose: async ({ instanceId }) => {
+                const running = servers.get(instanceId);
+                if (!running) return;
+                // Forget the entry first so a later open starts a fresh server.
+                servers.delete(instanceId);
+                await new Promise((done) => running.server.close(() => done()));
+            },
+        }),
+    ],
+});
diff --git a/README.md b/README.md
index 1e440d9..e602e70 100644
--- a/README.md
+++ b/README.md
@@ -15,7 +15,7 @@ flowchart LR
     J --> P["Pier backend\nsandbox + verifier + artifacts"]
     P --> A["copilot-cli installed agent\nreal copilot binary"]
     A --> S["native Copilot\ncopilot-session/**/events.jsonl"]
-    P --> O["jobs/<job>/\nPier result.json + trials"]
+    P --> O["jobs/<job>/<run-id>/\nPier result.json + trials"]
     S --> C["Copilot-native analysis\nAIU, tokens, tools, turns"]
     O --> R["summary.json / summary.md\nshow / inspect / analyze"]
     O --> I["results/index.db\nderived SQLite index"]
@@ -46,6 +46,7 @@ uv run copilot-experiments run --dry-run
 
 # run for real through Pier
 uv run copilot-experiments run
+uv run copilot-experiments list
 uv run copilot-experiments show --last
 uv run copilot-experiments analyze --last
 ```
@@ -60,6 +61,7 @@ export COPILOT_EXPERIMENTS_REPO=/path/to/github-copilot-lab
 uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments init my-experiments
 cd my-experiments
 uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments run --dry-run
+uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments list
 uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments show --last
 ```
 
@@ -74,10 +76,14 @@ login`) and a Pier-supported execution backend such as Docker. `run` preflights
 backend before creating a job; for Docker this checks the CLI, Compose plugin, and daemon
 connection so missing WSL integration fails before an empty Pier job is recorded.
 
-Each `run` is a new measurement. If the configured Pier `job_name` already exists under `jobs/`,
-`copilot-experiments` writes the rerun to a timestamped job name instead of silently reusing the
-completed job. Pass `--resume` only when you intentionally want Pier's native resume behavior for an
-interrupted job.
+Each `run` is a new measurement. The configured Pier `job_name` remains the stable experiment
+identity, while each execution gets a timestamped run directory under
+`jobs/<job_name>/<run-id>/`. Pass `--resume` only when you intentionally want to reuse the latest
+run directory for that job and let Pier skip already-completed trials.
+
+Use `copilot-experiments list` after a run to copy the selector for a concrete execution. Pier
+selectors use `job-name/run-id`; passing just `job-name` selects that job's latest run, while
+`--last` selects the most recent stored run overall.
 
 ## Bundled examples
 
@@ -96,13 +102,13 @@ uv run copilot-experiments analyze --root examples/tracer_bullet --last
 | --- | --- |
 | `init <dir>` | Scaffold a standalone Pier experiment repository. |
 | `deepswe-import <path>` | Generate a Pier job config for a cloned DeepSWE checkout, `tasks/` corpus, or single task. |
-| `run [name]` | Discover Pier job configs in `experiments/` and run them. Reruns create a fresh timestamped Pier job when the configured name already exists. Falls back to legacy Python experiments when no Pier configs exist. |
+| `run [name]` | Discover Pier job configs in `experiments/` and run them. Each run writes to a fresh `jobs/<job>/<run-id>/` directory. Falls back to legacy Python experiments when no Pier configs exist. |
 | `run --dry-run` | Validate Pier job configs, or run the legacy ephemeral mock dry-run for legacy experiments. |
 | `run --resume` | Resume an existing Pier job directory and skip already-completed matching trials. |
-| `list` | List Pier job configs, legacy experiments, and stored jobs/runs. |
-| `show <job>` / `show --last` | Print a summary for a Pier job or legacy run. |
-| `analyze <job>` / `analyze --last` / `analyze --file <events.jsonl>` | Render a rich overview of a native Copilot session log. |
-| `inspect <job>` | Drill into stored trials and status. |
+| `list` | List Pier job configs, legacy experiments, and copyable run selectors. |
+| `show <selector>` / `show --last` | Print a summary for a Pier run (`job` or `job/run`) or legacy run id. |
+| `analyze <selector>` / `analyze --last` / `analyze --file <events.jsonl>` | Render a rich overview of a native Copilot session log. |
+| `inspect <selector>` | Drill into stored trials and status for a Pier run (`job` or `job/run`) or legacy run id. |
 | `reindex` | Rebuild the derived SQLite index from `jobs/` and legacy `results/`. |
 
 ## Documentation
diff --git a/docs/adr/0019-use-nested-pier-run-directories.md b/docs/adr/0019-use-nested-pier-run-directories.md
new file mode 100644
index 0000000..1f49d28
--- /dev/null
+++ b/docs/adr/0019-use-nested-pier-run-directories.md
@@ -0,0 +1,51 @@
+# 0019. Use nested Pier run directories
+
+- **Status:** Accepted
+- **Date:** 2026-06-27
+- **Deciders:** Project maintainers
+
+## Context
+
+Pier names each job output directory from `job_name`. Re-running the same experiment with the same
+`job_name` would naturally target the same directory, while the previous harness behavior created
+the first run at `jobs/<job-name>/` and later reruns at timestamp-suffixed sibling directories such
+as `jobs/<job-name>-20260620-153000/`.
+
+That mixed stable identity and concrete execution identity in one string. It also made command-line
+lookup unclear: users could pass `--last`, but it was not obvious how to discover a run id, how to
+select an earlier run, or whether a suffixed directory was a new job or a rerun of the same job.
+
+The filesystem remains the source of truth, and `results/index.db` remains a derived cache. Existing
+flat Pier job directories must remain readable during migration.
+
+## Decision
+
+We will store new Pier executions under `jobs/<job-name>/<run-id>/`.
+
+The configured `job_name` is the stable experiment identity. Each concrete execution gets a
+timestamp run id, with numeric collision suffixes when needed. The harness runs Pier by setting
+Pier's `jobs_dir` to `jobs/<job-name>` and Pier's concrete `job_name` to the run id, then writes a
+`copilot-experiments-run.json` manifest into the job output so summaries, indexing, and lookup can
+recover the stable job name and concrete run id.
+
+The CLI will expose copyable selectors through `copilot-experiments list`:
+
+- `job-name/run-id` selects one exact Pier run.
+- `job-name` selects the latest run for that Pier job.
+- `--last` selects the most recent stored run overall.
+
+Legacy flat Pier jobs at `jobs/<job-name>/` remain discoverable and resumable.
+
+## Consequences
+
+The output tree now separates stable job identity from repeated measurements, so reruns are easier
+to compare and explain. `show`, `inspect`, and `analyze` can address exact runs without adding a
+parallel command family.
+
+The harness owns a small manifest file in each new Pier run directory because Pier's native
+`config.json` only knows the concrete run id once the job is launched. Discovery must avoid
+mistaking legacy flat job trial directories for nested runs; nested child directories under a legacy
+flat job are treated as runs only when they contain the harness manifest.
+
+Older flat jobs remain supported, but new documentation and generated experiment repos should teach
+the nested layout and `list`-driven selector workflow.
diff --git a/docs/adr/README.md b/docs/adr/README.md
index d3b3897..d2d933c 100644
--- a/docs/adr/README.md
+++ b/docs/adr/README.md
@@ -35,3 +35,4 @@ We follow the lightweight format popularized by
 | [0016](0016-use-deepswe-for-large-benchmark-protocols.md) | Use DeepSWE for large benchmark protocols | Accepted |
 | [0017](0017-import-deepswe-as-pier-dataset.md) | Import DeepSWE as a Pier dataset config | Accepted |
 | [0018](0018-adopt-pytest-cov-for-local-coverage-analysis.md) | Adopt pytest-cov for local coverage analysis | Accepted |
+| [0019](0019-use-nested-pier-run-directories.md) | Use nested Pier run directories | Accepted |
diff --git a/docs/analysis.md b/docs/analysis.md
index 5fe3c34..3e4b366 100644
--- a/docs/analysis.md
+++ b/docs/analysis.md
@@ -26,9 +26,15 @@ This page covers the second one and the `analyze` command that renders it.
 # Most recent Pier job (first trial by default)
 uv run copilot-experiments analyze --last
 
-# A specific Pier job / trial
+# Discover copyable selectors
+uv run copilot-experiments list
+
+# A specific Pier job's latest run / trial
 uv run copilot-experiments analyze tracer-bullet-textstats --trial 1
 
+# A specific Pier run / trial
+uv run copilot-experiments analyze tracer-bullet-textstats/20260620-153000 --trial 1
+
 # Any events.jsonl on disk — a stored trial log, or a live session under
 # ~/.copilot/session-state/<id>/events.jsonl
 uv run copilot-experiments analyze --file path/to/events.jsonl
diff --git a/docs/architecture.md b/docs/architecture.md
index 4a93272..938b656 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -19,7 +19,7 @@ flowchart TD
     AGENT --> ATIF["/logs/agent/trajectory.json"]
     OTEL --> ATIF
     JOB --> VERIFY["Pier verifier\ntests/test.sh -> reward.txt/json"]
-    JOB --> OUT["jobs/<job>/<trial>/"]
+    JOB --> OUT["jobs/<job>/<run-id>/<trial>/"]
     EVENTS --> ANALYSIS["sessionlog.py + analysis.py"]
     OTEL --> ANALYSIS
     ATIF --> FALLBACK["ATIF fallback metrics"]
@@ -75,7 +75,7 @@ During normalization, `name: copilot-cli` becomes
 
 ## Design invariants
 
-1. **Pier jobs are canonical.** `jobs/<job>/` is the primary source of truth for new runs.
+1. **Pier jobs are canonical.** `jobs/<job>/<run-id>/` is the primary source of truth for new runs.
 2. **SQLite is derived.** `results/index.db` can be rebuilt from `jobs/` and legacy `results/`.
 3. **Copilot logs are primary for Copilot metrics.** ATIF is a fallback and cross-agent view.
 4. **Copilot CLI is not reimplemented.** The installed agent shells out to the real CLI.
diff --git a/docs/authoring-experiments.md b/docs/authoring-experiments.md
index 24b6cb8..3f749f6 100644
--- a/docs/authoring-experiments.md
+++ b/docs/authoring-experiments.md
@@ -139,6 +139,7 @@ The generated config uses `datasets:` for a corpus and `tasks:` for a single tas
 ```bash
 uv run copilot-experiments run --dry-run
 uv run copilot-experiments run
+uv run copilot-experiments list
 uv run copilot-experiments show --last
 uv run copilot-experiments analyze --last --trial 1
 ```
@@ -152,6 +153,7 @@ export COPILOT_EXPERIMENTS_REPO=/path/to/github-copilot-lab
 
 uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments run --dry-run
 uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments run
+uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments list
 uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments show --last
 uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments analyze --last --trial 1
 ```
@@ -171,8 +173,13 @@ common WSL/Docker Desktop integration issues before a trial can fail without Cop
 
 Pier itself resumes existing matching job directories and skips trials that already have
 `result.json`. `copilot-experiments run` treats a plain rerun as a fresh measurement instead: when
-`jobs/<job_name>/` already exists, it appends a timestamp to the Pier job name for the new run. Pass
-`--resume` to opt into Pier's native resume behavior for interrupted jobs.
+a job runs, the configured `job_name` is the stable grouping directory and each execution gets a
+timestamped run id under `jobs/<job_name>/<run-id>/`. Pass `--resume` to reuse the latest existing
+run directory for that job and opt into Pier's native skip-completed-trials behavior.
+
+After a run, `copilot-experiments list` prints copyable selectors. Use `job-name/run-id` to inspect
+or analyze an exact Pier execution, `job-name` for that job's latest run, or `--last` for the most
+recent stored run across all jobs.
 
 ## Legacy Python experiments
 
diff --git a/docs/collecting-run-data.md b/docs/collecting-run-data.md
index 43a42a8..c990dc4 100644
--- a/docs/collecting-run-data.md
+++ b/docs/collecting-run-data.md
@@ -3,7 +3,7 @@
 This page is the field guide for collecting the most complete record of what happened during a
 GitHub Copilot CLI session. The primary artifact is Copilot's native per-session JSONL event
 stream, `events.jsonl`. In the current Pier-first harness, that log is captured inside each Pier
-trial under `jobs/<job>/<trial>/agent/copilot-session/<session-id>/events.jsonl`.
+trial under `jobs/<job>/<run-id>/<trial>/agent/copilot-session/<session-id>/events.jsonl`.
 
 OpenTelemetry (OTel) export is a complementary live-observability path. ATIF
 `trajectory.json` is a cross-agent compatibility artifact and fallback, not a replacement for
@@ -80,25 +80,27 @@ After Pier downloads trial logs, the canonical experiment-repo layout is:
 ```text
 jobs/
   <job-name>/
-    config.json
-    result.json
-    summary.json          # derived by copilot-experiments
-    summary.md            # derived by copilot-experiments
-    <trial-name>/
+    <run-id>/
       config.json
       result.json
-      agent/
-        copilot-cli.jsonl
-        copilot-cli.txt
-        trajectory.json
-        copilot-otel.jsonl
-        copilot-session/
-          <session-id>/
-            events.jsonl
-      verifier/
-        reward.txt
-        reward.json
-      artifacts/
+      copilot-experiments-run.json
+      summary.json          # derived by copilot-experiments
+      summary.md            # derived by copilot-experiments
+      <trial-name>/
+        config.json
+        result.json
+        agent/
+          copilot-cli.jsonl
+          copilot-cli.txt
+          trajectory.json
+          copilot-otel.jsonl
+          copilot-session/
+            <session-id>/
+              events.jsonl
+        verifier/
+          reward.txt
+          reward.json
+        artifacts/
 ```
 
 `show`, `analyze`, `inspect`, and `reindex` read this tree. For Copilot trials they prefer native
diff --git a/docs/deepswe.md b/docs/deepswe.md
index 5aa5bcf..5b36fac 100644
--- a/docs/deepswe.md
+++ b/docs/deepswe.md
@@ -61,6 +61,7 @@ Validate and run it like any other Pier experiment:
 ```bash
 uv run copilot-experiments run --dry-run
 uv run copilot-experiments run deepswe-smoke
+uv run copilot-experiments list
 uv run copilot-experiments show --last
 uv run copilot-experiments analyze --last --trial 1
 ```
diff --git a/docs/results-format.md b/docs/results-format.md
index e81e98a..06e533a 100644
--- a/docs/results-format.md
+++ b/docs/results-format.md
@@ -11,36 +11,40 @@ For a source-by-source explanation of what can be captured around a Copilot CLI
 ```
 jobs/
   <job-name>/
-    config.json
-    result.json
-    summary.json          # written by copilot-experiments
-    summary.md            # written by copilot-experiments
-    <trial-name>/
+    <run-id>/
       config.json
       result.json
-      agent/
-        copilot-cli.jsonl
-        copilot-cli.txt
-        trajectory.json
-        copilot-otel.jsonl   # Copilot OTel file export, when no custom OTLP destination overrides it
-        copilot-session/
-          <session-id>/
-            events.jsonl
-      verifier/
-        reward.txt
-        reward.json
-      artifacts/
+      copilot-experiments-run.json
+      summary.json          # written by copilot-experiments
+      summary.md            # written by copilot-experiments
+      <trial-name>/
+        config.json
+        result.json
+        agent/
+          copilot-cli.jsonl
+          copilot-cli.txt
+          trajectory.json
+          copilot-otel.jsonl   # Copilot OTel file export, when no custom OTLP destination overrides it
+          copilot-session/
+            <session-id>/
+              events.jsonl
+        verifier/
+          reward.txt
+          reward.json
+        artifacts/
 ```
 
 Pier owns `config.json`, `result.json`, trial directories, logs, verifier outputs, and artifact
-download. `copilot-experiments` derives summaries and indexes from that tree.
+download. `copilot-experiments` adds `copilot-experiments-run.json` to preserve the stable
+`job_name` plus concrete `run_id`, then derives summaries and indexes from that tree.
 
 ## Key files
 
 | File | Meaning |
 | --- | --- |
-| `jobs/<job>/result.json` | Pier job-level status and stats. |
-| `jobs/<job>/<trial>/result.json` | Pier trial status, agent info, verifier result, exceptions, timings. |
+| `jobs/<job>/<run-id>/result.json` | Pier job-level status and stats for one execution. |
+| `jobs/<job>/<run-id>/copilot-experiments-run.json` | Stable job name and concrete run id used by summaries, lookup, and indexing. |
+| `jobs/<job>/<run-id>/<trial>/result.json` | Pier trial status, agent info, verifier result, exceptions, timings. |
 | `agent/trajectory.json` | ATIF trajectory emitted by the installed agent. Copilot agent steps include OTel per-LLM-call metrics when `copilot-otel.jsonl` is available; the file is also used as a fallback for non-Copilot agents. |
 | `agent/copilot-cli.jsonl` / `.txt` | Raw Copilot CLI output streams. Useful for auth or CLI failures. |
 | `agent/copilot-session/**/events.jsonl` | Native Copilot session log. Primary source for Copilot turns, tool calls, tokens, AIU, and analysis. |
@@ -72,9 +76,10 @@ their `results/<experiment>/<run>/.../trials/<NNN>/` layout.
 New Pier tables:
 
 ```sql
-pier_jobs(job_name PK, job_dir, started_at, finished_at, n_trials, success_rate, status)
-pier_trials(id PK, job_name, variant_slug, task_slug, trial_name, success, status,
-            n_turns, n_tool_calls, total_tokens, aiu, model, error)
+pier_jobs(id PK, job_name, run_id, job_dir, started_at, finished_at, n_trials,
+          success_rate, status)
+pier_trials(id PK, job_id, job_name, run_id, variant_slug, task_slug, trial_name,
+            success, status, n_turns, n_tool_calls, total_tokens, aiu, model, error)
 ```
 
 Legacy tables (`experiments`, `runs`, `variants`, `tasks`, `trials`) remain for old Python runs.
@@ -82,11 +87,17 @@ Legacy tables (`experiments`, `runs`, `variants`, `tasks`, `trials`) remain for
 ## Analyzing a trial
 
 ```bash
+uv run copilot-experiments list
 uv run copilot-experiments analyze --last --trial 1
 uv run copilot-experiments analyze <job-name> --trial 1
-uv run copilot-experiments analyze --file jobs/<job>/<trial>/agent/copilot-session/.../events.jsonl
+uv run copilot-experiments analyze <job-name>/<run-id> --trial 1
+uv run copilot-experiments analyze --file jobs/<job>/<run-id>/<trial>/agent/copilot-session/.../events.jsonl
 ```
 
+`list` is the discovery command for run ids. For Pier outputs, its `selector (job/run)` column is
+the exact string accepted by `show`, `inspect`, and `analyze`. Passing only `<job-name>` selects
+that job's latest run; passing `<job-name>/<run-id>` selects one concrete execution.
+
 If the selected Pier trial has no native Copilot `events.jsonl`, `analyze` falls back to
 `agent/trajectory.json` when present; otherwise it reports that no Copilot session log or
 trajectory is available. When Pier recorded a trial exception before the agent ran, `analyze`
diff --git a/src/copilot_experiments/cli.py b/src/copilot_experiments/cli.py
index fe858b5..4bd797d 100644
--- a/src/copilot_experiments/cli.py
+++ b/src/copilot_experiments/cli.py
@@ -5,7 +5,9 @@
 import importlib.util
 import sys
 from collections.abc import Callable
+from dataclasses import dataclass
 from pathlib import Path
+from typing import Literal
 
 import typer
 from rich.console import Console
@@ -29,7 +31,9 @@
 from .pier_results import (
     describe_missing_pier_analysis_source,
     iter_pier_trial_summaries,
+    pier_job_label,
     resolve_pier_trial_analysis_source,
+    write_pier_run_manifest,
     write_pier_summary,
 )
 from .render import render_session_analysis
@@ -67,6 +71,13 @@ def _force_utf8_streams() -> None:
 err = Console(stderr=True)
 
 
+@dataclass(frozen=True)
+class ResolvedRun:  # a stored run located from a CLI selector or --last
+    kind: Literal["legacy", "pier"]  # which storage layout `path` belongs to
+    path: Path  # legacy run directory or Pier job directory
+    selector: str  # legacy run id / directory name, or pier_job_label() of a Pier run
+
+
 # --------------------------------------------------------------------------- #
 # Experiment discovery
 # --------------------------------------------------------------------------- #
@@ -318,12 +329,13 @@ def run(
             if verbose:
                 prepared.config.debug = True
             inject_copilot_token(prepared.config, auth.token)
-            console.print(f"[bold]Running Pier job[/bold] {prepared.run_name}")
-            if prepared.renamed:
+            console.print(f"[bold]Running Pier job[/bold] {prepared.label}")
+            if prepared.resumed:
+                console.print(f"[dim]resume:[/dim] reusing existing Pier run {prepared.label}")
+            else:
                 console.print(
-                    f"[dim]existing job[/dim] {prepared.requested_name} "
-                    f"[dim]found; writing fresh rerun to[/dim] {prepared.run_name} "
-                    "[dim](use --resume to reuse the existing job)[/dim]"
+                    f"[dim]run:[/dim] writing fresh run to "
+                    f"{Path(prepared.config.jobs_dir) / prepared.run_name}"
                 )
             try:
                 run_result = run_pier_job(prepared.config)
@@ -331,6 +343,11 @@ def run(
                 err.print(f"[red]Pier job failed:[/red] {type(exc).__name__}: {exc}")
                 any_failures = True
                 continue
+            write_pier_run_manifest(
+                run_result.job_dir,
+                job_name=prepared.requested_name,
+                run_id=prepared.run_name,
+            )
             summary = write_pier_summary(run_result.job_dir)
             _print_run_summary(summary)
             _warn_failed_pier_trials(run_result.job_dir)
@@ -407,7 +424,7 @@ def run(
 def list_cmd(
     root: Path | None = typer.Option(None, "--root", help="Experiment repository root."),
 ) -> None:
-    """List experiments and past runs."""
+    """List experiment definitions and concrete run selectors."""
     root = Path(root or Path.cwd())
     layout = Layout(root)
     pier_specs = discover_pier_job_configs(root)
@@ -437,11 +454,10 @@ def list_cmd(
         console.print(table)
 
     runs = index_list_runs(layout)
-    if not runs:
-        console.print("[dim]No runs yet.[/dim]")
-    else:
-        table = Table(title="Runs")
-        table.add_column("run id")
+    pier_jobs = layout.iter_pier_jobs()
+    if runs:
+        table = Table(title="Experiment runs")
+        table.add_column("selector")
         table.add_column("experiment")
         table.add_column("started")
         table.add_column("trials", justify="right")
@@ -457,20 +473,24 @@ def list_cmd(
             )
         console.print(table)
 
-    pier_jobs = layout.iter_pier_jobs()
     if not pier_jobs:
+        if not runs:
+            console.print("[dim]No runs yet.[/dim]")
         return
-    table = Table(title="Runs")
-    table.add_column("pier job")
+    table = Table(title="Pier runs")
+    table.add_column("selector (job/run)", no_wrap=True)
     table.add_column("started")
     table.add_column("trials", justify="right")
+    table.add_column("success", justify="right")
     table.add_column("status")
     for job_dir in pier_jobs:
         summary = write_pier_summary(job_dir)
+        sr = summary.get("overall_success_rate")
         table.add_row(
-            job_dir.name,
+            str(summary.get("pier_job_id") or pier_job_label(job_dir)),
             (summary.get("started_at") or "")[:19],
             str(summary.get("n_trials") or 0),
+            "-" if sr is None else f"{sr * 100:.0f}%",
             str(summary.get("status") or "-"),
         )
     console.print(table)
@@ -478,58 +498,59 @@ def list_cmd(
 
 @app.command()
 def show(
-    run_id: str | None = typer.Argument(None, help="Run id or unique prefix."),
-    last: bool = typer.Option(False, "--last", help="Show the most recent run."),
+    selector: str | None = typer.Argument(
+        None,
+        help=(
+            "Run selector from `list`: run id/prefix for legacy runs, Pier job for that "
+            "job's latest run, or Pier job/run id."
+        ),
+    ),
+    last: bool = typer.Option(False, "--last", help="Show the most recent stored run."),
     root: Path | None = typer.Option(None, "--root", help="Experiment repository root."),
 ) -> None:
     """Print a run summary and per-variant comparison."""
     root = Path(root or Path.cwd())
     layout = Layout(root)
-    pier_job = _resolve_pier_job(layout, last=last, run_id=run_id)
-    run_dir = (
-        None
-        if last and pier_job is not None
-        else (layout.latest_run() if last else (layout.find_run(run_id) if run_id else None))
-    )
-    if run_dir is None:
-        if pier_job is not None:
-            summary = write_pier_summary(pier_job)
-            _print_run_summary(summary)
-            console.print(f"\n[dim]{pier_job / 'summary.md'}[/dim]")
-            return
-    if run_dir is None:
-        err.print("[red]Run not found.[/red] Pass a run id or --last.")
+    resolved = _resolve_run(layout, last=last, selector=selector)
+    if resolved is None:
+        _print_run_not_found(selector)
         raise typer.Exit(1)
-    _print_run_summary(read_json(run_dir / "summary.json"))
-    console.print(f"\n[dim]{run_dir / 'summary.md'}[/dim]")
+    if resolved.kind == "pier":
+        summary = write_pier_summary(resolved.path)
+        _print_run_summary(summary)
+        console.print(f"\n[dim]{resolved.path / 'summary.md'}[/dim]")
+        return
+    _print_run_summary(read_json(resolved.path / "summary.json"))
+    console.print(f"\n[dim]{resolved.path / 'summary.md'}[/dim]")
 
 
 @app.command()
 def inspect(
-    run_id: str | None = typer.Argument(None, help="Run id or unique prefix."),
+    selector: str | None = typer.Argument(
+        None,
+        help=(
+            "Run selector from `list`: run id/prefix for legacy runs, Pier job for that "
+            "job's latest run, or Pier job/run id."
+        ),
+    ),
     variant: str | None = typer.Option(None, "--variant", help="Variant slug."),
     task: str | None = typer.Option(None, "--task", help="Task slug."),
     trial: int | None = typer.Option(None, "--trial", help="Trial number."),
     events: int = typer.Option(20, "--events", help="Number of session events to show."),
-    last: bool = typer.Option(False, "--last", help="Inspect the most recent run."),
+    last: bool = typer.Option(False, "--last", help="Inspect the most recent stored run."),
     root: Path | None = typer.Option(None, "--root", help="Experiment repository root."),
 ) -> None:
     """Drill into a run's variants, tasks, trials, and session events."""
     root = Path(root or Path.cwd())
     layout = Layout(root)
-    pier_job = _resolve_pier_job(layout, last=last, run_id=run_id)
-    run_dir = (
-        None
-        if last and pier_job is not None
-        else (layout.latest_run() if last else (layout.find_run(run_id) if run_id else None))
-    )
-    if run_dir is None:
-        if pier_job is not None:
-            _inspect_pier_job(pier_job)
-            return
-    if run_dir is None:
-        err.print("[red]Run not found.[/red] Pass a run id or --last.")
+    resolved = _resolve_run(layout, last=last, selector=selector)
+    if resolved is None:
+        _print_run_not_found(selector)
         raise typer.Exit(1)
+    if resolved.kind == "pier":
+        _inspect_pier_job(resolved.path)
+        return
+    run_dir = resolved.path
 
     variants_dir = run_dir / "variants"
     if variant is None:
@@ -604,7 +625,13 @@ def inspect(
 
 @app.command()
 def analyze(
-    run_id: str | None = typer.Argument(None, help="Run id or unique prefix."),
+    selector: str | None = typer.Argument(
+        None,
+        help=(
+            "Run selector from `list`: run id/prefix for legacy runs, Pier job for that "
+            "job's latest run, or Pier job/run id."
+        ),
+    ),
     variant: str | None = typer.Option(None, "--variant", help="Variant slug (default: first)."),
     task: str | None = typer.Option(None, "--task", help="Task slug (default: first)."),
     trial: int | None = typer.Option(None, "--trial", help="Trial number (default: first)."),
@@ -614,7 +641,7 @@ def analyze(
     otel_file: Path | None = typer.Option(
         None, "--otel-file", help="Optional Copilot OTel JSONL file to enrich analysis."
     ),
-    last: bool = typer.Option(False, "--last", help="Analyze the most recent run."),
+    last: bool = typer.Option(False, "--last", help="Analyze the most recent stored run."),
     max_turns: int = typer.Option(0, "--max-turns", help="Limit timeline rows (0 = all)."),
     root: Path | None = typer.Option(None, "--root", help="Experiment repository root."),
 ) -> None:
@@ -632,37 +659,32 @@ def analyze(
 
     root = Path(root or Path.cwd())
     layout = Layout(root)
-    pier_job = _resolve_pier_job(layout, last=last, run_id=run_id)
-    run_dir = (
-        None
-        if last and pier_job is not None
-        else (layout.latest_run() if last else (layout.find_run(run_id) if run_id else None))
-    )
-    if run_dir is None:
-        if pier_job is not None:
-            source_path, label, source_kind, discovered_otel = resolve_pier_trial_analysis_source(
-                pier_job, trial
-            )
-            if source_path is None:
-                err.print(f"[red]No Copilot session log or trajectory found in[/red] {pier_job}")
-                diagnostic = describe_missing_pier_analysis_source(pier_job, trial)
-                if diagnostic:
-                    err.print(f"[yellow]{diagnostic}[/yellow]")
-                raise typer.Exit(1)
-            selected_otel = otel_file or discovered_otel
-            analysis = (
-                analyze_events(
-                    load_events(source_path),
-                    load_events(selected_otel) if selected_otel is not None else None,
-                )
-                if source_kind == "events"
-                else analyze_trajectory(read_json(source_path))
-            )
-            render_session_analysis(analysis, console, title=label, max_turns=max_turns)
-            return
-    if run_dir is None:
-        err.print("[red]Run not found.[/red] Pass a run id, --last, or --file.")
+    resolved = _resolve_run(layout, last=last, selector=selector)
+    if resolved is None:
+        _print_run_not_found(selector, file_hint=True)
         raise typer.Exit(1)
+    if resolved.kind == "pier":
+        source_path, label, source_kind, discovered_otel = resolve_pier_trial_analysis_source(
+            resolved.path, trial
+        )
+        if source_path is None:
+            err.print(f"[red]No Copilot session log or trajectory found in[/red] {resolved.path}")
+            diagnostic = describe_missing_pier_analysis_source(resolved.path, trial)
+            if diagnostic:
+                err.print(f"[yellow]{diagnostic}[/yellow]")
+            raise typer.Exit(1)
+        selected_otel = otel_file or discovered_otel
+        analysis = (
+            analyze_events(
+                load_events(source_path),
+                load_events(selected_otel) if selected_otel is not None else None,
+            )
+            if source_kind == "events"
+            else analyze_trajectory(read_json(source_path))
+        )
+        render_session_analysis(analysis, console, title=label, max_turns=max_turns)
+        return
+    run_dir = resolved.path
 
     events_path, label, discovered_otel = _resolve_trial_events(run_dir, variant, task, trial)
     if events_path is None:
@@ -740,14 +762,70 @@ def _resolve_trial_events(
     )
 
 
-def _resolve_pier_job(layout: Layout, *, last: bool, run_id: str | None) -> Path | None:
+def _resolve_run(layout: Layout, *, last: bool, selector: str | None) -> ResolvedRun | None:
     if last:
-        return layout.latest_pier_job()
-    if run_id:
-        return layout.find_pier_job(run_id)
+        return _latest_resolved_run(layout)
+    if selector is None:
+        return None
+
+    legacy = layout.find_run(selector)
+    pier = layout.find_pier_job(selector)
+    # A Pier match wins when no legacy run matches or the selector uses `job/run`
+    # syntax; otherwise a matching legacy run id takes precedence.
+    if pier is not None and (legacy is None or "/" in selector):
+        return ResolvedRun("pier", pier, pier_job_label(pier))
+    if legacy is not None:
+        return ResolvedRun("legacy", legacy, legacy.name)
     return None
 
 
+def _latest_resolved_run(layout: Layout) -> ResolvedRun | None:
+    """Return the run with the greatest ``(started_at, selector)`` across both layouts."""
+    ranked: list[tuple[tuple[str, str], ResolvedRun]] = []
+    for _slug, run_id, run_dir in layout.iter_runs():
+        rank = (_legacy_run_started_at(run_dir), run_id)
+        ranked.append((rank, ResolvedRun("legacy", run_dir, run_id)))
+    for job_dir in layout.iter_pier_jobs():
+        label = pier_job_label(job_dir)
+        rank = (_pier_run_started_at(job_dir), label)
+        ranked.append((rank, ResolvedRun("pier", job_dir, label)))
+    if not ranked:
+        return None
+    # Ranks are compared as plain strings; a missing timestamp is "" and sorts lowest.
+    return max(ranked, key=lambda entry: entry[0])[1]
+
+
+def _legacy_run_started_at(run_dir: Path) -> str:
+    """Return the first ``started_at`` found in summary.json, then run.json, else empty."""
+    for name in ("summary.json", "run.json"):
+        path = run_dir / name
+        started_at = read_json(path).get("started_at") if path.exists() else None
+        if started_at:  # a summary without a timestamp falls through to run.json
+            return str(started_at)
+    return ""
+
+
+def _pier_run_started_at(job_dir: Path) -> str:
+    """Return ``started_at`` from the run's result.json, or an empty string if unavailable."""
+    result_path = job_dir / "result.json"
+    payload = read_json(result_path) if result_path.exists() else {}
+    return str(payload.get("started_at") or "")
+
+
+def _print_run_not_found(selector: str | None, *, file_hint: bool = False) -> None:
+    """Print a not-found error and dim follow-up hints through the ``err`` console."""
+    if selector:
+        err.print(f"[red]Run not found:[/red] {selector!r}")
+    else:
+        err.print("[red]Run not found.[/red] Pass a run selector or --last.")
+    file_tip = "Use `--file path/to/events.jsonl` to analyze a session log directly."
+    hints = [
+        "Use `copilot-experiments list` to copy a selector.",
+        "Pier selectors look like `job-name/run-id`; `job-name` selects that job's latest run.",
+    ] + ([file_tip] if file_hint else [])
+    err.print("[dim]" + " ".join(hints) + "[/dim]")
+
+
 def _print_dry_run_report(report: DryRunReport) -> None:
     table = Table(title=f"Dry-run · {report.experiment}", show_lines=False)
     table.add_column("", justify="center", width=3)
@@ -851,7 +929,7 @@ def _warn_failed_pier_trials(job_dir: Path) -> None:
     if not problems:
         return
     err.print(
-        f"[yellow]Warning:[/yellow] Pier job [bold]{job_dir.name}[/bold] had "
+        f"[yellow]Warning:[/yellow] Pier job [bold]{pier_job_label(job_dir)}[/bold] had "
         f"{len(problems)} harness failure(s). Inspect the captured trial result:"
     )
     for line in problems:
@@ -860,11 +938,11 @@ def _warn_failed_pier_trials(job_dir: Path) -> None:
 
 def _inspect_pier_job(job_dir: Path) -> None:
     summary = write_pier_summary(job_dir)
-    console.print(f"[bold]Pier job[/bold]: {job_dir.name}")
+    console.print(f"[bold]Pier job[/bold]: {pier_job_label(job_dir)}")
     console.print(f"[bold]summary[/bold]: {job_dir / 'summary.json'}")
     _print_run_summary(summary)
 
-    table = Table(title=f"Trials in {job_dir.name}")
+    table = Table(title=f"Trials in {pier_job_label(job_dir)}")
     table.add_column("trial")
     table.add_column("status")
     table.add_column("success")
diff --git a/src/copilot_experiments/index.py b/src/copilot_experiments/index.py
index 1dfe6d1..16417b2 100644
--- a/src/copilot_experiments/index.py
+++ b/src/copilot_experiments/index.py
@@ -11,7 +11,7 @@
 from pathlib import Path
 
 from ._util import read_json
-from .pier_results import build_pier_summary, iter_pier_trial_summaries
+from .pier_results import build_pier_summary, iter_pier_trial_summaries, pier_job_identity
 from .storage import Layout
 
 SCHEMA = """
@@ -85,7 +85,9 @@
     error           TEXT
 );
 CREATE TABLE IF NOT EXISTS pier_jobs (
-    job_name       TEXT PRIMARY KEY,
+    id             TEXT PRIMARY KEY,
+    job_name       TEXT,
+    run_id         TEXT,
     job_dir        TEXT,
     started_at     TEXT,
     finished_at    TEXT,
@@ -95,7 +97,9 @@
 );
 CREATE TABLE IF NOT EXISTS pier_trials (
     id             INTEGER PRIMARY KEY AUTOINCREMENT,
+    job_id         TEXT,
     job_name       TEXT,
+    run_id         TEXT,
     variant_slug   TEXT,
     task_slug      TEXT,
     trial_name     TEXT,
@@ -124,6 +128,18 @@ def _migrate(conn: sqlite3.Connection) -> None:
         if column not in existing:
             conn.execute(ddl)
 
+    pier_job_columns = {row["name"] for row in conn.execute("PRAGMA table_info(pier_jobs)")}
+    pier_trial_columns = {row["name"] for row in conn.execute("PRAGMA table_info(pier_trials)")}
+    if (
+        pier_job_columns
+        and {"id", "run_id"} - pier_job_columns
+        or pier_trial_columns
+        and {"job_id", "run_id"} - pier_trial_columns
+    ):
+        conn.execute("DROP TABLE IF EXISTS pier_trials")
+        conn.execute("DROP TABLE IF EXISTS pier_jobs")
+        conn.executescript(SCHEMA)
+
 
 def connect(db_path: Path) -> sqlite3.Connection:
     db_path.parent.mkdir(parents=True, exist_ok=True)
@@ -256,15 +272,20 @@ def index_pier_job_dir(conn: sqlite3.Connection, job_dir: Path) -> None:
     """Insert (or replace) one Pier job into the derived index."""
 
     summary = build_pier_summary(job_dir)
-    job_name = job_dir.name
-    conn.execute("DELETE FROM pier_jobs WHERE job_name=?", (job_name,))
-    conn.execute("DELETE FROM pier_trials WHERE job_name=?", (job_name,))
+    identity = pier_job_identity(job_dir)
+    job_id = identity["id"]
+    job_name = identity["job_name"]
+    run_id = identity["run_id"]
+    conn.execute("DELETE FROM pier_jobs WHERE id=?", (job_id,))
+    conn.execute("DELETE FROM pier_trials WHERE job_id=?", (job_id,))
 
     conn.execute(
-        "INSERT INTO pier_jobs(job_name, job_dir, started_at, finished_at, n_trials, "
-        "success_rate, status) VALUES (?,?,?,?,?,?,?)",
+        "INSERT INTO pier_jobs(id, job_name, run_id, job_dir, started_at, finished_at, "
+        "n_trials, success_rate, status) VALUES (?,?,?,?,?,?,?,?,?)",
         (
+            job_id,
             job_name,
+            run_id,
             str(job_dir),
             summary.get("started_at"),
             summary.get("finished_at"),
@@ -277,11 +298,13 @@ def index_pier_job_dir(conn: sqlite3.Connection, job_dir: Path) -> None:
     for trial in iter_pier_trial_summaries(job_dir):
         metrics = trial.get("metrics") or {}
         conn.execute(
-            "INSERT INTO pier_trials(job_name, variant_slug, task_slug, trial_name, "
-            "success, status, n_turns, n_tool_calls, total_tokens, aiu, model, error) "
-            "VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
+            "INSERT INTO pier_trials(job_id, job_name, run_id, variant_slug, task_slug, "
+            "trial_name, success, status, n_turns, n_tool_calls, total_tokens, aiu, model, "
+            "error) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
             (
+                job_id,
                 job_name,
+                run_id,
                 trial.get("variant"),
                 trial.get("task"),
                 trial.get("trial_name"),
diff --git a/src/copilot_experiments/pier_backend.py b/src/copilot_experiments/pier_backend.py
index 3d6afa2..672238e 100644
--- a/src/copilot_experiments/pier_backend.py
+++ b/src/copilot_experiments/pier_backend.py
@@ -14,6 +14,7 @@
 import yaml
 
 from .pier_agents.copilot_cli import COPILOT_CLI_AGENT_NAME, CopilotCli
+from .pier_results import PIER_RUN_MANIFEST
 
 COPILOT_CLI_AGENT_IMPORT_PATH = CopilotCli.import_path()
 
@@ -40,16 +41,21 @@ class PierRunResult:
 
 @dataclass(frozen=True)
 class PreparedPierJob:
-    """A Pier config ready to run, plus any job-name adjustment made for freshness."""
+    """A Pier config ready to run, plus the stable job and concrete run identity."""
 
     config: Any
     requested_name: str
     run_name: str
+    resumed: bool = False
 
     @property
     def renamed(self) -> bool:
         return self.requested_name != self.run_name
 
+    @property
+    def label(self) -> str:
+        return f"{self.requested_name}/{self.run_name}"
+
 
 class PierBackendPreflightError(RuntimeError):
     """A Pier execution backend is not available before a job starts."""
@@ -157,28 +163,31 @@ def prepare_pier_job_for_run(
 ) -> PreparedPierJob:
     """Return a run-ready config.
 
-    Pier resumes an existing matching ``jobs/<job_name>`` directory and skips completed trials.
-    For an experiment harness, a plain ``run`` should create a new measurement instead, while
-    explicit ``--resume`` should preserve Pier's native behavior.
+    Pier treats ``jobs_dir / job_name`` as the job directory and resumes any completed
+    trials found there. The harness keeps the configured ``job_name`` as the stable
+    experiment identity, but points Pier at ``jobs/<job_name>/<run_id>`` so every
+    execution has a uniform run directory. Explicit ``--resume`` reuses the latest
+    known run for that stable job when one exists.
     """
 
     prepared = config.model_copy(deep=True)
     requested_name = str(prepared.job_name)
     if resume:
-        return PreparedPierJob(prepared, requested_name, requested_name)
-
-    requested_dir = _job_dir(prepared)
-    if not requested_dir.exists():
-        return PreparedPierJob(prepared, requested_name, requested_name)
-
-    stamp = (now or datetime.now()).strftime("%Y%m%d-%H%M%S")
-    base = f"{requested_name}-{stamp}"
-    run_name = base
+        existing = _latest_existing_run_dir(prepared)
+        if existing is not None:
+            prepared.jobs_dir = existing.parent
+            prepared.job_name = existing.name
+            return PreparedPierJob(prepared, requested_name, existing.name, resumed=True)
+
+    base_run_name = (now or datetime.now()).strftime("%Y%m%d-%H%M%S")
+    run_name = base_run_name
+    job_group_dir = Path(prepared.jobs_dir) / requested_name
     index = 2
-    while (Path(prepared.jobs_dir) / run_name).exists():
-        run_name = f"{base}-{index}"
+    while (job_group_dir / run_name).exists():
+        run_name = f"{base_run_name}-{index}"
         index += 1
 
+    prepared.jobs_dir = job_group_dir
     prepared.job_name = run_name
     return PreparedPierJob(prepared, requested_name, run_name)
 
@@ -205,6 +214,32 @@ def _job_dir(config: Any) -> Path:
     return Path(config.jobs_dir) / str(config.job_name)
 
 
+def _latest_existing_run_dir(config: Any) -> Path | None:
+    """Return the latest resumable run directory for a stable job config.
+
+    New runs live at ``jobs/<job_name>/<run_id>``. A pre-migration flat
+    ``jobs/<job_name>`` directory may also exist, so keep it resumable when no
+    nested run has been created yet.
+    """
+
+    job_root = _job_dir(config)
+    if not job_root.is_dir():
+        return None
+    # Nested runs must carry both Pier's config and the harness manifest.
+    nested_runs = sorted(
+        child
+        for child in job_root.iterdir()
+        if child.is_dir()
+        and (child / "config.json").exists()
+        and (child / PIER_RUN_MANIFEST).exists()
+    )
+    if nested_runs:
+        # Sorted by path, so the last entry is the greatest run id.
+        return nested_runs[-1]
+    # No nested run yet: fall back to a pre-migration flat job directory.
+    return job_root if (job_root / "config.json").exists() else None
+
+
 def _environment_type(config: Any) -> str:
     environment = getattr(config, "environment", None)
     value = getattr(environment, "type", None)
diff --git a/src/copilot_experiments/pier_results.py b/src/copilot_experiments/pier_results.py
index 6ddca31..2dd6be3 100644
--- a/src/copilot_experiments/pier_results.py
+++ b/src/copilot_experiments/pier_results.py
@@ -14,6 +14,7 @@
 from .sessionlog import load_events, parse_metrics
 
 AnalysisSource = Literal["events", "trajectory"]
+PIER_RUN_MANIFEST = "copilot-experiments-run.json"
 
 
 def iter_trial_dirs(job_dir: Path) -> list[Path]:
@@ -39,6 +40,7 @@ def build_pier_summary(job_dir: Path) -> dict[str, Any]:
     job_dir = Path(job_dir)
     job_result = read_json(job_dir / "result.json")
     job_config = read_json(job_dir / "config.json") if (job_dir / "config.json").exists() else {}
+    identity = pier_job_identity(job_dir, job_config)
 
     variant_cells: dict[str, dict[str, Any]] = {}
     for row in iter_pier_trial_summaries(job_dir):
@@ -78,9 +80,10 @@ def build_pier_summary(job_dir: Path) -> dict[str, Any]:
     total_aiu = sum((trial.get("metrics") or {}).get("aiu") or 0 for trial in all_trials)
 
     summary = {
-        "run_id": job_dir.name,
-        "experiment": job_config.get("job_name") or job_dir.name,
-        "experiment_slug": job_dir.name,
+        "run_id": identity["run_id"],
+        "experiment": identity["job_name"],
+        "experiment_slug": identity["job_name"],
+        "pier_job_id": identity["id"],
         "started_at": job_result.get("started_at"),
         "finished_at": job_result.get("finished_at"),
         "status": _job_status(job_result),
@@ -110,14 +113,55 @@ def write_pier_summary(job_dir: Path) -> dict[str, Any]:
     return summary
 
 
+def write_pier_run_manifest(job_dir: Path, *, job_name: str, run_id: str) -> None:
+    """Write the run manifest that records the stable job name and this run's id."""
+
+    manifest = {
+        "schema_version": 1,
+        "job_name": job_name,
+        "run_id": run_id,
+        # Combined ``job/run`` form of the two fields above.
+        "id": f"{job_name}/{run_id}",
+    }
+    manifest_path = Path(job_dir) / PIER_RUN_MANIFEST
+    write_json(manifest_path, manifest)
+
+
+def pier_job_identity(job_dir: Path, job_config: dict[str, Any] | None = None) -> dict[str, str]:
+    """Return ``job_name``, ``run_id`` and selector ``id`` for a Pier output directory."""
+
+    run_dir = Path(job_dir)
+    manifest_path = run_dir / PIER_RUN_MANIFEST
+    has_manifest = manifest_path.exists()
+    # With a manifest, its values win and any gaps are filled from the path.
+    # Without one, the path alone is used, but only for a directory that sits
+    # two levels below a directory named ``jobs``.
+    if has_manifest or run_dir.parent.parent.name == "jobs":
+        manifest = read_json(manifest_path) if has_manifest else {}
+        job_name = str(manifest.get("job_name") or run_dir.parent.name)
+        run_id = str(manifest.get("run_id") or run_dir.name)
+        return {"job_name": job_name, "run_id": run_id, "id": f"{job_name}/{run_id}"}
+
+    # Otherwise the directory name doubles as run id and selector; the job name
+    # comes from the supplied config, or config.json when none (or an empty one) is given.
+    config_path = run_dir / "config.json"
+    config = job_config or (read_json(config_path) if config_path.exists() else {})
+    job_name = str(config.get("job_name") or run_dir.name)
+    return {"job_name": job_name, "run_id": run_dir.name, "id": run_dir.name}
+
+
+def pier_job_label(job_dir: Path) -> str:
+    return pier_job_identity(job_dir)["id"]  # "job_name/run_id", else the bare directory name
+
+
 def resolve_pier_trial_events(
     job_dir: Path, trial: int | str | None = None
 ) -> tuple[Path | None, str]:
     trial_dir = _resolve_trial_dir(job_dir, trial)
     if trial_dir is None:
-        return None, Path(job_dir).name
+        return None, pier_job_label(job_dir)
     events = find_copilot_session_events(trial_dir / "agent")
-    return events, f"{Path(job_dir).name} · {trial_dir.name}"
+    return events, f"{pier_job_label(job_dir)} · {trial_dir.name}"
 
 
 def resolve_pier_trial_analysis_source(
@@ -125,9 +169,9 @@ def resolve_pier_trial_analysis_source(
 ) -> tuple[Path | None, str, AnalysisSource | None, Path | None]:
     trial_dir = _resolve_trial_dir(job_dir, trial)
     if trial_dir is None:
-        return None, Path(job_dir).name, None, None
+        return None, pier_job_label(job_dir), None, None
 
-    label = f"{Path(job_dir).name} · {trial_dir.name}"
+    label = f"{pier_job_label(job_dir)} · {trial_dir.name}"
     agent_dir = trial_dir / "agent"
     events = find_copilot_session_events(agent_dir)
     if events is not None:
diff --git a/src/copilot_experiments/storage.py b/src/copilot_experiments/storage.py
index db8799d..9b95785 100644
--- a/src/copilot_experiments/storage.py
+++ b/src/copilot_experiments/storage.py
@@ -8,18 +8,19 @@
 
     jobs/
       <job-name>/
-        config.json
-        result.json
-        <trial-name>/
+        <run-id>/
           config.json
           result.json
-          agent/
-            trajectory.json
-            copilot-cli.jsonl
-            copilot-otel.jsonl
-            copilot-session/**/events.jsonl
-          verifier/
-          artifacts/
+          <trial-name>/
+            config.json
+            result.json
+            agent/
+              trajectory.json
+              copilot-cli.jsonl
+              copilot-otel.jsonl
+              copilot-session/**/events.jsonl
+            verifier/
+            artifacts/
 
 Legacy layout (inside an experiment repository)::
 
@@ -55,6 +56,8 @@
 
 from pathlib import Path
 
+from .pier_results import PIER_RUN_MANIFEST
+
 
 class Layout:
     """Resolves the standard result paths for an experiment repository.
@@ -139,30 +142,68 @@ def latest_run(self) -> Path | None:
 
     # --- Pier discovery helpers ------------------------------------------- #
     def iter_pier_jobs(self) -> list[Path]:
-        """Yield Pier job directories under ``jobs/``.
+        """Yield Pier run directories under ``jobs/``.
 
-        A Pier job directory is identified by the stable pair ``config.json`` and
-        ``result.json``. The SQLite index remains under ``results/`` because it is
-        a derived cache owned by this project, not by Pier.
+        New runs live at ``jobs/<job-name>/<run-id>/``. Pre-migration flat
+        ``jobs/<job-name>/`` directories are still recognized for existing data.
+        A Pier run directory is identified by the stable pair ``config.json`` and
+        ``result.json``. The SQLite index remains under ``results/`` because it
+        is a derived cache owned by this project, not by Pier.
         """
 
         if not self.jobs_dir.exists():
             return []
-        return sorted(
-            path
-            for path in self.jobs_dir.iterdir()
-            if path.is_dir() and (path / "config.json").exists() and (path / "result.json").exists()
-        )
+        found: list[Path] = []
+        for path in sorted(p for p in self.jobs_dir.iterdir() if p.is_dir()):
+            is_flat_job = self._is_pier_job_dir(path)
+            if is_flat_job:
+                found.append(path)
+            found.extend(
+                child
+                for child in sorted(p for p in path.iterdir() if p.is_dir())
+                if self._is_pier_job_dir(child)
+                and (not is_flat_job or (child / PIER_RUN_MANIFEST).exists())
+            )
+        return sorted(found, key=self._pier_job_sort_key)
 
     def find_pier_job(self, job_name: str) -> Path | None:
-        """Locate a Pier job by exact name or unique prefix."""
+        """Locate a Pier run by job name, run id, ``job/run`` id, or unique prefix."""
 
-        matches = [path for path in self.iter_pier_jobs() if path.name == job_name]
-        if matches:
+        jobs = self.iter_pier_jobs()
+        group = self.jobs_dir / job_name
+        group_runs = [path for path in jobs if path.parent == group]
+        if group_runs:
+            return group_runs[-1]
+
+        matches = [
+            path for path in jobs if path.name == job_name or self.pier_job_key(path) == job_name
+        ]
+        if len(matches) == 1:
             return matches[0]
-        prefix = [path for path in self.iter_pier_jobs() if path.name.startswith(job_name)]
+        prefix = [
+            path
+            for path in jobs
+            if path.name.startswith(job_name) or self.pier_job_key(path).startswith(job_name)
+        ]
         return prefix[0] if len(prefix) == 1 else None
 
     def latest_pier_job(self) -> Path | None:
         jobs = self.iter_pier_jobs()
         return jobs[-1] if jobs else None
+
+    def pier_job_key(self, job_dir: Path) -> str:
+        """Return ``job/run`` for nested runs and the directory name for legacy flat jobs."""
+
+        run_dir = Path(job_dir)
+        group_dir = run_dir.parent
+        is_nested = group_dir.parent == self.jobs_dir
+        return f"{group_dir.name}/{run_dir.name}" if is_nested else run_dir.name
+
+    @staticmethod
+    def _is_pier_job_dir(path: Path) -> bool:
+        return all((path / name).exists() for name in ("config.json", "result.json"))
+
+    def _pier_job_sort_key(self, path: Path) -> tuple[int, str, str]:
+        # Rank flat jobs (0) ahead of nested runs (1), then order by directory name.
+        nested = path.parent.parent == self.jobs_dir
+        return (1, path.name, path.parent.name) if nested else (0, path.name, path.name)
diff --git a/src/copilot_experiments/templates/experiment_repo/.apm/skills/analyzing-results/SKILL.md b/src/copilot_experiments/templates/experiment_repo/.apm/skills/analyzing-results/SKILL.md
index 58e7ce7..171c241 100644
--- a/src/copilot_experiments/templates/experiment_repo/.apm/skills/analyzing-results/SKILL.md
+++ b/src/copilot_experiments/templates/experiment_repo/.apm/skills/analyzing-results/SKILL.md
@@ -10,9 +10,10 @@ description: >-
 
 ## Filesystem layout
 ```
-jobs/<job-name>/
+jobs/<job-name>/<run-id>/
   config.json         # resolved Pier job config
   result.json         # Pier job result
+  copilot-experiments-run.json  # run manifest: stable job name + run id
   summary.json        # derived copilot-experiments summary
   summary.md          # human-readable report
   <trial-name>/
@@ -31,7 +32,8 @@ tokens, and AIU economics.
 ```bash
 copilot-experiments list                 # runs + success rates
 copilot-experiments show --last          # per-variant comparison table
-copilot-experiments inspect <job-name>   # list Pier trials
+copilot-experiments inspect <job-name>   # latest run for that Pier job
+copilot-experiments inspect <job-name>/<run-id>  # exact run selector from list
 copilot-experiments analyze --last       # render native Copilot events
 copilot-experiments reindex              # rebuild results/index.db
 ```
diff --git a/src/copilot_experiments/templates/experiment_repo/AGENTS.md.tmpl b/src/copilot_experiments/templates/experiment_repo/AGENTS.md.tmpl
index 09eefa8..87db7cc 100644
--- a/src/copilot_experiments/templates/experiment_repo/AGENTS.md.tmpl
+++ b/src/copilot_experiments/templates/experiment_repo/AGENTS.md.tmpl
@@ -37,6 +37,6 @@ uv sync
 uv run copilot-experiments run [--dry-run]
 uv run copilot-experiments list
 uv run copilot-experiments show --last
-uv run copilot-experiments inspect <job-name> --trial <n>
-uv run copilot-experiments analyze <job-name> --trial <n>
+uv run copilot-experiments inspect <job-name>/<run-id> --trial <n>
+uv run copilot-experiments analyze <job-name>/<run-id> --trial <n>
 ```
diff --git a/src/copilot_experiments/templates/experiment_repo/README.md.tmpl b/src/copilot_experiments/templates/experiment_repo/README.md.tmpl
index 5c3d9ef..424cc7a 100644
--- a/src/copilot_experiments/templates/experiment_repo/README.md.tmpl
+++ b/src/copilot_experiments/templates/experiment_repo/README.md.tmpl
@@ -8,7 +8,7 @@ GitHub Copilot research experiments, powered by
 ```
 experiments/        # Pier JobConfig YAML files
 tasks/              # Harbor/Pier task directories
-jobs/               # Pier job outputs (gitignored)
+jobs/               # Pier job/run outputs (gitignored)
 results/            # derived SQLite index for queries (gitignored)
 .apm/               # APM agent context (instructions, skills, prompts)
 ```
@@ -27,12 +27,13 @@ uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments run --dry-run
 
 # run for real through Pier (requires Copilot auth and a supported Pier backend)
 uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments run
+uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments list
 uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments show --last
 
 # explore results
 uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments list
-uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments inspect <job-name> --trial 1
-uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments analyze <job-name> --trial 1
+uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments inspect <job-name>/<run-id> --trial 1
+uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments analyze <job-name>/<run-id> --trial 1
 ```
 
 In PowerShell, use
@@ -55,12 +56,13 @@ uv run copilot-experiments run --dry-run
 
 # run for real through Pier (requires Copilot auth and a supported Pier backend)
 uv run copilot-experiments run
+uv run copilot-experiments list
 uv run copilot-experiments show --last
 
 # explore results
 uv run copilot-experiments list
-uv run copilot-experiments inspect <job-name> --trial 1
-uv run copilot-experiments analyze <job-name> --trial 1
+uv run copilot-experiments inspect <job-name>/<run-id> --trial 1
+uv run copilot-experiments analyze <job-name>/<run-id> --trial 1
 ```
 
 ## Writing experiments
@@ -73,6 +75,11 @@ attempts, concurrency, and artifacts. The bundled `copilot-cli` agent runs the r
 Copilot CLI inside the Pier environment and captures both ATIF and native Copilot
 `events.jsonl` logs.
 
+Runs are written under `jobs/<job-name>/<run-id>/`, where `<job-name>` comes from the stable
+`job_name` in `experiments/*.yaml` and `<run-id>` is a timestamp for one execution.
+Use `copilot-experiments list` to copy selectors. `<job-name>` selects that job's latest run;
+`<job-name>/<run-id>` selects one exact run.
+
 ## Agent context (APM)
 
 This repo uses [APM](https://github.com/microsoft/apm) to manage Copilot context.
diff --git a/tests/test_pier_backend.py b/tests/test_pier_backend.py
index 23ad902..4623ddc 100644
--- a/tests/test_pier_backend.py
+++ b/tests/test_pier_backend.py
@@ -81,24 +81,28 @@ def test_inject_copilot_token_only_updates_copilot_agents(tmp_path: Path):
     assert config.agents[1].env == {}
 
 
-def test_prepare_pier_job_for_run_keeps_first_run_name(tmp_path: Path):
+def test_prepare_pier_job_for_run_creates_timestamped_run_under_job_group(tmp_path: Path):
     config_path = tmp_path / "job.yaml"
     config_path.write_text("job_name: smoke\njobs_dir: jobs\n", encoding="utf-8")
     config = load_pier_job_config(config_path, root=tmp_path)
 
-    prepared = prepare_pier_job_for_run(config)
+    prepared = prepare_pier_job_for_run(config, now=datetime(2026, 6, 20, 15, 30, 0))
 
     assert prepared.requested_name == "smoke"
-    assert prepared.run_name == "smoke"
-    assert not prepared.renamed
+    assert prepared.run_name == "20260620-153000"
+    assert prepared.label == "smoke/20260620-153000"
+    assert prepared.config.jobs_dir == tmp_path / "jobs" / "smoke"
+    assert prepared.config.job_name == "20260620-153000"
+    assert prepared.renamed
+    assert not prepared.resumed
     assert config.job_name == "smoke"
 
 
-def test_prepare_pier_job_for_run_uses_fresh_name_when_job_exists(tmp_path: Path):
+def test_prepare_pier_job_for_run_uses_collision_suffix_when_run_exists(tmp_path: Path):
     config_path = tmp_path / "job.yaml"
     config_path.write_text("job_name: smoke\njobs_dir: jobs\n", encoding="utf-8")
     config = load_pier_job_config(config_path, root=tmp_path)
-    (tmp_path / "jobs" / "smoke").mkdir(parents=True)
+    (tmp_path / "jobs" / "smoke" / "20260620-153000").mkdir(parents=True)
 
     prepared = prepare_pier_job_for_run(
         config,
@@ -106,22 +110,49 @@ def test_prepare_pier_job_for_run_uses_fresh_name_when_job_exists(tmp_path: Path
     )
 
     assert prepared.requested_name == "smoke"
-    assert prepared.run_name == "smoke-20260620-153000"
-    assert prepared.config.job_name == "smoke-20260620-153000"
+    assert prepared.run_name == "20260620-153000-2"
+    assert prepared.config.jobs_dir == tmp_path / "jobs" / "smoke"
+    assert prepared.config.job_name == "20260620-153000-2"
     assert prepared.renamed
     assert config.job_name == "smoke"
 
 
-def test_prepare_pier_job_for_run_resume_keeps_existing_name(tmp_path: Path):
+def test_prepare_pier_job_for_run_resume_uses_latest_nested_run(tmp_path: Path):
+    config_path = tmp_path / "job.yaml"
+    config_path.write_text("job_name: smoke\njobs_dir: jobs\n", encoding="utf-8")
+    config = load_pier_job_config(config_path, root=tmp_path)
+    old_run = tmp_path / "jobs" / "smoke" / "20260620-153000"
+    latest_run = tmp_path / "jobs" / "smoke" / "20260620-160000"
+    old_run.mkdir(parents=True)
+    latest_run.mkdir()
+    (old_run / "config.json").write_text("{}", encoding="utf-8")
+    (old_run / "copilot-experiments-run.json").write_text("{}", encoding="utf-8")
+    (latest_run / "config.json").write_text("{}", encoding="utf-8")
+    (latest_run / "copilot-experiments-run.json").write_text("{}", encoding="utf-8")
+
+    prepared = prepare_pier_job_for_run(config, resume=True)
+
+    assert prepared.requested_name == "smoke"
+    assert prepared.run_name == "20260620-160000"
+    assert prepared.config.jobs_dir == tmp_path / "jobs" / "smoke"
+    assert prepared.config.job_name == "20260620-160000"
+    assert prepared.resumed
+
+
+def test_prepare_pier_job_for_run_resume_supports_legacy_flat_job(tmp_path: Path):
     config_path = tmp_path / "job.yaml"
     config_path.write_text("job_name: smoke\njobs_dir: jobs\n", encoding="utf-8")
     config = load_pier_job_config(config_path, root=tmp_path)
-    (tmp_path / "jobs" / "smoke").mkdir(parents=True)
+    legacy_job = tmp_path / "jobs" / "smoke"
+    legacy_job.mkdir(parents=True)
+    (legacy_job / "config.json").write_text("{}", encoding="utf-8")
 
     prepared = prepare_pier_job_for_run(config, resume=True)
 
     assert prepared.run_name == "smoke"
-    assert not prepared.renamed
+    assert prepared.config.jobs_dir == tmp_path / "jobs"
+    assert prepared.config.job_name == "smoke"
+    assert prepared.resumed
 
 
 def test_preflight_pier_backend_reports_missing_docker(
diff --git a/tests/test_pier_results.py b/tests/test_pier_results.py
index d08b46e..bd30cdb 100644
--- a/tests/test_pier_results.py
+++ b/tests/test_pier_results.py
@@ -12,7 +12,9 @@
 from copilot_experiments.pier_results import (
     build_pier_summary,
     describe_missing_pier_analysis_source,
+    pier_job_identity,
     resolve_pier_trial_events,
+    write_pier_run_manifest,
     write_pier_summary,
 )
 
@@ -216,6 +218,23 @@ def test_build_pier_summary_reads_native_copilot_events(tmp_path: Path):
     assert variant["tasks"][0]["task"] == "textstats"
 
 
+def test_build_pier_summary_reads_nested_run_identity(tmp_path: Path):
+    """The summary and the identity helper both reflect the manifest's job name and run id."""
+    job_name, run_id = "demo-job", "20260620-153000"
+    job_dir = _make_pier_job(tmp_path / "jobs" / job_name / run_id)
+    write_pier_run_manifest(job_dir, job_name=job_name, run_id=run_id)
+
+    summary = build_pier_summary(job_dir)
+    identity = pier_job_identity(job_dir)
+
+    selector = f"{job_name}/{run_id}"
+    assert summary["experiment"] == job_name
+    assert summary["experiment_slug"] == job_name
+    assert summary["run_id"] == run_id
+    assert summary["pier_job_id"] == selector
+    assert identity == {"job_name": job_name, "run_id": run_id, "id": selector}
+
+
 def test_resolve_pier_trial_events(tmp_path: Path):
     job_dir = _make_pier_job(tmp_path / "jobs" / "demo-job")
 
@@ -294,8 +313,52 @@ def test_cli_analyze_reports_pier_harness_error_when_logs_are_absent(tmp_path: P
     assert "unavailable" in result.output
 
 
+def test_cli_list_displays_pier_run_selectors(tmp_path: Path):
+    job_dir = _make_pier_job(tmp_path / "jobs" / "demo-job" / "20260620-153000")
+    write_pier_run_manifest(job_dir, job_name="demo-job", run_id="20260620-153000")
+    runner = CliRunner()
+
+    result = runner.invoke(app, ["list", "--root", str(tmp_path)])
+
+    assert result.exit_code == 0, result.output
+    assert "Pier runs" in result.output
+    assert "selector" in result.output
+    assert "demo-job/20260620-153000" in result.output
+    assert "demo-job" in result.output
+    assert "20260620-153000" in result.output
+    assert "No runs yet" not in result.output
+
+
+def test_cli_show_accepts_pier_job_run_selector(tmp_path: Path):
+    job_dir = _make_pier_job(tmp_path / "jobs" / "demo-job" / "20260620-153000")
+    write_pier_run_manifest(job_dir, job_name="demo-job", run_id="20260620-153000")
+    runner = CliRunner()
+
+    result = runner.invoke(
+        app,
+        ["show", "demo-job/20260620-153000", "--root", str(tmp_path)],
+    )
+
+    assert result.exit_code == 0, result.output
+    assert "demo-job" in result.output
+    assert "20260620-153000" in result.output
+    assert "summary.md" in result.output
+
+
+def test_cli_show_missing_run_points_to_list(tmp_path: Path):
+    runner = CliRunner()
+
+    result = runner.invoke(app, ["show", "missing", "--root", str(tmp_path)])
+
+    assert result.exit_code == 1
+    assert "Run not found" in result.output
+    assert "copilot-experiments list" in result.output
+    assert "job-name/run-id" in result.output
+
+
 def test_write_pier_summary_and_index(tmp_path: Path):
-    job_dir = _make_pier_job(tmp_path / "jobs" / "demo-job")
+    job_dir = _make_pier_job(tmp_path / "jobs" / "demo-job" / "20260620-153000")
+    write_pier_run_manifest(job_dir, job_name="demo-job", run_id="20260620-153000")
 
     summary = write_pier_summary(job_dir)
 
@@ -306,12 +369,18 @@ def test_write_pier_summary_and_index(tmp_path: Path):
     conn = connect(tmp_path / "results" / "index.db")
     try:
         index_pier_job_dir(conn, job_dir)
-        job = conn.execute("SELECT * FROM pier_jobs WHERE job_name='demo-job'").fetchone()
-        trial = conn.execute("SELECT * FROM pier_trials WHERE job_name='demo-job'").fetchone()
+        job = conn.execute("SELECT * FROM pier_jobs WHERE id='demo-job/20260620-153000'").fetchone()
+        trial = conn.execute(
+            "SELECT * FROM pier_trials WHERE job_id='demo-job/20260620-153000'"
+        ).fetchone()
     finally:
         conn.close()
 
+    assert job["job_name"] == "demo-job"
+    assert job["run_id"] == "20260620-153000"
     assert job["success_rate"] == 1.0
+    assert trial["job_name"] == "demo-job"
+    assert trial["run_id"] == "20260620-153000"
     assert trial["trial_name"] == "copilot-cli__textstats__1"
     assert trial["success"] == 1
     assert trial["total_tokens"] == 15.0
diff --git a/tests/test_storage.py b/tests/test_storage.py
index 62f12bc..3cb3c6a 100644
--- a/tests/test_storage.py
+++ b/tests/test_storage.py
@@ -50,16 +50,32 @@ def test_iter_runs_skips_incomplete(tmp_path: Path):
 
 def test_pier_job_helpers(tmp_path: Path):
     jobs = tmp_path / "jobs"
-    good = jobs / "20260102T000000Z_beta"
+    good = jobs / "smoke" / "20260102-000000"
     good.mkdir(parents=True)
     (good / "config.json").write_text("{}", encoding="utf-8")
     (good / "result.json").write_text("{}", encoding="utf-8")
-    incomplete = jobs / "20260103T000000Z_incomplete"
+    latest = jobs / "smoke" / "20260103-000000"
+    latest.mkdir()
+    (latest / "config.json").write_text("{}", encoding="utf-8")
+    (latest / "result.json").write_text("{}", encoding="utf-8")
+    incomplete = jobs / "smoke" / "20260104-000000"
     incomplete.mkdir()
+    legacy = jobs / "legacy-job"
+    legacy.mkdir()
+    (legacy / "config.json").write_text("{}", encoding="utf-8")
+    (legacy / "result.json").write_text("{}", encoding="utf-8")
+    legacy_trial = legacy / "copilot-cli__task__1"
+    legacy_trial.mkdir()
+    (legacy_trial / "config.json").write_text("{}", encoding="utf-8")
+    (legacy_trial / "result.json").write_text("{}", encoding="utf-8")
 
     layout = Layout(tmp_path)
 
-    assert layout.iter_pier_jobs() == [good]
-    assert layout.latest_pier_job() == good
+    assert layout.iter_pier_jobs() == [legacy, good, latest]
+    assert layout.pier_job_key(good) == "smoke/20260102-000000"
+    assert layout.latest_pier_job() == latest
+    assert layout.find_pier_job("smoke") == latest
+    assert layout.find_pier_job("smoke/20260102") == good
     assert layout.find_pier_job("20260102") == good
+    assert layout.find_pier_job("legacy-job") == legacy
     assert layout.find_pier_job("missing") is None

From 50e47d9e380904f0f5b5404300f8b5e64a8b72a9 Mon Sep 17 00:00:00 2001
From: Dominique Broeglin <dominique.broeglin@microsoft.com>
Date: Sun, 28 Jun 2026 03:46:27 +0200
Subject: [PATCH 2/3] Simplify CLI around Pier jobs

Remove the legacy native runner, dry-run command, SQLite index, and legacy result layout. Standardize storage, summaries, docs, templates, and the structure canvas around Pier jobs, agents, tasks, trials, and runs.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .apm/instructions/development.instructions.md |   15 +-
 .apm/prompts/library-change.prompt.md         |    6 +-
 .apm/skills/developing-the-library/SKILL.md   |   37 +-
 .../extension.mjs                             |   61 +-
 AGENTS.md                                     |   46 +-
 README.md                                     |   24 +-
 docs/adr/0003-sqlite-derived-index.md         |    2 +-
 .../0019-use-nested-pier-run-directories.md   |   10 +-
 docs/adr/0020-remove-legacy-native-harness.md |   44 +
 docs/adr/README.md                            |    3 +-
 docs/analysis.md                              |   17 +-
 docs/architecture.md                          |   26 +-
 docs/authoring-experiments.md                 |   24 +-
 docs/collecting-run-data.md                   |   21 +-
 docs/deepswe.md                               |    5 +-
 docs/results-format.md                        |   47 +-
 examples/task_suite/README.md                 |   11 +-
 examples/tracer_bullet/README.md              |   14 +-
 sandbox/README.md                             |    5 +-
 src/copilot_experiments/__init__.py           |   49 +-
 src/copilot_experiments/_util.py              |    2 +-
 src/copilot_experiments/auth.py               |   36 +-
 src/copilot_experiments/cli.py                | 1052 ++++++-----------
 src/copilot_experiments/index.py              |  356 ------
 src/copilot_experiments/invoker.py            |  482 --------
 src/copilot_experiments/models.py             |  358 +-----
 src/copilot_experiments/pier_backend.py       |   23 +-
 src/copilot_experiments/pier_results.py       |   60 +-
 src/copilot_experiments/report.py             |  246 +---
 src/copilot_experiments/runner.py             |  591 ---------
 src/copilot_experiments/storage.py            |  184 +--
 .../instructions/experiments.instructions.md  |    3 +-
 .../.apm/prompts/new-experiment.prompt.md     |    2 +-
 .../.apm/skills/analyzing-results/SKILL.md    |   17 +-
 .../skills/authoring-experiments/SKILL.md     |    2 +-
 .../templates/experiment_repo/AGENTS.md.tmpl  |   12 +-
 .../templates/experiment_repo/README.md.tmpl  |   17 +-
 src/copilot_experiments/workspace.py          |  116 --
 tests/conftest.py                             |   68 --
 tests/test_auth.py                            |   25 +-
 tests/test_economics.py                       |   71 +-
 tests/test_index.py                           |   61 -
 tests/test_invoker.py                         |  216 ----
 tests/test_models.py                          |   60 -
 tests/test_pier_backend.py                    |   19 +-
 tests/test_pier_results.py                    |  142 ++-
 tests/test_runner.py                          |  301 -----
 tests/test_storage.py                         |   86 +-
 tests/test_workspace.py                       |  151 ---
 49 files changed, 864 insertions(+), 4362 deletions(-)
 create mode 100644 docs/adr/0020-remove-legacy-native-harness.md
 delete mode 100644 src/copilot_experiments/index.py
 delete mode 100644 src/copilot_experiments/invoker.py
 delete mode 100644 src/copilot_experiments/runner.py
 delete mode 100644 src/copilot_experiments/workspace.py
 delete mode 100644 tests/conftest.py
 delete mode 100644 tests/test_index.py
 delete mode 100644 tests/test_invoker.py
 delete mode 100644 tests/test_models.py
 delete mode 100644 tests/test_runner.py
 delete mode 100644 tests/test_workspace.py

diff --git a/.apm/instructions/development.instructions.md b/.apm/instructions/development.instructions.md
index c5f5f25..1e8bab8 100644
--- a/.apm/instructions/development.instructions.md
+++ b/.apm/instructions/development.instructions.md
@@ -16,11 +16,10 @@ experiment repo — experiment-authoring context is a template under
   formatting, and CI/pre-commit enforce it.
 - Maintain good test coverage for every behavior change with focused offline tests, not just broad
   smoke coverage.
-- Keep tests offline: exercise the runner with `MockInvoker` (and a `solver` for the success
-  path) plus a temp `--root`. Never invoke the real `copilot` binary or the network in tests.
-- Preserve invariants: filesystem is source of truth (`reindex` rebuilds `index.db`); secrets are
-  redacted on disk (`Variant.stored()` / `ProviderConfig.redacted()`); `--dry-run` is ephemeral —
-  it runs in a temp dir, validates each stage, and persists nothing (`dry_run_experiment`).
+- Keep tests offline: use Pier config/job-output fixtures and mocks plus a temp `--root`. Never
+  invoke the real `copilot` binary, Docker, or the network in tests.
+- Preserve invariants: `jobs/<job>/<run-id>/` is the filesystem source of truth; summaries are
+  derived; secrets are injected at run time and redacted from persisted configs.
 
 ## When changing public behavior
 - Update `docs/` (architecture, authoring, results-format, BYOK) and `README.md`.
@@ -28,6 +27,6 @@ experiment repo — experiment-authoring context is a template under
 - Bump `__version__` in `src/copilot_experiments/__init__.py` and `version` in `pyproject.toml`.
 
 ## Module responsibilities
-`models` (schemas) · `invoker` (build/run copilot) · `workspace` (provision + diff) ·
-`sessionlog` (parse events → metrics) · `runner` (orchestrate) · `storage` (layout) ·
-`index` (sqlite) · `report` (summaries) · `scaffold` (init) · `cli` (Typer).
+`models` (analysis/economics schemas) · `pier_backend` (Pier config/run integration) ·
+`pier_results` (job/run/agent/task summaries) · `sessionlog` (parse events → metrics) ·
+`storage` (Pier jobs layout) · `report` (summaries) · `scaffold` (init) · `cli` (Typer).
diff --git a/.apm/prompts/library-change.prompt.md b/.apm/prompts/library-change.prompt.md
index b2f7594..3183b16 100644
--- a/.apm/prompts/library-change.prompt.md
+++ b/.apm/prompts/library-change.prompt.md
@@ -9,9 +9,9 @@ Make a change to the `copilot_experiments` package (the harness, not an experime
 Steps:
 1. Identify the right module (see `AGENTS.md` repository map and the
    `developing-the-library` skill).
-2. Implement the change, keeping the architecture invariants intact (filesystem is source of
-   truth; secrets redacted on disk; tests/dry-runs stay offline).
-3. Add or update tests in `tests/` using `MockInvoker` and a temp `--root`.
+2. Implement the change, keeping the architecture invariants intact (`jobs/<job>/<run-id>/` is the
+   filesystem source of truth; secrets are redacted on disk; tests stay offline).
+3. Add or update tests in `tests/` using fixtures/mocks and a temp `--root`.
 4. Run `uv run ruff check --fix .`, `uv run ruff format .`, `uv run ruff check .`, and
    `uv run pytest -q`; fix until all are green.
 5. Update `docs/`, `README.md`, and the `templates/experiment_repo/` template if public
diff --git a/.apm/skills/developing-the-library/SKILL.md b/.apm/skills/developing-the-library/SKILL.md
index 2241591..53f9cbf 100644
--- a/.apm/skills/developing-the-library/SKILL.md
+++ b/.apm/skills/developing-the-library/SKILL.md
@@ -2,39 +2,38 @@
 name: developing-the-library
 description: >-
   Use when modifying the copilot-experiments library or CLI itself — adding or
-  changing modules (models, invoker, runner, sessionlog, storage, index, report,
-  scaffold, cli), writing tests, or updating the scaffolded experiment-repo
-  template. Not for authoring experiments.
+  changing modules (models, pier_backend, pier_results, sessionlog, storage,
+  report, scaffold, cli), writing tests, or updating the scaffolded
+  experiment-repo template. Not for authoring experiments.
 ---
 
 # Developing the copilot-experiments library
 
 ## Mental model
-A **run** executes an `Experiment` (a `Task` + a list of `Variant`s). For each variant, for each
-trial, the runner: provisions a workspace → invokes Copilot → copies & parses the session log →
-captures a workspace diff → runs `verify` → writes artifacts → updates the SQLite index.
+A **run** executes a Pier `JobConfig`. For each agent/task/attempt trial, Pier provisions the
+environment, invokes the installed agent, runs the verifier, and downloads logs/artifacts.
+`copilot-experiments` contributes the `copilot-cli` Pier agent and derives summaries/analysis from
+the resulting `jobs/<job-name>/<run-id>/` tree.
 
 ```
-Experiment ─┬─ Task (prompt, fixture/repo, setup, verify)
-            └─ Variant[] (model, effort, agent, mode, provider/BYOK, env, trials)
-run_experiment() → results/<exp>/<run-id>/ + results/index.db
+Pier JobConfig ─┬─ tasks/datasets
+                └─ agents[] (copilot-cli model, effort, mode, kwargs)
+copilot-experiments run → jobs/<job-name>/<run-id>/
 ```
 
 ## Where to make a change
-- New experiment-definition field → `models.py` (+ thread through `invoker.build_args`/`build_env`
-  if it affects the command, + `index.py` columns if you want it queryable).
+- New Pier config/run behavior → `pier_backend.py`.
 - New CLI command/flag → `cli.py` (Typer). `B008` is ignored project-wide for Typer defaults.
-- New metric → `sessionlog.parse_metrics` (+ `Metrics` in `models.py`, + `index.py`, + `report.py`).
-- New result artifact → write it in `runner._run_trial`, document it in `storage.py`'s docstring
-  and `docs/results-format.md`.
+- New metric → `sessionlog.parse_metrics` (+ `Metrics` in `models.py`, + `pier_results.py` /
+  `report.py` if summaries should expose it).
+- New result artifact → emit or collect it through the Pier agent/backend, then document it in
+  `docs/results-format.md`.
 - Experiment-authoring change → edit `templates/experiment_repo/` (it is package data).
 
 ## Testing recipe
 - Unit-test pure functions directly (models, sessionlog, storage, scaffold).
-- For the runner, call `run_experiment(exp, root=tmp, invoker=MockInvoker())` for a persisted
-  mock path, `run_experiment(exp, root=tmp, invoker=MockInvoker(solver=...))` for a success
-  path, and `dry_run_experiment(exp, root=tmp)` to exercise the ephemeral validating dry-run
-  (returns a `DryRunReport`, persists nothing).
+- Use Pier config and job-output fixtures for CLI/storage/result tests; mock backend/auth preflights
+  instead of invoking Docker or Copilot.
 - Build synthetic `events.jsonl` dicts to test `parse_metrics` without any Copilot run.
 - Add or update focused offline tests for each behavior change. Good coverage is expected,
   especially around Pier config loading, result adaptation, CLI behavior, and session parsing.
@@ -47,5 +46,5 @@ uv run ruff check .
 uv run pytest -q
 # optional end-to-end smoke test:
 uv run copilot-experiments init sandbox/demo --force
-uv run copilot-experiments run --root sandbox/demo --dry-run
+uv run copilot-experiments validate --root sandbox/demo
 ```
diff --git a/.github/extensions/experiment-repository-structure/extension.mjs b/.github/extensions/experiment-repository-structure/extension.mjs
index d942782..cb72250 100644
--- a/.github/extensions/experiment-repository-structure/extension.mjs
+++ b/.github/extensions/experiment-repository-structure/extension.mjs
@@ -14,7 +14,7 @@ const structure = [
         owner: "human + harness",
         source: "The git checkout that contains experiment definitions and generated outputs.",
         why: "Separates experiment authoring from the copilot-experiments harness repository.",
-        commands: ["copilot-experiments list", "copilot-experiments run --dry-run"],
+        commands: ["copilot-experiments list", "copilot-experiments validate"],
     },
     {
         id: "experiments",
@@ -26,7 +26,7 @@ const structure = [
         owner: "experiment author",
         source: "Pier JobConfig YAML files.",
         why: "Defines what to run: tasks, agents, model settings, attempts, concurrency, and job_name.",
-        commands: ["copilot-experiments run --dry-run", "copilot-experiments run [job-name]"],
+        commands: ["copilot-experiments validate", "copilot-experiments run [job-name]"],
     },
     {
         id: "job-yaml",
@@ -50,7 +50,7 @@ const structure = [
         owner: "experiment author",
         source: "Harbor/Pier task directories or imported task corpora.",
         why: "Keeps task instructions, environment setup, and verifiers close to the experiment repo.",
-        commands: ["copilot-experiments deepswe-import <source>", "copilot-experiments run --dry-run"],
+        commands: ["copilot-experiments deepswe-import <source>", "copilot-experiments validate"],
     },
     {
         id: "task-dir",
@@ -62,7 +62,7 @@ const structure = [
         owner: "experiment author",
         source: "One task's prompt, environment, and verifier.",
         why: "A Pier job can point to individual tasks or datasets of many tasks.",
-        commands: ["copilot-experiments run --dry-run"],
+        commands: ["copilot-experiments validate"],
     },
     {
         id: "task-instruction",
@@ -122,7 +122,7 @@ const structure = [
         owner: "Pier + harness",
         source: "Generated run outputs. This is now the primary execution tree.",
         why: "Keeps measured executions out of git while preserving all data needed to inspect a run.",
-        commands: ["copilot-experiments list", "copilot-experiments reindex"],
+        commands: ["copilot-experiments list"],
     },
     {
         id: "job-group",
@@ -162,7 +162,7 @@ const structure = [
         owner: "copilot-experiments",
         source: "Small manifest with job_name, run_id, and job/run id.",
         why: "Pier's config sees the concrete run id as job_name; this manifest preserves the stable job identity.",
-        commands: ["copilot-experiments reindex"],
+        commands: [],
     },
     {
         id: "run-config",
@@ -185,7 +185,7 @@ const structure = [
         badge: "Pier",
         owner: "Pier",
         source: "Job-level status, timings, and aggregate Pier stats.",
-        why: "Primary job status signal for show/list/reindex.",
+        why: "Primary job status signal for show and list.",
         commands: ["copilot-experiments show <job-name>/<run-id>"],
     },
     {
@@ -320,45 +320,9 @@ const structure = [
         badge: "derived",
         owner: "copilot-experiments",
         source: "Generated from Pier result files and Copilot-native logs.",
-        why: "Gives the familiar variant/task aggregate shape for show and reports.",
+        why: "Gives the agent/task aggregate shape for show and reports.",
         commands: ["copilot-experiments show <job-name>/<run-id>"],
     },
-    {
-        id: "results",
-        parent: "repo",
-        label: "results/",
-        path: "results/",
-        kind: "derived",
-        badge: "derived",
-        owner: "copilot-experiments",
-        source: "Derived index plus legacy Python experiment runs.",
-        why: "The SQLite index is rebuildable. Legacy run data remains readable during migration.",
-        commands: ["copilot-experiments reindex"],
-    },
-    {
-        id: "index-db",
-        parent: "results",
-        label: "index.db",
-        path: "results/index.db",
-        kind: "derived",
-        badge: "cache",
-        owner: "copilot-experiments",
-        source: "SQLite cache derived from jobs/ and legacy results/.",
-        why: "Speeds up cross-run queries; never the source of truth.",
-        commands: ["copilot-experiments reindex"],
-    },
-    {
-        id: "legacy-results",
-        parent: "results",
-        label: "<experiment>/<run-id>/...",
-        path: "results/<experiment>/<run-id>/",
-        kind: "legacy",
-        badge: "legacy",
-        owner: "legacy harness",
-        source: "Older Python Experiment/Task/Variant runs.",
-        why: "Kept for migration and historical data; new Pier runs use jobs/.",
-        commands: ["copilot-experiments show <run-id>", "copilot-experiments analyze <run-id>"],
-    },
     {
         id: "guidance",
         parent: "repo",
@@ -379,7 +343,7 @@ const flow = [
     ["Run", "copilot-experiments run"],
     ["Concrete output", "jobs/<job-name>/<run-id>/"],
     ["Inspect/analyze", "show | inspect | analyze <job-name>/<run-id>"],
-    ["Derived cache", "results/index.db"],
+    ["Summarize", "summary.json / summary.md"],
 ];
 
 function htmlEscape(value) {
@@ -662,7 +626,6 @@ button {
 .analysis { background: var(--true-color-red, #cf222e); }
 .derived { background: var(--true-color-yellow, #9a6700); }
 .guidance { background: var(--text-color-muted, #57606a); }
-.legacy { background: #8c959f; }
 .root { background: var(--text-color-default, #1f2328); }
 @media (max-width: 980px) {
     .flow,
@@ -707,7 +670,7 @@ button {
 <script>
 const STRUCTURE = ${data};
 const FLOW = ${flowData};
-const KINDS = ["all", "source", "task", "run", "trial", "analysis", "derived", "guidance", "legacy"];
+const KINDS = ["all", "source", "task", "run", "trial", "analysis", "derived", "guidance"];
 let selectedKind = "all";
 let selectedId = "run-dir";
 let query = "";
@@ -747,7 +710,7 @@ function renderFlow() {
 }
 
 function renderLegend() {
-    const kinds = ["source", "task", "run", "trial", "analysis", "derived", "guidance", "legacy"];
+    const kinds = ["source", "task", "run", "trial", "analysis", "derived", "guidance"];
     document.getElementById("legend").innerHTML = kinds.map(function(kind) {
         return '<span><i class="dot ' + kind + '"></i>' + esc(kind) + '</span>';
     }).join("");
@@ -863,7 +826,7 @@ await joinSession({
                     handler: async () => ({
                         layout: "Pier runs live at jobs/<job-name>/<run-id>/.",
                         selector: "Use copilot-experiments list, then pass job-name/run-id to show, inspect, or analyze.",
-                        sourceOfTruth: "jobs/ and legacy results/ on disk; results/index.db is derived.",
+                        sourceOfTruth: "jobs/<job-name>/<run-id>/ on disk; summaries are derived.",
                         nodes: structure.length,
                     }),
                 },
diff --git a/AGENTS.md b/AGENTS.md
index bf5ab62..4206d40 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -17,47 +17,36 @@ plus a [Typer](https://typer.tiangolo.com/) CLI. It is developed with [`uv`](htt
 
 ## Repository map
 - `src/copilot_experiments/` — the package.
-  - `models.py` — pydantic models: `Experiment`, `Task`, `Variant`, `ProviderConfig`, and the
-    result objects (`ExperimentRun` → `VariantResult` → `TaskResult` → `TrialResult`, `Metrics`).
-    An experiment is `Tasks × Variants × Trials`: `task=` is single-task sugar, `tasks=[...]` a
-    suite (`Experiment.iter_tasks()` normalises both to `(task_slug, Task)` pairs).
-  - `invoker.py` — builds and runs the `copilot` command. `CopilotInvoker` shells out to the
-    real CLI; `MockInvoker` simulates a run (used by dry-runs and the test suite).
-  - `workspace.py` — provisions an isolated per-trial workspace (copy a fixture or `git clone`),
-    commits a git baseline, and captures a diff of Copilot's changes.
+  - `models.py` — pydantic models for Copilot session metrics, analysis, and token economics.
   - `sessionlog.py` — locates and parses Copilot's `events.jsonl` into `Metrics`; `extract_economics()`
     pulls token-type counts + AIU cost from `session.shutdown`.
   - `pricing.py` — AIU↔token cost math: documented per-token-type `costPerBatch` defaults, live-rate
     reading from `session.compaction_complete`, and the per-type AIU decomposition.
   - `analysis.py` — derives a rendering-agnostic `SessionAnalysis` (tools, turns, tokens, economics) from session events.
   - `render.py` — renders a `SessionAnalysis` to the terminal with Rich (backs the `analyze` command).
-  - `runner.py` — orchestration: variants × tasks × trials → result artifacts + index. Also
-    `dry_run_experiment()` (ephemeral, validating plumbing check that persists nothing).
-  - `storage.py` — the `results/` filesystem `Layout` and run discovery.
-  - `index.py` — the SQLite index (`results/index.db`) derived from the filesystem.
-  - `report.py` — aggregation, `summary.json`, and `summary.md`.
+  - `pier_backend.py` — discovers and normalizes Pier `JobConfig`s, injects auth, and runs Pier jobs.
+  - `pier_results.py` — reads Pier outputs and derives job/run/agent/task/trial summaries.
+  - `storage.py` — Pier `jobs/<job>/<run-id>/` layout and run discovery.
+  - `report.py` — renders `summary.json` and `summary.md`.
   - `scaffold.py` — `init` logic: render `templates/experiment_repo/` into a new repo.
-  - `cli.py` — the Typer app (`init`, `run`, `list`, `show`, `analyze`, `inspect`, `reindex`).
+  - `cli.py` — the Typer app (`init`, `deepswe-import`, `validate`, `run`, `list`, `show`,
+    `analyze`, `inspect`).
   - `templates/experiment_repo/` — package-data template for scaffolded experiment repos.
 - `examples/tracer_bullet/` — a committed, runnable multi-turn example experiment (textstats).
 - `examples/task_suite/` — a committed multi-task example (strtools + csvtools) exercising the
   task axis and its mean-success / resolved@k suite-coverage metrics.
-- `sandbox/` — local scratch space for exercising the lib/CLI (its `results/` are gitignored).
-- `tests/` — pytest suite (uses `MockInvoker`; **never** requires a real `copilot` or network).
+- `sandbox/` — local scratch space for exercising the lib/CLI (generated outputs are gitignored).
+- `tests/` — pytest suite (uses fixtures/mocks; **never** requires a real `copilot` or network).
 - `docs/` — architecture, authoring guide, analysis (`analyze`), results-format reference,
   BYOK/local-models guide, and `docs/adr/` (architecture decision records).
 
 ## Architecture invariants (keep these true)
-- **The filesystem is the source of truth.** `results/index.db` is a derived, rebuildable
-  cache — `reindex` must always be able to recreate it by scanning `results/`.
-- **Secrets never hit disk.** Store variants via `Variant.stored()` / `ProviderConfig.redacted()`,
-  which mask `api_key` / `bearer_token`.
-- **Tests stay offline.** Anything exercising the runner uses `MockInvoker` (optionally with a
-  `solver` callback) and a temp `--root`. Do not call the real Copilot CLI in tests.
-- **Dry-runs are ephemeral.** `dry_run_experiment()` runs the whole pipeline with `MockInvoker`
-  in a `tempfile.mkdtemp()` dir (results + session state both redirected there), validates each
-  stage produced its artifact, then deletes the temp dir. Nothing is persisted under the repo;
-  `run_experiment()` has no `dry_run` parameter.
+- **The filesystem is the source of truth.** Pier runs live under `jobs/<job>/<run-id>/`; summaries
+  are derived from that tree.
+- **Secrets never hit disk.** Auth is injected into Pier jobs at run time and redacted from persisted
+  configs.
+- **Tests stay offline.** Use Pier config/job-output fixtures and mocks. Do not call the real
+  Copilot CLI, Docker, or the network in tests.
 - **Templates are data.** Files under `templates/experiment_repo/` are shipped as package data;
   `.tmpl` files are rendered with `{{placeholder}}` substitution and lose the suffix.
 
@@ -66,7 +55,7 @@ plus a [Typer](https://typer.tiangolo.com/) CLI. It is developed with [`uv`](htt
 - `--model` / `--effort` / `--agent` / `--mode`, `--output-format json`, `--session-id <uuid>`,
   `--log-dir`, `-C <dir>`.
 - Session logs land at `~/.copilot/session-state/<session-id>/events.jsonl` — the metrics source.
-- **BYOK** is env-driven (`COPILOT_PROVIDER_*`); a variant is just flags + env.
+- **BYOK** is env-driven (`COPILOT_PROVIDER_*`), set via the `copilot-cli` Pier agent kwargs/env.
 
 ## Workflow
 ```bash
@@ -77,7 +66,8 @@ uv run ruff check .                  # verify lint
 uv run pytest -q                     # test
 # End-to-end smoke test against the sandbox:
 uv run copilot-experiments init sandbox/demo --force
-uv run copilot-experiments run --root sandbox/demo --dry-run
+uv run copilot-experiments validate --root sandbox/demo
+uv run copilot-experiments run --root sandbox/demo
 uv run copilot-experiments show --last --root sandbox/demo
 ```
 For local CLI testing point `--root` at a scaffolded dir in `sandbox/` rather than `uv sync`-ing
diff --git a/README.md b/README.md
index e602e70..bc2949d 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,6 @@ flowchart LR
     P --> O["jobs/<job>/<run-id>/\nPier result.json + trials"]
     S --> C["Copilot-native analysis\nAIU, tokens, tools, turns"]
     O --> R["summary.json / summary.md\nshow / inspect / analyze"]
-    O --> I["results/index.db\nderived SQLite index"]
 ```
 
 - **Tasks** are Harbor/Pier task directories: `task.toml`, `instruction.md`, `environment/`,
@@ -41,8 +40,8 @@ uv run copilot-experiments init my-experiments
 cd my-experiments
 uv sync
 
-# validate Pier job configs without starting a sandbox
-uv run copilot-experiments run --dry-run
+# validate Pier job configs, paths, auth, and backend setup
+uv run copilot-experiments validate
 
 # run for real through Pier
 uv run copilot-experiments run
@@ -60,7 +59,7 @@ export COPILOT_EXPERIMENTS_REPO=/path/to/github-copilot-lab
 
 uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments init my-experiments
 cd my-experiments
-uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments run --dry-run
+uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments validate
 uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments list
 uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments show --last
 ```
@@ -88,7 +87,7 @@ selectors use `job-name/run-id`; passing just `job-name` selects that job's late
 ## Bundled examples
 
 ```bash
-uv run copilot-experiments run --root examples/tracer_bullet --dry-run
+uv run copilot-experiments validate --root examples/tracer_bullet
 uv run copilot-experiments run --root examples/tracer_bullet
 uv run copilot-experiments analyze --root examples/tracer_bullet --last
 ```
@@ -102,14 +101,13 @@ uv run copilot-experiments analyze --root examples/tracer_bullet --last
 | --- | --- |
 | `init <dir>` | Scaffold a standalone Pier experiment repository. |
 | `deepswe-import <path>` | Generate a Pier job config for a cloned DeepSWE checkout, `tasks/` corpus, or single task. |
-| `run [name]` | Discover Pier job configs in `experiments/` and run them. Each run writes to a fresh `jobs/<job>/<run-id>/` directory. Falls back to legacy Python experiments when no Pier configs exist. |
-| `run --dry-run` | Validate Pier job configs, or run the legacy ephemeral mock dry-run for legacy experiments. |
+| `validate [name]` | Validate Pier job configs, referenced task/dataset paths, auth, and backend setup without creating a run. |
+| `run [name]` | Discover Pier job configs in `experiments/` and run them. Each run writes to a fresh `jobs/<job>/<run-id>/` directory. |
 | `run --resume` | Resume an existing Pier job directory and skip already-completed matching trials. |
-| `list` | List Pier job configs, legacy experiments, and copyable run selectors. |
-| `show <selector>` / `show --last` | Print a summary for a Pier run (`job` or `job/run`) or legacy run id. |
-| `analyze <selector>` / `analyze --last` / `analyze --file <events.jsonl>` | Render a rich overview of a native Copilot session log. |
-| `inspect <selector>` | Drill into stored trials and status for a Pier run (`job` or `job/run`) or legacy run id. |
-| `reindex` | Rebuild the derived SQLite index from `jobs/` and legacy `results/`. |
+| `list` | List Pier job configs and copyable run selectors. |
+| `show <selector>` / `show --last` | Print a summary for a Pier run (`job` or `job/run`). |
+| `inspect <selector>` / `inspect --last` | Drill into stored trials by `--agent`, `--task`, and `--trial`. |
+| `analyze <selector>` / `analyze --last` / `analyze --file <events.jsonl>` | Render a rich overview of a selected Copilot session log. |
 
 ## Documentation
 
@@ -117,7 +115,7 @@ uv run copilot-experiments analyze --root examples/tracer_bullet --last
 - [`docs/authoring-experiments.md`](docs/authoring-experiments.md) - task and job authoring.
 - [`docs/deepswe.md`](docs/deepswe.md) - importing and running DeepSWE tasks through Pier.
 - [`docs/collecting-run-data.md`](docs/collecting-run-data.md) - everything to collect around a Copilot CLI run, including native `events.jsonl`, Pier artifacts, ATIF, and OTel.
-- [`docs/results-format.md`](docs/results-format.md) - `jobs/` layout and derived index.
+- [`docs/results-format.md`](docs/results-format.md) - Pier `jobs/` layout and derived summaries.
 - [`docs/analysis.md`](docs/analysis.md) - native Copilot session analysis.
 - [`docs/byok-and-local-models.md`](docs/byok-and-local-models.md) - provider env for Copilot CLI.
 - [`docs/adr/`](docs/adr) - architecture decision records.
diff --git a/docs/adr/0003-sqlite-derived-index.md b/docs/adr/0003-sqlite-derived-index.md
index f0749bb..a84c6a6 100644
--- a/docs/adr/0003-sqlite-derived-index.md
+++ b/docs/adr/0003-sqlite-derived-index.md
@@ -1,6 +1,6 @@
 # 0003. A derived SQLite index for cross-run queries
 
-- **Status:** Accepted
+- **Status:** Superseded by ADR-0020
 - **Date:** 2026-06-14
 
 ## Context
diff --git a/docs/adr/0019-use-nested-pier-run-directories.md b/docs/adr/0019-use-nested-pier-run-directories.md
index 1f49d28..745cedc 100644
--- a/docs/adr/0019-use-nested-pier-run-directories.md
+++ b/docs/adr/0019-use-nested-pier-run-directories.md
@@ -15,8 +15,8 @@ That mixed stable identity and concrete execution identity in one string. It als
 lookup unclear: users could pass `--last`, but it was not obvious how to discover a run id, how to
 select an earlier run, or whether a suffixed directory was a new job or a rerun of the same job.
 
-The filesystem remains the source of truth, and `results/index.db` remains a derived cache. Existing
-flat Pier job directories must remain readable during migration.
+The filesystem remains the source of truth. Earlier plans kept `results/index.db` as a derived cache
+and retained flat Pier job directories during migration; ADR-0020 supersedes that compatibility path.
 
 ## Decision
 
@@ -34,7 +34,7 @@ The CLI will expose copyable selectors through `copilot-experiments list`:
 - `job-name` selects the latest run for that Pier job.
 - `--last` selects the most recent stored run overall.
 
-Legacy flat Pier jobs at `jobs/<job-name>/` remain discoverable and resumable.
+Only nested Pier jobs with `copilot-experiments-run.json` are discoverable and resumable.
 
 ## Consequences
 
@@ -47,5 +47,5 @@ The harness owns a small manifest file in each new Pier run directory because Pi
 mistaking legacy flat job trial directories for nested runs; nested child directories under a legacy
 flat job are treated as runs only when they contain the harness manifest.
 
-Older flat jobs remain supported, but new documentation and generated experiment repos should teach
-the nested layout and `list`-driven selector workflow.
+Older flat jobs are not supported after the Pier-only cleanup (ADR-0020). Documentation and
+generated experiment repos teach the nested layout and `list`-driven selector workflow.
diff --git a/docs/adr/0020-remove-legacy-native-harness.md b/docs/adr/0020-remove-legacy-native-harness.md
new file mode 100644
index 0000000..a485d99
--- /dev/null
+++ b/docs/adr/0020-remove-legacy-native-harness.md
@@ -0,0 +1,44 @@
+# 0020. Remove the legacy native harness
+
+- **Status:** Accepted
+- **Date:** 2026-06-28
+- **Deciders:** Project maintainers
+
+## Context
+
+The project had two overlapping execution models:
+
+- the original native Python harness, with `Experiment`, `Task`, `Variant`, `run_experiment()`,
+  mock/dry-run execution, `results/<experiment>/<run>/`, and a derived SQLite index; and
+- Pier jobs, with `JobConfig`, `agents:`, tasks/datasets, attempts, and `jobs/<job>/<run-id>/`.
+
+Keeping both models made the CLI hard to explain. Users configured Pier `agents:` but then had to
+look for "variants", use raw `--trial` selectors to find a particular agent result, and learn
+whether `run --dry-run` meant a mock execution or a Pier config-load check.
+
+## Decision
+
+`copilot-experiments` is Pier-only. We remove the native `Experiment`/`Task`/`Variant` runner,
+workspace/invoker abstractions, old `results/` layout, SQLite index, `reindex`, and `run --dry-run`.
+
+The active vocabulary is:
+
+- **Job config**: Pier YAML/JSON under `experiments/`.
+- **Job**: stable `job_name`.
+- **Run**: concrete execution at `jobs/<job-name>/<run-id>/`.
+- **Agent**: one Pier `agents:` entry and the comparison axis.
+- **Task**: one task or dataset-expanded task.
+- **Trial**: one attempt of an `(agent, task)` cell.
+
+The CLI remains flat but speaks this vocabulary: `validate`, `run`, `list`, `show`, `inspect`, and
+`analyze`. `validate` is a preflight, not a fake run: it loads Pier job configs, checks referenced
+paths, runs backend preflights, and checks Copilot auth without creating a run directory.
+
+## Consequences
+
+- Old native experiment definitions and old `results/` trees are no longer readable by active CLI
+  commands.
+- `jobs/<job-name>/<run-id>/` is the only persisted execution layout.
+- Cross-run discovery scans `jobs/` directly instead of using `results/index.db`.
+- Summaries aggregate by agent, task, and trial rather than adapting agents into variants.
+- Existing ADRs about the SQLite index and dry-run semantics are superseded for current behavior.
diff --git a/docs/adr/README.md b/docs/adr/README.md
index d2d933c..8791c59 100644
--- a/docs/adr/README.md
+++ b/docs/adr/README.md
@@ -19,7 +19,7 @@ We follow the lightweight format popularized by
 | --- | --- | --- |
 | [0001](0001-record-architecture-decisions.md) | Record architecture decisions | Accepted |
 | [0002](0002-filesystem-is-source-of-truth.md) | The filesystem is the source of truth | Accepted; amended by ADR-0015 |
-| [0003](0003-sqlite-derived-index.md) | A derived SQLite index for cross-run queries | Accepted |
+| [0003](0003-sqlite-derived-index.md) | A derived SQLite index for cross-run queries | Superseded by ADR-0020 |
 | [0004](0004-session-log-is-primary-data-source.md) | The Copilot session log is the primary data source | Accepted |
 | [0005](0005-mock-invoker-for-offline-tests.md) | A MockInvoker keeps the harness offline-testable | Superseded by ADR-0015 for Pier runs |
 | [0006](0006-separate-analysis-data-from-rendering.md) | Separate analysis data from its rendering | Accepted |
@@ -36,3 +36,4 @@ We follow the lightweight format popularized by
 | [0017](0017-import-deepswe-as-pier-dataset.md) | Import DeepSWE as a Pier dataset config | Accepted |
 | [0018](0018-adopt-pytest-cov-for-local-coverage-analysis.md) | Adopt pytest-cov for local coverage analysis | Accepted |
 | [0019](0019-use-nested-pier-run-directories.md) | Use nested Pier run directories | Accepted |
+| [0020](0020-remove-legacy-native-harness.md) | Remove the legacy native harness | Accepted |
diff --git a/docs/analysis.md b/docs/analysis.md
index 3e4b366..50d6d4f 100644
--- a/docs/analysis.md
+++ b/docs/analysis.md
@@ -4,12 +4,11 @@ After a Pier trial runs with the local `copilot-cli` agent, the job output keeps
 Copilot CLI **session log** (`agent/copilot-session/**/events.jsonl`). `copilot-experiments`
 derives two views from that raw log:
 
-- **Flat metrics** — counters used for `summary.json`, `show`, and the SQLite index.
+- **Flat metrics** — counters used for `summary.json` and `show`.
 - **`SessionAnalysis`** — a richer, structured overview of *what happened* in the session,
   rendered by `analyze`.
 
-Legacy Python runs also persist these views as per-trial `metrics.json` and `analysis.json`; Pier
-runs derive them from the canonical job artifacts on demand.
+Pier runs derive these views from the canonical job artifacts on demand.
 
 This page covers the second one and the `analyze` command that renders it.
 
@@ -23,17 +22,17 @@ This page covers the second one and the `analyze` command that renders it.
 ## The `analyze` command
 
 ```bash
-# Most recent Pier job (first trial by default)
-uv run copilot-experiments analyze --last
+# Most recent Pier run; add selectors when multiple trials match
+uv run copilot-experiments analyze --last --agent copilot-cli --trial 1
 
 # Discover copyable selectors
 uv run copilot-experiments list
 
 # A specific Pier job's latest run / trial
-uv run copilot-experiments analyze tracer-bullet-textstats --trial 1
+uv run copilot-experiments analyze tracer-bullet-textstats --agent copilot-cli --trial 1
 
 # A specific Pier run / trial
-uv run copilot-experiments analyze tracer-bullet-textstats/20260620-153000 --trial 1
+uv run copilot-experiments analyze tracer-bullet-textstats/20260620-153000 --agent copilot-cli --trial 1
 
 # Any events.jsonl on disk — a stored trial log, or a live session under
 # ~/.copilot/session-state/<id>/events.jsonl
@@ -104,8 +103,8 @@ Warnings, if any, are shown in a panel at the bottom.
 ## The `SessionAnalysis` model
 
 `analyze_events(events) -> SessionAnalysis` ([`analysis.py`](../src/copilot_experiments/analysis.py))
-produces plain pydantic data (no formatting), so the same object backs the CLI renderer, the
-legacy `analysis.json`, and any future consumer.
+produces plain pydantic data (no formatting), so the same object backs the CLI renderer and any
+future consumer.
 
 | Field | Meaning |
 | --- | --- |
diff --git a/docs/architecture.md b/docs/architecture.md
index 938b656..db8c1e1 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -1,8 +1,8 @@
 # Architecture
 
-`copilot-experiments` is now a thin integration layer around Pier. Pier provides the execution
+`copilot-experiments` is a thin integration layer around Pier. Pier provides the execution
 substrate; this package contributes a GitHub Copilot CLI installed agent, Copilot-native session
-analysis, a small CLI, templates, and derived reporting/indexing.
+analysis, a small CLI, templates, and derived reporting.
 
 ## Pipeline
 
@@ -24,7 +24,6 @@ flowchart TD
     OTEL --> ANALYSIS
     ATIF --> FALLBACK["ATIF fallback metrics"]
     OUT --> SUMMARY["pier_results.py\nsummary.json / summary.md"]
-    OUT --> INDEX["index.py\nresults/index.db"]
 ```
 
 ## Main modules
@@ -33,17 +32,15 @@ flowchart TD
 | --- | --- |
 | `pier_agents/copilot_cli.py` | Pier `BaseInstalledAgent` that installs and runs the real Copilot CLI, captures native session logs, and emits ATIF. |
 | `pier_backend.py` | Discovers and normalizes Pier `JobConfig` YAML/JSON, maps `name: copilot-cli` to the local import path, injects Copilot auth, and calls Pier's Python API. |
-| `pier_results.py` | Reads Pier job directories and adapts them into the existing summary/show/analyze shape. |
+| `pier_results.py` | Reads Pier job directories and derives job/run/agent/task/trial summaries. |
 | `sessionlog.py` | Parses native Copilot `events.jsonl` into flat metrics, including AIU/token economics. |
 | `analysis.py` / `render.py` | Builds and renders a richer session analysis view from native Copilot events. |
-| `storage.py` | Locates canonical Pier `jobs/` plus legacy `results/`. |
-| `index.py` | Rebuildable SQLite index over Pier jobs and legacy runs. |
+| `storage.py` | Locates canonical Pier `jobs/<job>/<run-id>/` directories. |
 | `scaffold.py` | Renders a Pier-first experiment repository template. |
-| `cli.py` | Typer CLI for init/run/list/show/inspect/analyze/reindex. |
+| `cli.py` | Typer CLI for init/deepswe-import/validate/run/list/show/inspect/analyze. |
 
-Legacy `Experiment`, `Task`, `Variant`, `runner.py`, `workspace.py`, and `invoker.py` remain as a
-compatibility path when an experiment repository has no Pier job configs. New work should use Pier
-tasks and jobs.
+Legacy native `Experiment`, `Task`, `Variant`, `runner.py`, `workspace.py`, `invoker.py`, and the
+SQLite index have been removed from active code paths. All execution goes through Pier jobs.
 
 ## Copilot CLI installed agent
 
@@ -76,8 +73,7 @@ During normalization, `name: copilot-cli` becomes
 ## Design invariants
 
 1. **Pier jobs are canonical.** `jobs/<job>/<run-id>/` is the primary source of truth for new runs.
-2. **SQLite is derived.** `results/index.db` can be rebuilt from `jobs/` and legacy `results/`.
-3. **Copilot logs are primary for Copilot metrics.** ATIF is a fallback and cross-agent view.
-4. **Copilot CLI is not reimplemented.** The installed agent shells out to the real CLI.
-5. **Tests stay offline.** Unit tests use config and job fixtures, not Docker or real Copilot.
-6. **Secrets stay out of persisted config.** Auth is injected at run time via environment.
+2. **Copilot logs are primary for Copilot metrics.** ATIF is a fallback and cross-agent view.
+3. **Copilot CLI is not reimplemented.** The installed agent shells out to the real CLI.
+4. **Tests stay offline.** Unit tests use config and job fixtures, not Docker or real Copilot.
+5. **Secrets stay out of persisted config.** Auth is injected at run time via environment.
diff --git a/docs/authoring-experiments.md b/docs/authoring-experiments.md
index 3f749f6..7f67874 100644
--- a/docs/authoring-experiments.md
+++ b/docs/authoring-experiments.md
@@ -18,7 +18,6 @@ tasks/
       test.sh
       test_calculator.py
 jobs/       # Pier outputs, gitignored
-results/    # derived SQLite index, gitignored
 ```
 
 ## Task directory
@@ -119,8 +118,7 @@ Useful knobs:
 ## DeepSWE task corpora
 
 DeepSWE tasks already use the Harbor/Pier task format, including separate verifier environments.
-Do not convert them into legacy Python `Task`s. Generate a Pier job config that points at the
-DeepSWE checkout instead:
+Generate a Pier job config that points at the DeepSWE checkout:
 
 ```bash
 git clone https://github.com/datacurve-ai/deep-swe vendor/deep-swe
@@ -137,11 +135,12 @@ The generated config uses `datasets:` for a corpus and `tasks:` for a single tas
 ## Workflow
 
 ```bash
-uv run copilot-experiments run --dry-run
+uv run copilot-experiments validate
 uv run copilot-experiments run
 uv run copilot-experiments list
 uv run copilot-experiments show --last
-uv run copilot-experiments analyze --last --trial 1
+uv run copilot-experiments inspect --last
+uv run copilot-experiments analyze --last --agent copilot-cli --trial 1
 ```
 
 If you are working from a standalone experiment repo and want to use a local checkout of the
@@ -151,11 +150,11 @@ If you are working from a standalone experiment repo and want to use a local che
 ```bash
 export COPILOT_EXPERIMENTS_REPO=/path/to/github-copilot-lab
 
-uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments run --dry-run
+uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments validate
 uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments run
 uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments list
 uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments show --last
-uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments analyze --last --trial 1
+uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments analyze --last --agent copilot-cli --trial 1
 ```
 
 In PowerShell, use
@@ -163,9 +162,8 @@ In PowerShell, use
 `--from $env:COPILOT_EXPERIMENTS_REPO`. If you are iterating on the tool and need to force uv to
 rebuild from the working tree, add `--no-cache` before `--from`.
 
-`--dry-run` validates Pier configs and path normalization without starting a sandbox. The legacy
-Python experiment path still has an ephemeral mock dry-run, but Pier is the primary authoring
-model.
+`validate` checks Pier config loading, referenced task/dataset paths, backend availability, and
+Copilot auth without creating a run directory.
 
 `run` performs a lightweight backend preflight before Pier creates a job. For the default Docker
 backend it verifies that `docker`, `docker compose`, and the Docker daemon are reachable; this catches
@@ -181,7 +179,5 @@ After a run, `copilot-experiments list` prints copyable selectors. Use `job-name
 or analyze an exact Pier execution, `job-name` for that job's latest run, or `--last` for the most
 recent stored run across all jobs.
 
-## Legacy Python experiments
-
-The old `Experiment`, `Task`, and `Variant` API remains temporarily for migration and tests. It is
-used only when no Pier configs are found in `experiments/`. Do not use it for new experiment repos.
+`run` always executes Pier jobs. Native Python `Experiment`/`Task`/`Variant` experiments are no
+longer supported by the CLI.
diff --git a/docs/collecting-run-data.md b/docs/collecting-run-data.md
index c990dc4..df31fe1 100644
--- a/docs/collecting-run-data.md
+++ b/docs/collecting-run-data.md
@@ -20,15 +20,10 @@ native Copilot events.
 | `verifier/reward.txt` / `reward.json` | Always | Distinguishes "Copilot ran" from "the task was solved". |
 | Pier `artifacts/` | For code tasks | Captures requested files or directories from the final sandbox state. |
 | `summary.json` / `summary.md` | Derived | Human and machine summaries generated by `copilot-experiments`; rebuildable from job artifacts. |
-| `results/index.db` | Derived | SQLite cache over `jobs/` and legacy `results/`; rebuildable with `reindex`. |
 | OTel traces/metrics | Default local file for Copilot agent runs | Standard live observability for agent, LLM, and tool spans. The harness preserves a local file by default when no explicit OTLP destination is configured, giving per-LLM-call input/output/cache-write/nano-AIU data that native session events only aggregate at shutdown. |
 | Copilot `--share` markdown | Optional | Human-readable transcript. Not emitted by the Pier agent by default, but usable in standalone runs or via extra CLI args. |
 | Copilot debug logs (`--log-dir`, `--log-level`) | Debug only | Useful for CLI internals, exporter startup, auth, MCP, or permissions. Treat as sensitive and bulky. |
 
-Legacy Python experiments still use the old `results/<experiment>/<run>/.../trials/<NNN>/`
-layout with `events.jsonl`, `stdout.txt`, `session.md`, `metrics.json`, `analysis.json`,
-`workspace.diff`, and `verify.json`. New work should use Pier jobs.
-
 ## Publicly documented vs observed
 
 The public GitHub docs document what Copilot CLI is and how to run it interactively or
@@ -103,7 +98,7 @@ jobs/
         artifacts/
 ```
 
-`show`, `analyze`, `inspect`, and `reindex` read this tree. For Copilot trials they prefer native
+`show`, `analyze`, and `inspect` read this tree. For Copilot trials they prefer native
 `events.jsonl`; if no native events exist, `analyze` and summaries can fall back to
 `agent/trajectory.json`.
 
@@ -535,8 +530,7 @@ server connection attempts by transport and outcome.
 OTel is best for live, cross-session observability and standard backend integrations.
 `events.jsonl` remains the richer per-session forensic artifact, but OTel is the only observed
 surface that currently exposes per-LLM-call input/cache-write/AIU fields. `analyze` auto-discovers
-`agent/copilot-otel.jsonl` for Pier jobs and `copilot-otel.jsonl` in legacy trial directories; for
-direct file analysis, pass it explicitly:
+`agent/copilot-otel.jsonl` for Pier jobs; for direct file analysis, pass it explicitly:
 
 ```bash
 uv run copilot-experiments analyze --file events.jsonl --otel-file copilot-otel.jsonl
@@ -549,7 +543,8 @@ Pier, Copilot, ATIF, and OTel data:
 
 ```json
 {
-  "job_name": "example-fix-bug-20260620-151500",
+  "job_name": "example-fix-bug",
+  "run_id": "20260620-151500",
   "trial_name": "copilot-cli__example-fix-bug__1",
   "session_id": "0cb916db-26aa-40f2-86b5-1ba81b225fd2",
   "copilot_version": "1.0.64-0",
@@ -558,8 +553,8 @@ Pier, Copilot, ATIF, and OTel data:
   "reasoning_effort": "low",
   "mode": "autopilot",
   "artifacts": {
-    "pier_job": "jobs/example-fix-bug-20260620-151500/result.json",
-    "pier_trial": "jobs/example-fix-bug-20260620-151500/copilot-cli__example-fix-bug__1/result.json",
+    "pier_job": "jobs/example-fix-bug/20260620-151500/result.json",
+    "pier_trial": "jobs/example-fix-bug/20260620-151500/copilot-cli__example-fix-bug__1/result.json",
     "events": "agent/copilot-session/0cb916db-26aa-40f2-86b5-1ba81b225fd2/events.jsonl",
     "cli_jsonl": "agent/copilot-cli.jsonl",
     "cli_text": "agent/copilot-cli.txt",
@@ -572,7 +567,7 @@ Pier, Copilot, ATIF, and OTel data:
 
 Minimum retention policy: keep the Pier job and trial `result.json` files, native
 `events.jsonl`, raw CLI streams, `trajectory.json`, verifier output, and requested artifacts.
-`summary.json`, `summary.md`, and `results/index.db` are useful but derived.
+`summary.json` and `summary.md` are useful but derived.
 
 ## Collection checklist
 
@@ -582,7 +577,7 @@ Minimum retention policy: keep the Pier job and trial `result.json` files, nativ
 4. Keep `agent/copilot-cli.jsonl` and `.txt` for diagnostics.
 5. Keep `agent/trajectory.json` for ATIF and fallback analysis.
 6. Keep verifier rewards and requested artifacts.
-7. Generate summaries and reindex from the job tree.
+7. Generate summaries from the job tree.
 8. Keep the default `agent/copilot-otel.jsonl`, or configure an OTLP collector / explicit file path
    and add Pier identifiers through `OTEL_RESOURCE_ATTRIBUTES`.
 9. Redact secrets and review raw logs before sharing any artifacts.
diff --git a/docs/deepswe.md b/docs/deepswe.md
index 5b36fac..787545e 100644
--- a/docs/deepswe.md
+++ b/docs/deepswe.md
@@ -59,11 +59,12 @@ datasets:
 Validate and run it like any other Pier experiment:
 
 ```bash
-uv run copilot-experiments run --dry-run
+uv run copilot-experiments validate
 uv run copilot-experiments run deepswe-smoke
 uv run copilot-experiments list
 uv run copilot-experiments show --last
-uv run copilot-experiments analyze --last --trial 1
+uv run copilot-experiments inspect --last
+uv run copilot-experiments analyze --last --agent copilot-cli --trial 1
 ```
 
 ## Selecting tasks
diff --git a/docs/results-format.md b/docs/results-format.md
index 06e533a..10b6a45 100644
--- a/docs/results-format.md
+++ b/docs/results-format.md
@@ -1,7 +1,8 @@
 # Results format
 
-For new runs, Pier job directories under `jobs/` are the filesystem source of truth. The SQLite
-database under `results/index.db` is a derived cache.
+Pier job directories under `jobs/` are the filesystem source of truth. `copilot-experiments`
+derives summaries on demand from Pier results and Copilot-native logs; there is no separate result
+index to rebuild.
 
 For a source-by-source explanation of what can be captured around a Copilot CLI run, see
 [Collecting data from a Copilot CLI run](collecting-run-data.md).
@@ -36,67 +37,53 @@ jobs/
 
 Pier owns `config.json`, `result.json`, trial directories, logs, verifier outputs, and artifact
 download. `copilot-experiments` adds `copilot-experiments-run.json` to preserve the stable
-`job_name` plus concrete `run_id`, then derives summaries and indexes from that tree.
+`job_name` plus concrete `run_id`, then derives summaries from that tree.
 
 ## Key files
 
 | File | Meaning |
 | --- | --- |
 | `jobs/<job>/<run-id>/result.json` | Pier job-level status and stats for one execution. |
-| `jobs/<job>/<run-id>/copilot-experiments-run.json` | Stable job name and concrete run id used by summaries, lookup, and indexing. |
+| `jobs/<job>/<run-id>/copilot-experiments-run.json` | Stable job name and concrete run id used by summaries and lookup. |
 | `jobs/<job>/<run-id>/<trial>/result.json` | Pier trial status, agent info, verifier result, exceptions, timings. |
 | `agent/trajectory.json` | ATIF trajectory emitted by the installed agent. Copilot agent steps include OTel per-LLM-call metrics when `copilot-otel.jsonl` is available; the file is also used as a fallback for non-Copilot agents. |
 | `agent/copilot-cli.jsonl` / `.txt` | Raw Copilot CLI output streams. Useful for auth or CLI failures. |
 | `agent/copilot-session/**/events.jsonl` | Native Copilot session log. Primary source for Copilot turns, tool calls, tokens, AIU, and analysis. |
 | `agent/copilot-otel.jsonl` | Copilot OTel file-exporter output, captured by default for Copilot agent runs unless custom OTel destination settings override it. Useful for per-LLM-call spans with input/output/cache-write/nano-AIU details. |
 | `verifier/reward.txt` / `.json` | Pier verifier reward. Positive reward means solved. |
-| `summary.json` / `summary.md` | Derived summary in the familiar variant/task aggregate shape. |
+| `summary.json` / `summary.md` | Derived agent/task aggregate summary. |
 
 Pier jobs do not persist per-trial `metrics.json` or `analysis.json` files. Those views are
 derived from `agent/copilot-session/**/events.jsonl` (or `agent/trajectory.json` as a fallback)
-when `show`, `analyze`, `inspect`, or `reindex` runs. Legacy Python runs still keep those files in
-their `results/<experiment>/<run>/.../trials/<NNN>/` layout.
+when `show`, `analyze`, or `inspect` runs.
 
 ## Summary shape
 
 `summary.json` contains:
 
-- job identity and status (`run_id`, `experiment`, `started_at`, `finished_at`);
-- aggregate counts (`n_variants`, `n_tasks`, `n_trials`, failures);
+- job identity and status (`job`, `job_name`, `run_id`, `started_at`, `finished_at`);
+- aggregate counts (`n_agents`, `n_tasks`, `n_trials`, failures);
 - `overall_success_rate` from verifier rewards;
-- one entry per agent/model variant;
-- one task aggregate per variant;
+- one entry per Pier agent;
+- one task aggregate per agent;
 - Copilot-native token/AIU/tool metrics when native events are available;
 - nullable fallback metrics for non-Copilot agents.
 
-## SQLite index
-
-`reindex` rebuilds `results/index.db` from both `jobs/` and legacy `results/`.
-
-New Pier tables:
-
-```sql
-pier_jobs(id PK, job_name, run_id, job_dir, started_at, finished_at, n_trials,
-          success_rate, status)
-pier_trials(id PK, job_id, job_name, run_id, variant_slug, task_slug, trial_name,
-            success, status, n_turns, n_tool_calls, total_tokens, aiu, model, error)
-```
-
-Legacy tables (`experiments`, `runs`, `variants`, `tasks`, `trials`) remain for old Python runs.
-
 ## Analyzing a trial
 
 ```bash
 uv run copilot-experiments list
-uv run copilot-experiments analyze --last --trial 1
-uv run copilot-experiments analyze <job-name> --trial 1
-uv run copilot-experiments analyze <job-name>/<run-id> --trial 1
+uv run copilot-experiments analyze --last --agent copilot-cli --trial 1
+uv run copilot-experiments analyze <job-name> --agent copilot-cli --trial 1
+uv run copilot-experiments analyze <job-name>/<run-id> --agent copilot-cli --trial 1
 uv run copilot-experiments analyze --file jobs/<job>/<run-id>/<trial>/agent/copilot-session/.../events.jsonl
 ```
 
 `list` is the discovery command for run ids. For Pier outputs, its `selector (job/run)` column is
 the exact string accepted by `show`, `inspect`, and `analyze`. Passing only `<job-name>` selects
-that job's latest run; passing `<job-name>/<run-id>` selects one concrete execution.
+that job's latest run; passing `<job-name>/<run-id>` selects one concrete execution. Use
+`inspect <selector>` to discover exact `--agent`, `--task`, and `--trial` values before calling
+`analyze`.
 
 If the selected Pier trial has no native Copilot `events.jsonl`, `analyze` falls back to
 `agent/trajectory.json` when present; otherwise it reports that no Copilot session log or
diff --git a/examples/task_suite/README.md b/examples/task_suite/README.md
index 4fee794..e222183 100644
--- a/examples/task_suite/README.md
+++ b/examples/task_suite/README.md
@@ -1,6 +1,6 @@
 # Task-suite example
 
-A Pier-native job that runs one cheap Copilot variant across two tasks of different
+A Pier-native job that runs one cheap Copilot agent across two tasks of different
 difficulty:
 
 | Task | Directory | Difficulty | What the model must do |
@@ -13,7 +13,7 @@ difficulty:
 From the repository root:
 
 ```bash
-uv run copilot-experiments run  --root examples/task_suite --dry-run
+uv run copilot-experiments validate --root examples/task_suite
 uv run copilot-experiments run  --root examples/task_suite
 uv run copilot-experiments show --root examples/task_suite --last
 ```
@@ -23,7 +23,6 @@ The job config is `experiments/suite.yaml`. It pins `gpt-5-mini` at `low` effort
 
 ## What to expect
 
-Pier writes one canonical job directory under `jobs/task-suite-strtools-csvtools/`, with a
-trial directory for every `agent x task x attempt` cell. `copilot-experiments show` adapts
-those Pier outputs into the familiar per-variant/per-task summary, while `analyze` reads the
-native Copilot `events.jsonl` from a selected trial.
+Pier writes concrete executions under `jobs/task-suite-strtools-csvtools/<run-id>/`, with a
+trial directory for every `agent x task x attempt` cell. `copilot-experiments show` derives an
+agent/task summary, while `analyze` reads the native Copilot `events.jsonl` from a selected trial.
diff --git a/examples/tracer_bullet/README.md b/examples/tracer_bullet/README.md
index 4e0faa9..fdf9dac 100644
--- a/examples/tracer_bullet/README.md
+++ b/examples/tracer_bullet/README.md
@@ -10,7 +10,7 @@ From the repository root:
 
 ```bash
 # Validate the Pier JobConfig without starting a sandbox.
-uv run copilot-experiments run --root examples/tracer_bullet --dry-run
+uv run copilot-experiments validate --root examples/tracer_bullet
 
 # Real run through Pier. Requires Copilot auth and a supported Pier backend.
 uv run copilot-experiments run     --root examples/tracer_bullet
@@ -22,13 +22,11 @@ The job pins `gpt-5-mini` at `low` reasoning effort in
 `experiments/textstats.yaml` so the smoke test stays inexpensive. Change that YAML to compare
 models, efforts, or attempts.
 
-Re-running the command creates a fresh timestamped job if `jobs/tracer-bullet-textstats/` already
-exists. Use `--resume` only to continue an interrupted Pier job and intentionally skip completed
-trials.
+Re-running the command creates a fresh timestamped run under `jobs/tracer-bullet-textstats/`.
+Use `--resume` only to continue an interrupted Pier job and intentionally skip completed trials.
 
 ## What gets captured
 
-Pier writes the first job under `jobs/tracer-bullet-textstats/` and subsequent reruns under
-timestamped sibling directories. Each trial keeps Pier's `result.json`, verifier output, requested
-artifacts, ATIF `trajectory.json`, raw Copilot CLI stdout/JSONL, and native Copilot
-`copilot-session/**/events.jsonl` for AIU/token/session analysis.
+Pier writes concrete executions under `jobs/tracer-bullet-textstats/<run-id>/`. Each trial keeps
+Pier's `result.json`, verifier output, requested artifacts, ATIF `trajectory.json`, raw Copilot CLI
+stdout/JSONL, and native Copilot `copilot-session/**/events.jsonl` for AIU/token/session analysis.
diff --git a/sandbox/README.md b/sandbox/README.md
index 2a2938c..8be5e1c 100644
--- a/sandbox/README.md
+++ b/sandbox/README.md
@@ -6,9 +6,10 @@ Anything under `sandbox/` is gitignored (except this README and `.gitkeep`), so
 safe to generate throwaway experiment repos and runs here.
 
 ```bash
-# scaffold a throwaway experiment repo and dry-run it (no Copilot credits used)
+# scaffold a throwaway experiment repo, validate it, then run it for real
 uv run copilot-experiments init sandbox/demo
 cd sandbox/demo
-uv run copilot-experiments run --dry-run
+uv run copilot-experiments validate
+uv run copilot-experiments run
 uv run copilot-experiments show --last
 ```
diff --git a/src/copilot_experiments/__init__.py b/src/copilot_experiments/__init__.py
index e179a93..dfabde6 100644
--- a/src/copilot_experiments/__init__.py
+++ b/src/copilot_experiments/__init__.py
@@ -1,14 +1,4 @@
-"""copilot-experiments: a library + CLI for GitHub Copilot research experiments.
-
-Public API
-----------
-Author Pier/Harbor task directories and run Pier jobs that include the real
-GitHub Copilot CLI as an installed agent. The legacy Python experiment API is
-still exported for migration and offline tests.
-
-Pier configs can refer to the local Copilot agent import path exported as
-``COPILOT_CLI_AGENT_IMPORT_PATH``.
-"""
+"""copilot-experiments: Pier-first evaluation harness for GitHub Copilot CLI agents."""
 
 from __future__ import annotations
 
@@ -20,58 +10,25 @@
     discover_deepswe_source,
     write_deepswe_job_config,
 )
-from .models import (
-    DryRunCheck,
-    DryRunReport,
-    Experiment,
-    ExperimentRun,
-    LlmCallSummary,
-    Metrics,
-    ProviderConfig,
-    SessionAnalysis,
-    Task,
-    TaskResult,
-    ToolStat,
-    TrialResult,
-    TurnSummary,
-    Variant,
-    VariantResult,
-)
+from .models import LlmCallSummary, Metrics, SessionAnalysis, ToolStat, TurnSummary
 from .pier_backend import COPILOT_CLI_AGENT_IMPORT_PATH, discover_pier_job_configs, run_pier_job
-from .runner import dry_run_experiment, run_experiment
 
 __all__ = [
-    "DryRunCheck",
-    "DryRunReport",
+    "COPILOT_CLI_AGENT_IMPORT_PATH",
     "DeepSweImportError",
     "DeepSweImportResult",
     "DeepSweSource",
-    "Experiment",
-    "ExperimentRun",
     "LlmCallSummary",
     "Metrics",
-    "ProviderConfig",
     "SessionAnalysis",
-    "Task",
-    "TaskResult",
     "ToolStat",
-    "TrialResult",
     "TurnSummary",
-    "Variant",
-    "VariantResult",
-    "COPILOT_CLI_AGENT_IMPORT_PATH",
     "analyze_events",
     "discover_deepswe_source",
     "discover_pier_job_configs",
-    "dry_run_experiment",
     "llm_calls_from_otel",
     "run_pier_job",
-    "run_experiment",
     "write_deepswe_job_config",
-    "run",
 ]
 
-# Convenient alias.
-run = run_experiment
-
 __version__ = "0.2.0"
diff --git a/src/copilot_experiments/_util.py b/src/copilot_experiments/_util.py
index 8e1f9c9..3fe0490 100644
--- a/src/copilot_experiments/_util.py
+++ b/src/copilot_experiments/_util.py
@@ -62,7 +62,7 @@ def force_rmtree(path: Path) -> None:
     tree contains a git workspace: paths under ``.git/objects`` can exceed the
     260-char ``MAX_PATH`` limit, and git marks object/pack files read-only. We
     prepend the ``\\\\?\\`` long-path prefix and, on error, clear the read-only
-    bit and retry, so an ephemeral dry-run can always remove its temp dir.
+    bit and retry, so temporary trees can always be removed.
     """
     if not path.exists():
         return
diff --git a/src/copilot_experiments/auth.py b/src/copilot_experiments/auth.py
index 5a2b610..86e053a 100644
--- a/src/copilot_experiments/auth.py
+++ b/src/copilot_experiments/auth.py
@@ -1,20 +1,16 @@
 """Resolve and preflight the GitHub token used to authenticate Copilot CLI.
 
 Leaving authentication to the ``copilot`` subprocess means a missing token is only
-discovered *after* a workspace has been provisioned and the CLI has spun up -- every
+discovered *after* Pier has prepared a sandbox and the CLI has spun up -- every
 trial then burns time and produces an empty session log. Instead we resolve a token
 *once* before the run starts (failing fast if none is available) and inject it into
 each trial's environment.
 
 Security -- the token must NEVER be leaked:
 
-* The resolved token is only ever placed in a child process's environment at runtime
-  (via :attr:`~copilot_experiments.invoker.Invocation.env_overrides`). It is never
-  written to a stored artifact and never logged -- only its *source* is reported.
-* The names of the variables that carry it (plus any BYOK provider secrets) are passed
-  to ``copilot --secret-env-vars`` so the CLI strips them from shell/MCP environments
-  and redacts their values from its own output: stdout, and the ``--share`` markdown
-  transcript.
+* The resolved token is only ever injected into Pier's Copilot CLI agent
+  environment at runtime. It is never written to a stored artifact and never
+  logged -- only its *source* is reported.
 """
 
 from __future__ import annotations
@@ -28,12 +24,6 @@
 # Token environment variables Copilot itself recognizes, in resolution precedence order.
 GITHUB_TOKEN_ENV_VARS = ("COPILOT_GITHUB_TOKEN", "GH_TOKEN", "GITHUB_TOKEN")
 
-# The variable the resolved token is injected under for the Copilot child process.
-INJECTED_TOKEN_ENV_VAR = "COPILOT_GITHUB_TOKEN"
-
-# Provider (BYOK) environment variables whose values are secrets and must be redacted.
-_PROVIDER_SECRET_ENV_VARS = ("COPILOT_PROVIDER_API_KEY", "COPILOT_PROVIDER_BEARER_TOKEN")
-
 
 class AuthError(RuntimeError):
     """No usable GitHub token could be resolved for the run."""
@@ -106,21 +96,3 @@ def preflight_github_token(env: Mapping[str, str] | None = None) -> TokenResolut
             f"{', '.join(GITHUB_TOKEN_ENV_VARS)}, or run 'gh auth login'."
         )
     return resolution
-
-
-def secret_env_names(variant_env: Mapping[str, str], *, byok_secrets: bool) -> list[str]:
-    """Names whose values Copilot must redact from output and strip from sub-shells.
-
-    Always includes the GitHub token variables (so an injected or inherited token is
-    never echoed). Adds BYOK provider secret variables when the variant uses a provider
-    with secrets, plus any free-form ``variant.env`` keys that look like a secret.
-    """
-    from .models import _SECRET_ENV_HINT
-
-    names: set[str] = set(GITHUB_TOKEN_ENV_VARS)
-    if byok_secrets:
-        names.update(_PROVIDER_SECRET_ENV_VARS)
-    for key in variant_env:
-        if _SECRET_ENV_HINT.search(key):
-            names.add(key)
-    return sorted(names)
diff --git a/src/copilot_experiments/cli.py b/src/copilot_experiments/cli.py
index 4bd797d..75b9d97 100644
--- a/src/copilot_experiments/cli.py
+++ b/src/copilot_experiments/cli.py
@@ -2,12 +2,9 @@
 
 from __future__ import annotations
 
-import importlib.util
 import sys
-from collections.abc import Callable
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Literal
 
 import typer
 from rich.console import Console
@@ -17,11 +14,9 @@
 from .analysis import analyze_events, analyze_trajectory
 from .auth import AuthError, preflight_github_token
 from .deepswe import DeepSweImportError, write_deepswe_job_config
-from .index import list_runs as index_list_runs
-from .index import reindex as index_reindex
-from .models import DryRunReport, Experiment, ExperimentRun
 from .pier_backend import (
     PierBackendPreflightError,
+    PierJobSpec,
     discover_pier_job_configs,
     inject_copilot_token,
     preflight_pier_backend,
@@ -37,20 +32,14 @@
     write_pier_summary,
 )
 from .render import render_session_analysis
-from .runner import dry_run_experiment, run_experiment
 from .scaffold import ScaffoldError, init_experiment_repo
 from .sessionlog import load_events
 from .storage import Layout
 
 
 def _force_utf8_streams() -> None:
-    """Make stdout/stderr UTF-8 so Rich glyphs (e.g. ``✓``) don't crash.
+    """Make stdout/stderr UTF-8 so Rich glyphs do not crash on Windows."""
 
-    On Windows the console and redirected pipes default to a legacy code page
-    (cp1252), which raises ``UnicodeEncodeError`` on non-Latin-1 characters.
-    ``errors="replace"`` is a belt-and-braces fallback for any remaining
-    unencodable glyph.
-    """
     for stream in (sys.stdout, sys.stderr):
         reconfigure = getattr(stream, "reconfigure", None)
         if reconfigure is not None:
@@ -64,7 +53,7 @@ def _force_utf8_streams() -> None:
 
 app = typer.Typer(
     add_completion=False,
-    help="Build and analyze GitHub Copilot research experiments.",
+    help="Create, run, and analyze Pier jobs that evaluate GitHub Copilot CLI agents.",
     no_args_is_help=True,
 )
 console = Console()
@@ -73,83 +62,37 @@ def _force_utf8_streams() -> None:
 
 @dataclass(frozen=True)
 class ResolvedRun:
-    kind: Literal["legacy", "pier"]
     path: Path
     selector: str
 
 
-# --------------------------------------------------------------------------- #
-# Experiment discovery
-# --------------------------------------------------------------------------- #
-def _load_experiments(experiments_dir: Path) -> list[tuple[Path, Experiment]]:
-    """Import every ``*.py`` under ``experiments/`` and collect Experiment objects.
-
-    A module contributes experiments via a module-level ``experiment`` (single),
-    ``experiments`` (iterable), a ``get_experiments()`` function, or any
-    module-level :class:`Experiment` instances.
-    """
-    found: list[tuple[Path, Experiment]] = []
-    if not experiments_dir.is_dir():
-        return found
+@dataclass(frozen=True)
+class ValidationCheck:
+    name: str
+    ok: bool
+    detail: str = ""
 
-    root = experiments_dir.parent.resolve()
-    if str(root) not in sys.path:
-        sys.path.insert(0, str(root))
 
-    for path in sorted(experiments_dir.glob("*.py")):
-        if path.name.startswith("_"):
-            continue
-        module = _import_path(path)
-        candidates: list[object] = []
-        if hasattr(module, "get_experiments"):
-            candidates.extend(list(module.get_experiments()))
-        if hasattr(module, "experiments"):
-            candidates.extend(list(module.experiments))
-        if hasattr(module, "experiment"):
-            candidates.append(module.experiment)
-        if not candidates:
-            candidates = [v for v in vars(module).values() if isinstance(v, Experiment)]
-        seen: set[int] = set()
-        for obj in candidates:
-            if isinstance(obj, Experiment) and id(obj) not in seen:
-                seen.add(id(obj))
-                found.append((path, obj))
-    return found
-
-
-def _import_path(path: Path):
-    name = f"copilot_experiments_user_{path.stem}"
-    spec = importlib.util.spec_from_file_location(name, path)
-    if spec is None or spec.loader is None:
-        raise typer.BadParameter(f"Cannot import experiment module: {path}")
-    module = importlib.util.module_from_spec(spec)
-    sys.modules[name] = module
-    spec.loader.exec_module(module)
-    return module
-
-
-# --------------------------------------------------------------------------- #
-# Commands
-# --------------------------------------------------------------------------- #
 @app.command()
 def init(
-    directory: Path = typer.Argument(..., help="Directory for the new experiment repository."),
-    name: str | None = typer.Option(None, "--name", help="Project name (defaults to dir name)."),
-    force: bool = typer.Option(
-        False, "--force", help="Scaffold even if the directory is not empty."
-    ),
+    directory: Path = typer.Argument(..., help="Directory to create or update."),
+    name: str | None = typer.Option(None, "--name", help="Project/package name."),
+    force: bool = typer.Option(False, "--force", help="Overwrite existing scaffolded files."),
 ) -> None:
-    """Scaffold a new, standalone experiment repository."""
+    """Scaffold a standalone Pier experiment repository."""
+
     try:
-        created = init_experiment_repo(directory, project_name=name, force=force)
+        init_experiment_repo(directory, name=name, force=force)
     except ScaffoldError as exc:
-        err.print(f"[red]error:[/red] {exc}")
+        err.print(f"[red]Scaffold error:[/red] {exc}")
         raise typer.Exit(1) from exc
-    console.print(f"[green]Created {len(created)} files in[/green] {directory}")
-    console.print("\nNext steps:")
+
+    console.print(f"[green]Initialized[/green] Pier experiment repository at {directory}")
+    console.print("Next steps:")
     console.print(f"  cd {directory}")
     console.print("  uv sync")
-    console.print("  uv run copilot-experiments run --dry-run")
+    console.print("  uv run copilot-experiments validate")
+    console.print("  uv run copilot-experiments run")
 
 
 @app.command("deepswe-import")
@@ -250,141 +193,51 @@ def deepswe_import(
         f"[dim]source:[/dim] {result.source.path} "
         f"({source_label}, {result.source.task_count} {task_label})"
     )
-    console.print("[dim]validate:[/dim] uv run copilot-experiments run --dry-run")
+    console.print("[dim]validate:[/dim] uv run copilot-experiments validate")
+
+
+@app.command()
+def validate(
+    name: str | None = typer.Argument(None, help="Only validate this Pier job name or file stem."),
+    root: Path | None = typer.Option(None, "--root", help="Experiment repository root."),
+) -> None:
+    """Validate Pier job configs, local paths, auth, and backend preflight checks."""
+
+    root = Path(root or Path.cwd())
+    specs = _require_pier_specs(root, name=name)
+    checks = _validate_pier_specs(specs)
+    _print_job_config_table(root, specs)
+    _print_validation_checks(checks)
+    if not all(check.ok for check in checks):
+        raise typer.Exit(1)
 
 
 @app.command()
 def run(
-    name: str | None = typer.Argument(None, help="Only run the experiment with this name/slug."),
+    name: str | None = typer.Argument(None, help="Only run this Pier job name or file stem."),
     root: Path | None = typer.Option(None, "--root", help="Experiment repository root."),
-    dry_run: bool = typer.Option(
-        False,
-        "--dry-run",
-        help="Validate the whole pipeline in a throwaway dir and persist nothing.",
-    ),
-    copilot_binary: str = typer.Option("copilot", "--copilot", help="Path to the copilot binary."),
     verbose: bool = typer.Option(
         False,
         "--verbose",
         "-v",
-        help="Enable debug-level Pier output. Legacy experiments also stream Copilot output.",
+        help="Enable debug-level Pier output.",
     ),
     resume: bool = typer.Option(
         False,
         "--resume",
-        help=(
-            "Resume an existing Pier job directory instead of creating a fresh rerun when the "
-            "configured job name already exists."
-        ),
+        help="Resume the latest existing run for the selected Pier job when possible.",
     ),
 ) -> None:
-    """Discover and run experiment(s) defined under ``experiments/``.
-
-    With ``--dry-run`` the full pipeline is exercised with the mock invoker inside a
-    temporary directory, each stage is validated, and everything is deleted again --
-    no run is recorded under ``results/``.
+    """Run Pier job config(s) defined under ``experiments/``."""
 
-    Pier configs create a fresh job directory on rerun when the configured job name
-    already exists. Pass ``--resume`` to opt into Pier's native resume behavior, which
-    skips trials that already completed for the same resolved config.
-    """
     root = Path(root or Path.cwd())
-    layout = Layout(root)
-    pier_specs = discover_pier_job_configs(root, name=name)
-    if pier_specs:
-        if dry_run:
-            table = Table(title="Pier job configs", show_edge=False)
-            table.add_column("job")
-            table.add_column("config")
-            table.add_column("tasks", justify="right")
-            table.add_column("agents", justify="right")
-            for spec in pier_specs:
-                table.add_row(
-                    spec.name,
-                    str(spec.path.relative_to(root)),
-                    str(len(spec.config.tasks) + len(spec.config.datasets)),
-                    str(len(spec.config.agents)),
-                )
-            console.print(table)
-            console.print("[green]Pier config validation OK[/green] [dim]— no job was run[/dim]")
-            raise typer.Exit(0)
-
-        try:
-            for spec in pier_specs:
-                preflight_pier_backend(spec.config)
-        except PierBackendPreflightError as exc:
-            err.print(f"[red]Pier backend preflight failed:[/red] {exc}")
-            raise typer.Exit(1) from exc
-
-        try:
-            auth = preflight_github_token()
-        except AuthError as exc:
-            err.print(f"[red]Authentication error:[/red] {exc}")
-            raise typer.Exit(1) from exc
-        console.print(f"[dim]auth:[/dim] using GitHub token from {auth.source}")
-
-        any_failures = False
-        for spec in pier_specs:
-            prepared = prepare_pier_job_for_run(spec.config, resume=resume)
-            if verbose:
-                prepared.config.debug = True
-            inject_copilot_token(prepared.config, auth.token)
-            console.print(f"[bold]Running Pier job[/bold] {prepared.label}")
-            if prepared.resumed:
-                console.print(f"[dim]resume:[/dim] reusing existing Pier run {prepared.label}")
-            else:
-                console.print(
-                    f"[dim]run:[/dim] writing fresh run to "
-                    f"{Path(prepared.config.jobs_dir) / prepared.run_name}"
-                )
-            try:
-                run_result = run_pier_job(prepared.config)
-            except Exception as exc:
-                err.print(f"[red]Pier job failed:[/red] {type(exc).__name__}: {exc}")
-                any_failures = True
-                continue
-            write_pier_run_manifest(
-                run_result.job_dir,
-                job_name=prepared.requested_name,
-                run_id=prepared.run_name,
-            )
-            summary = write_pier_summary(run_result.job_dir)
-            _print_run_summary(summary)
-            _warn_failed_pier_trials(run_result.job_dir)
-            if summary.get("status") != "completed":
-                any_failures = True
-            console.print(f"[dim]results:[/dim] {run_result.job_dir}\n")
-
-        if any_failures:
-            raise typer.Exit(2)
-        raise typer.Exit(0)
-
-    experiments = _load_experiments(layout.experiments_dir)
-    if not experiments:
-        err.print(f"[yellow]No experiments found in[/yellow] {layout.experiments_dir}")
+    specs = _require_pier_specs(root, name=name)
+    checks = _validate_pier_specs(specs)
+    failed_checks = [check for check in checks if not check.ok]
+    if failed_checks:
+        _print_validation_checks(checks)
         raise typer.Exit(1)
 
-    if name:
-        experiments = [(p, e) for p, e in experiments if name in (e.name, e.slug)]
-        if not experiments:
-            err.print(f"[red]No experiment matched[/red] {name!r}")
-            raise typer.Exit(1)
-
-    if dry_run:
-        all_ok = True
-        for _path, experiment in experiments:
-            console.print(
-                f"[bold]Dry-run[/bold] {experiment.name} "
-                f"({len(experiment.variants)} variant(s)) [dim]— validating plumbing[/dim]"
-            )
-            report = dry_run_experiment(experiment, root=root)
-            _print_dry_run_report(report)
-            all_ok = all_ok and report.ok
-        raise typer.Exit(0 if all_ok else 1)
-
-    # Preflight authentication ONCE so a missing token aborts immediately instead of
-    # failing every trial after provisioning. The token is injected into each trial's
-    # environment; it is never logged (only its source) or persisted.
     try:
         auth = preflight_github_token()
     except AuthError as exc:
@@ -393,29 +246,37 @@ def run(
     console.print(f"[dim]auth:[/dim] using GitHub token from {auth.source}")
 
     any_failures = False
-    for _path, experiment in experiments:
-        console.print(
-            f"[bold]Running[/bold] {experiment.name} ({len(experiment.variants)} variant(s))"
-        )
-        progress = _make_progress() if verbose else None
-        copilot_stream = _make_copilot_stream() if verbose else None
-        run_obj = run_experiment(
-            experiment,
-            root=root,
-            copilot_binary=copilot_binary,
-            github_token=auth.token,
-            progress=progress,
-            copilot_stream=copilot_stream,
+    for spec in specs:
+        prepared = prepare_pier_job_for_run(spec.config, resume=resume)
+        if verbose:
+            prepared.config.debug = True
+        inject_copilot_token(prepared.config, auth.token)
+        console.print(f"[bold]Running Pier job[/bold] {prepared.label}")
+        if prepared.resumed:
+            console.print(f"[dim]resume:[/dim] reusing existing Pier run {prepared.label}")
+        else:
+            console.print(
+                f"[dim]run:[/dim] writing fresh run to "
+                f"{Path(prepared.config.jobs_dir) / prepared.run_name}"
+            )
+        try:
+            run_result = run_pier_job(prepared.config)
+        except Exception as exc:
+            err.print(f"[red]Pier job failed:[/red] {type(exc).__name__}: {exc}")
+            any_failures = True
+            continue
+        write_pier_run_manifest(
+            run_result.job_dir,
+            job_name=prepared.requested_name,
+            run_id=prepared.run_name,
         )
-        summary = read_json(layout.run_dir(experiment.slug, run_obj.run_id) / "summary.json")
+        summary = write_pier_summary(run_result.job_dir)
         _print_run_summary(summary)
-        _warn_failed_trials(layout, experiment, run_obj)
-        if run_obj.status != "completed":
+        _warn_failed_pier_trials(run_result.job_dir)
+        if summary.get("status") != "completed":
             any_failures = True
-        console.print(f"[dim]results:[/dim] {layout.run_dir(experiment.slug, run_obj.run_id)}\n")
+        console.print(f"[dim]results:[/dim] {run_result.job_dir}\n")
 
-    # A distinct exit code (2) lets scripts tell harness/infra trouble apart from a
-    # clean run (0) and usage errors like "no experiments found" (1).
     if any_failures:
         raise typer.Exit(2)
 
@@ -424,71 +285,39 @@ def run(
 def list_cmd(
     root: Path | None = typer.Option(None, "--root", help="Experiment repository root."),
 ) -> None:
-    """List experiment definitions and concrete run selectors."""
+    """List Pier job configs and concrete run selectors."""
+
     root = Path(root or Path.cwd())
-    layout = Layout(root)
-    pier_specs = discover_pier_job_configs(root)
-    if pier_specs:
-        table = Table(title="Pier job configs", show_edge=False)
-        table.add_column("job")
-        table.add_column("config")
-        table.add_column("tasks", justify="right")
-        table.add_column("agents", justify="right")
-        for spec in pier_specs:
-            table.add_row(
-                spec.name,
-                str(spec.path.relative_to(root)),
-                str(len(spec.config.tasks) + len(spec.config.datasets)),
-                str(len(spec.config.agents)),
-            )
-        console.print(table)
-
-    experiments = _load_experiments(layout.experiments_dir)
-    if experiments:
-        table = Table(title="Experiments", show_edge=False)
-        table.add_column("name")
-        table.add_column("slug")
-        table.add_column("variants", justify="right")
-        for _path, exp in experiments:
-            table.add_row(exp.name, exp.slug, str(len(exp.variants)))
-        console.print(table)
-
-    runs = index_list_runs(layout)
-    pier_jobs = layout.iter_pier_jobs()
-    if runs:
-        table = Table(title="Experiment runs")
-        table.add_column("selector")
-        table.add_column("experiment")
-        table.add_column("started")
-        table.add_column("trials", justify="right")
-        table.add_column("success", justify="right")
-        for r in runs:
-            sr = r.get("success_rate")
-            table.add_row(
-                r["run_id"],
-                r["experiment_slug"],
-                (r.get("started_at") or "")[:19],
-                str(r.get("n_trials") or 0),
-                "-" if sr is None else f"{sr * 100:.0f}%",
-            )
-        console.print(table)
+    specs = discover_pier_job_configs(root)
+    if specs:
+        _print_job_config_table(root, specs)
 
-    if not pier_jobs:
-        if not runs:
-            console.print("[dim]No runs yet.[/dim]")
+    layout = Layout(root)
+    runs = layout.iter_pier_jobs()
+    if not runs:
+        console.print("[dim]No runs yet.[/dim]")
         return
+
     table = Table(title="Pier runs")
     table.add_column("selector (job/run)", no_wrap=True)
+    table.add_column("job")
+    table.add_column("run")
     table.add_column("started")
+    table.add_column("agents", justify="right")
+    table.add_column("tasks", justify="right")
     table.add_column("trials", justify="right")
     table.add_column("success", justify="right")
     table.add_column("status")
-    for job_dir in pier_jobs:
+    for job_dir in runs:
         summary = write_pier_summary(job_dir)
         sr = summary.get("overall_success_rate")
         table.add_row(
             str(summary.get("pier_job_id") or pier_job_label(job_dir)),
+            str(summary.get("job") or "-"),
+            str(summary.get("run_id") or "-"),
             (summary.get("started_at") or "")[:19],
+            str(summary.get("n_agents") or 0),
+            str(summary.get("n_tasks") or 0),
             str(summary.get("n_trials") or 0),
             "-" if sr is None else f"{sr * 100:.0f}%",
             str(summary.get("status") or "-"),
@@ -500,27 +329,16 @@ def list_cmd(
 def show(
     selector: str | None = typer.Argument(
         None,
-        help=(
-            "Run selector from `list`: run id/prefix for legacy runs, Pier job for that "
-            "job's latest run, or Pier job/run id."
-        ),
+        help="Pier run selector from `list`: job, run id/prefix, or job/run.",
     ),
-    last: bool = typer.Option(False, "--last", help="Show the most recent stored run."),
+    last: bool = typer.Option(False, "--last", help="Show the most recent stored Pier run."),
     root: Path | None = typer.Option(None, "--root", help="Experiment repository root."),
 ) -> None:
-    """Print a run summary and per-variant comparison."""
-    root = Path(root or Path.cwd())
-    layout = Layout(root)
-    resolved = _resolve_run(layout, last=last, selector=selector)
-    if resolved is None:
-        _print_run_not_found(selector)
-        raise typer.Exit(1)
-    if resolved.kind == "pier":
-        summary = write_pier_summary(resolved.path)
-        _print_run_summary(summary)
-        console.print(f"\n[dim]{resolved.path / 'summary.md'}[/dim]")
-        return
-    _print_run_summary(read_json(resolved.path / "summary.json"))
+    """Print a Pier run summary and per-agent comparison."""
+
+    resolved = _resolve_or_exit(root, selector, last=last)
+    summary = write_pier_summary(resolved.path)
+    _print_run_summary(summary)
     console.print(f"\n[dim]{resolved.path / 'summary.md'}[/dim]")
 
 
@@ -528,124 +346,69 @@ def show(
 def inspect(
     selector: str | None = typer.Argument(
         None,
-        help=(
-            "Run selector from `list`: run id/prefix for legacy runs, Pier job for that "
-            "job's latest run, or Pier job/run id."
-        ),
+        help="Pier run selector from `list`: job, run id/prefix, or job/run.",
     ),
-    variant: str | None = typer.Option(None, "--variant", help="Variant slug."),
-    task: str | None = typer.Option(None, "--task", help="Task slug."),
-    trial: int | None = typer.Option(None, "--trial", help="Trial number."),
-    events: int = typer.Option(20, "--events", help="Number of session events to show."),
-    last: bool = typer.Option(False, "--last", help="Inspect the most recent stored run."),
+    agent: str | None = typer.Option(None, "--agent", help="Agent selector."),
+    task: str | None = typer.Option(None, "--task", help="Task selector."),
+    trial: str | None = typer.Option(None, "--trial", help="Trial number or Pier trial name."),
+    last: bool = typer.Option(False, "--last", help="Inspect the most recent stored Pier run."),
     root: Path | None = typer.Option(None, "--root", help="Experiment repository root."),
 ) -> None:
-    """Drill into a run's variants, tasks, trials, and session events."""
-    root = Path(root or Path.cwd())
-    layout = Layout(root)
-    resolved = _resolve_run(layout, last=last, selector=selector)
-    if resolved is None:
-        _print_run_not_found(selector)
-        raise typer.Exit(1)
-    if resolved.kind == "pier":
-        _inspect_pier_job(resolved.path)
-        return
-    run_dir = resolved.path
-
-    variants_dir = run_dir / "variants"
-    if variant is None:
-        table = Table(title=f"Variants in {run_dir.name}")
-        table.add_column("variant")
-        table.add_column("tasks", justify="right")
-        table.add_column("trials", justify="right")
-        for vdir in sorted(variants_dir.iterdir()):
-            tasks = sorted((vdir / "tasks").glob("*")) if (vdir / "tasks").is_dir() else []
-            n_trials = sum(
-                len(sorted((tk / "trials").glob("*"))) if (tk / "trials").is_dir() else 0
-                for tk in tasks
-            )
-            table.add_row(vdir.name, str(len(tasks)), str(n_trials))
-        console.print(table)
-        return
-
-    tasks_dir = variants_dir / variant / "tasks"
-    if task is None:
-        table = Table(title=f"Tasks in {variant}")
-        table.add_column("task")
-        table.add_column("trials", justify="right")
-        for tkdir in sorted(tasks_dir.iterdir()) if tasks_dir.is_dir() else []:
-            trials = sorted((tkdir / "trials").glob("*")) if (tkdir / "trials").is_dir() else []
-            table.add_row(tkdir.name, str(len(trials)))
-        console.print(table)
-        return
+    """Drill into a Pier run's agents, tasks, and trials."""
 
-    trials_dir = tasks_dir / task / "trials"
-    if trial is None:
-        table = Table(title=f"Trials in {variant}/{task}")
-        table.add_column("trial")
-        table.add_column("status")
-        table.add_column("success")
-        table.add_column("exit")
-        table.add_column("duration (s)", justify="right")
-        for tdir in sorted(trials_dir.iterdir()) if trials_dir.is_dir() else []:
-            meta = read_json(tdir / "meta.json")
-            table.add_row(
-                tdir.name,
-                str(meta.get("status", "-")),
-                str(meta.get("success")),
-                str(meta.get("exit_code")),
-                f"{meta.get('duration_s', 0):.2f}",
-            )
-        console.print(table)
-        return
+    resolved = _resolve_or_exit(root, selector, last=last)
+    summary = write_pier_summary(resolved.path)
+    console.print(f"[bold]Pier run[/bold]: {pier_job_label(resolved.path)}")
+    console.print(f"[bold]summary[/bold]: {resolved.path / 'summary.json'}")
 
-    tdir = trials_dir / f"{trial:03d}"
-    if not tdir.is_dir():
-        err.print(f"[red]Trial not found:[/red] {tdir}")
+    rows = _matching_trial_rows(resolved.path, agent=agent, task=task, trial=trial)
+    if not rows:
+        err.print("[red]No matching Pier trials.[/red]")
+        _print_trial_filter_hint()
         raise typer.Exit(1)
-    console.print(f"[bold]meta[/bold]: {read_json(tdir / 'meta.json')}")
-    meta = read_json(tdir / "meta.json")
-    if meta.get("status") and meta["status"] != "ok":
-        artifact = meta.get("error_artifact") or "stdout.txt"
-        console.print(
-            f"[yellow]status[/yellow]: {meta['status']} — {meta.get('error') or ''}\n"
-            f"  -> {tdir / artifact}"
+
+    _print_trials_table(rows, title=f"Trials in {pier_job_label(resolved.path)}")
+    if len(rows) == 1:
+        row = rows[0]
+        console.print(f"\n[bold]selected[/bold]: {row['trial_dir']}")
+        console.print(f"[bold]agent[/bold]: {row['agent']}")
+        console.print(f"[bold]task[/bold]: {row['task']}")
+        console.print(f"[bold]result[/bold]: {resolved.path / row['trial_dir'] / 'result.json'}")
+        source_path, _label, source_kind, _otel_path = resolve_pier_trial_analysis_source(
+            resolved.path, row["trial_dir"]
         )
-    console.print(f"[bold]metrics[/bold]: {read_json(tdir / 'metrics.json')}")
-    if (tdir / "verify.json").exists():
-        verify = read_json(tdir / "verify.json")
+        if source_path is not None:
+            console.print(f"[bold]analysis source[/bold]: {source_kind} · {source_path}")
+    elif agent or task or trial:
         console.print(
-            f"[bold]verify[/bold]: exit={verify['exit_code']} success={verify['success']}"
+            "\n[yellow]Multiple trials match.[/yellow] Add more filters, for example "
+            "`--agent`, `--task`, and `--trial`."
         )
-    evs = load_events(tdir / "events.jsonl")
-    console.print(f"\n[bold]events[/bold] (showing up to {events} of {len(evs)}):")
-    for ev in evs[:events]:
-        console.print(f"  {ev.get('timestamp', '')[:23]:23}  {ev.get('type')}")
+    else:
+        _print_run_summary(summary)
 
 
 @app.command()
 def analyze(
     selector: str | None = typer.Argument(
         None,
-        help=(
-            "Run selector from `list`: run id/prefix for legacy runs, Pier job for that "
-            "job's latest run, or Pier job/run id."
-        ),
+        help="Pier run selector from `list`: job, run id/prefix, or job/run.",
     ),
-    variant: str | None = typer.Option(None, "--variant", help="Variant slug (default: first)."),
-    task: str | None = typer.Option(None, "--task", help="Task slug (default: first)."),
-    trial: int | None = typer.Option(None, "--trial", help="Trial number (default: first)."),
+    agent: str | None = typer.Option(None, "--agent", help="Agent selector."),
+    task: str | None = typer.Option(None, "--task", help="Task selector."),
+    trial: str | None = typer.Option(None, "--trial", help="Trial number or Pier trial name."),
     file: Path | None = typer.Option(
-        None, "--file", help="Analyze an events.jsonl file directly (ignores run/variant/trial)."
+        None, "--file", help="Analyze an events.jsonl file directly (ignores run filters)."
     ),
     otel_file: Path | None = typer.Option(
         None, "--otel-file", help="Optional Copilot OTel JSONL file to enrich analysis."
     ),
-    last: bool = typer.Option(False, "--last", help="Analyze the most recent stored run."),
+    last: bool = typer.Option(False, "--last", help="Analyze the most recent stored Pier run."),
     max_turns: int = typer.Option(0, "--max-turns", help="Limit timeline rows (0 = all)."),
     root: Path | None = typer.Option(None, "--root", help="Experiment repository root."),
 ) -> None:
-    """Analyze a captured session log and render a rich overview of what happened."""
+    """Analyze a captured Copilot CLI session from a Pier trial."""
+
     if file is not None:
         events = load_events(file)
         if not events:
@@ -657,331 +420,275 @@ def analyze(
         )
         return
 
-    root = Path(root or Path.cwd())
-    layout = Layout(root)
-    resolved = _resolve_run(layout, last=last, selector=selector)
-    if resolved is None:
-        _print_run_not_found(selector, file_hint=True)
+    resolved = _resolve_or_exit(root, selector, last=last, file_hint=True)
+    rows = _matching_trial_rows(resolved.path, agent=agent, task=task, trial=trial)
+    if not rows:
+        err.print("[red]No matching Pier trials.[/red]")
+        _print_trial_filter_hint()
+        raise typer.Exit(1)
+    if len(rows) > 1:
+        err.print("[red]Multiple Pier trials match.[/red]")
+        _print_trials_table(rows, title="Matching trials")
+        err.print("[dim]Add --agent, --task, and/or --trial to select exactly one trial.[/dim]")
         raise typer.Exit(1)
-    if resolved.kind == "pier":
-        source_path, label, source_kind, discovered_otel = resolve_pier_trial_analysis_source(
-            resolved.path, trial
-        )
-        if source_path is None:
-            err.print(f"[red]No Copilot session log or trajectory found in[/red] {resolved.path}")
-            diagnostic = describe_missing_pier_analysis_source(resolved.path, trial)
-            if diagnostic:
-                err.print(f"[yellow]{diagnostic}[/yellow]")
-            raise typer.Exit(1)
-        selected_otel = otel_file or discovered_otel
-        analysis = (
-            analyze_events(
-                load_events(source_path),
-                load_events(selected_otel) if selected_otel is not None else None,
-            )
-            if source_kind == "events"
-            else analyze_trajectory(read_json(source_path))
-        )
-        render_session_analysis(analysis, console, title=label, max_turns=max_turns)
-        return
-    run_dir = resolved.path
 
-    events_path, label, discovered_otel = _resolve_trial_events(run_dir, variant, task, trial)
-    if events_path is None:
-        err.print(f"[red]No trial session log found in[/red] {run_dir}")
+    row = rows[0]
+    source_path, label, source_kind, discovered_otel = resolve_pier_trial_analysis_source(
+        resolved.path, row["trial_dir"]
+    )
+    if source_path is None:
+        err.print(f"[red]No Copilot session log or trajectory found in[/red] {resolved.path}")
+        diagnostic = describe_missing_pier_analysis_source(resolved.path, row["trial_dir"])
+        if diagnostic:
+            err.print(f"[yellow]{diagnostic}[/yellow]")
         raise typer.Exit(1)
 
     selected_otel = otel_file or discovered_otel
-    render_session_analysis(
+    analysis = (
         analyze_events(
-            load_events(events_path),
+            load_events(source_path),
             load_events(selected_otel) if selected_otel is not None else None,
-        ),
-        console,
-        title=label,
-        max_turns=max_turns,
+        )
+        if source_kind == "events"
+        else analyze_trajectory(read_json(source_path))
     )
+    render_session_analysis(analysis, console, title=label, max_turns=max_turns)
+
+
+def _require_pier_specs(root: Path, *, name: str | None = None) -> list[PierJobSpec]:
+    specs = discover_pier_job_configs(root, name=name)
+    if specs:
+        return specs
+    target = f" matching {name!r}" if name else ""
+    err.print(f"[red]No Pier job configs{target} found in[/red] {root / 'experiments'}")
+    err.print("[dim]Create one with `copilot-experiments init` or `deepswe-import`.[/dim]")
+    raise typer.Exit(1)
+
+
+def _validate_pier_specs(specs: list[PierJobSpec]) -> list[ValidationCheck]:
+    checks: list[ValidationCheck] = []
+    for spec in specs:
+        prefix = spec.name
+        task_count = len(spec.config.tasks) + len(spec.config.datasets)
+        agent_count = len(spec.config.agents)
+        checks.append(
+            ValidationCheck(
+                f"{prefix}: agents",
+                agent_count > 0,
+                f"{agent_count} configured" if agent_count else "no agents configured",
+            )
+        )
+        checks.append(
+            ValidationCheck(
+                f"{prefix}: tasks",
+                task_count > 0,
+                f"{task_count} configured" if task_count else "no tasks or datasets configured",
+            )
+        )
+        for path in _local_task_paths(spec):
+            checks.append(
+                ValidationCheck(
+                    f"{prefix}: path {path.name}",
+                    path.exists(),
+                    str(path) if path.exists() else f"missing: {path}",
+                )
+            )
+        try:
+            preflight_pier_backend(spec.config)
+        except PierBackendPreflightError as exc:
+            checks.append(ValidationCheck(f"{prefix}: backend", False, str(exc)))
+        else:
+            checks.append(ValidationCheck(f"{prefix}: backend", True, "preflight OK"))
 
+    if all(check.ok for check in checks):
+        try:
+            auth = preflight_github_token()
+        except AuthError as exc:
+            checks.append(ValidationCheck("auth", False, str(exc)))
+        else:
+            checks.append(ValidationCheck("auth", True, f"using {auth.source}"))
+    return checks
+
+
+def _local_task_paths(spec: PierJobSpec) -> list[Path]:
+    paths: list[Path] = []
+    for item in [*spec.config.tasks, *spec.config.datasets]:
+        path = getattr(item, "path", None)
+        if path is not None:
+            paths.append(Path(path))
+    return paths
+
+
+def _print_job_config_table(root: Path, specs: list[PierJobSpec]) -> None:
+    table = Table(title="Pier job configs", show_edge=False)
+    table.add_column("job")
+    table.add_column("config")
+    table.add_column("tasks", justify="right")
+    table.add_column("agents", justify="right")
+    for spec in specs:
+        table.add_row(
+            spec.name,
+            str(spec.path.relative_to(root)) if spec.path.is_relative_to(root) else str(spec.path),
+            str(len(spec.config.tasks) + len(spec.config.datasets)),
+            str(len(spec.config.agents)),
+        )
+    console.print(table)
 
-@app.command()
-def reindex(
-    root: Path | None = typer.Option(None, "--root", help="Experiment repository root."),
-) -> None:
-    """Rebuild ``results/index.db`` by scanning the filesystem."""
+
+def _print_validation_checks(checks: list[ValidationCheck]) -> None:
+    table = Table(title="Validation")
+    table.add_column("")
+    table.add_column("check")
+    table.add_column("detail", style="dim")
+    for check in checks:
+        mark = "[green]✓[/green]" if check.ok else "[red]✗[/red]"
+        table.add_row(mark, check.name, check.detail)
+    console.print(table)
+
+
+def _resolve_or_exit(
+    root: Path | None,
+    selector: str | None,
+    *,
+    last: bool,
+    file_hint: bool = False,
+) -> ResolvedRun:
     root = Path(root or Path.cwd())
     layout = Layout(root)
-    count = index_reindex(layout)
-    console.print(f"[green]Reindexed {count} run(s)[/green] -> {layout.index_db}")
-
-
-# --------------------------------------------------------------------------- #
-# Helpers
-# --------------------------------------------------------------------------- #
-def _resolve_trial_events(
-    run_dir: Path, variant: str | None, task: str | None, trial: int | None
-) -> tuple[Path | None, str, Path | None]:
-    """Locate a trial's ``events.jsonl``, defaulting to the first variant/task/trial."""
-    variants_dir = run_dir / "variants"
-    if variant is not None:
-        vdir = variants_dir / variant
-    else:
-        subdirs = (
-            sorted(p for p in variants_dir.iterdir() if p.is_dir()) if variants_dir.is_dir() else []
-        )
-        if not subdirs:
-            return None, run_dir.name, None
-        vdir = subdirs[0]
-
-    tasks_dir = vdir / "tasks"
-    if task is not None:
-        tkdir = tasks_dir / task
-    else:
-        subdirs = sorted(p for p in tasks_dir.iterdir() if p.is_dir()) if tasks_dir.is_dir() else []
-        if not subdirs:
-            return None, f"{run_dir.name} · {vdir.name}", None
-        tkdir = subdirs[0]
-
-    trials_dir = tkdir / "trials"
-    if trial is not None:
-        tdir = trials_dir / f"{trial:03d}"
-    else:
-        subdirs = (
-            sorted(p for p in trials_dir.iterdir() if p.is_dir()) if trials_dir.is_dir() else []
-        )
-        if not subdirs:
-            return None, f"{run_dir.name} · {vdir.name}/{tkdir.name}", None
-        tdir = subdirs[0]
-
-    label = f"{run_dir.name} · {vdir.name}/{tkdir.name}/{tdir.name}"
-    events_path = tdir / "events.jsonl"
-    otel_path = tdir / "copilot-otel.jsonl"
-    return (
-        events_path if events_path.exists() else None,
-        label,
-        otel_path if otel_path.exists() else None,
-    )
+    resolved = _resolve_run(layout, last=last, selector=selector)
+    if resolved is None:
+        _print_run_not_found(selector, file_hint=file_hint)
+        raise typer.Exit(1)
+    return resolved
 
 
 def _resolve_run(layout: Layout, *, last: bool, selector: str | None) -> ResolvedRun | None:
     if last:
-        return _latest_resolved_run(layout)
+        latest = layout.latest_pier_job()
+        return ResolvedRun(latest, pier_job_label(latest)) if latest else None
     if selector is None:
         return None
-
-    legacy = layout.find_run(selector)
-    pier = layout.find_pier_job(selector)
-    if pier is not None and (legacy is None or "/" in selector):
-        return ResolvedRun("pier", pier, pier_job_label(pier))
-    if legacy is not None:
-        return ResolvedRun("legacy", legacy, legacy.name)
-    if pier is not None:
-        return ResolvedRun("pier", pier, pier_job_label(pier))
-    return None
-
-
-def _latest_resolved_run(layout: Layout) -> ResolvedRun | None:
-    candidates: list[tuple[str, str, ResolvedRun]] = []
-    for _experiment_slug, run_id, run_dir in layout.iter_runs():
-        candidates.append(
-            (_legacy_run_started_at(run_dir), run_id, ResolvedRun("legacy", run_dir, run_id))
-        )
-    for job_dir in layout.iter_pier_jobs():
-        selector = pier_job_label(job_dir)
-        candidates.append(
-            (_pier_run_started_at(job_dir), selector, ResolvedRun("pier", job_dir, selector))
-        )
-    if not candidates:
-        return None
-    return max(candidates, key=lambda candidate: (candidate[0], candidate[1]))[2]
-
-
-def _legacy_run_started_at(run_dir: Path) -> str:
-    summary_path = run_dir / "summary.json"
-    run_path = run_dir / "run.json"
-    if summary_path.exists():
-        return str(read_json(summary_path).get("started_at") or "")
-    if run_path.exists():
-        return str(read_json(run_path).get("started_at") or "")
-    return ""
-
-
-def _pier_run_started_at(job_dir: Path) -> str:
-    result_path = job_dir / "result.json"
-    if result_path.exists():
-        return str(read_json(result_path).get("started_at") or "")
-    return ""
+    run = layout.find_pier_job(selector)
+    return ResolvedRun(run, pier_job_label(run)) if run else None
 
 
 def _print_run_not_found(selector: str | None, *, file_hint: bool = False) -> None:
     if selector:
-        err.print(f"[red]Run not found:[/red] {selector!r}")
+        err.print(f"[red]Pier run not found:[/red] {selector!r}")
     else:
-        err.print("[red]Run not found.[/red] Pass a run selector or --last.")
+        err.print("[red]Pier run not found.[/red] Pass a run selector or --last.")
     hints = [
         "Use `copilot-experiments list` to copy a selector.",
-        "Pier selectors look like `job-name/run-id`; `job-name` selects that job's latest run.",
+        "Selectors look like `job-name/run-id`; `job-name` selects that job's latest run.",
     ]
     if file_hint:
         hints.append("Use `--file path/to/events.jsonl` to analyze a session log directly.")
     err.print("[dim]" + " ".join(hints) + "[/dim]")
 
 
-def _print_dry_run_report(report: DryRunReport) -> None:
-    table = Table(title=f"Dry-run · {report.experiment}", show_lines=False)
-    table.add_column("", justify="center", width=3)
-    table.add_column("check")
-    table.add_column("detail", style="dim")
-    for c in report.checks:
-        mark = "[green]✓[/green]" if c.ok else "[red]✗[/red]"
-        table.add_row(mark, c.name, c.detail)
-    console.print(table)
-    tail = "[dim]— nothing persisted (temp dir removed)[/dim]\n"
-    if report.ok:
-        console.print(f"[green]plumbing OK[/green] {tail}")
-    else:
-        console.print(f"[red]plumbing FAILED[/red] {tail}")
-
-
-def _make_progress() -> Callable[[str], None]:
-    """Return a progress sink for ``run --verbose``.
-
-    Each line is printed dimmed. ``markup=False`` keeps Copilot's raw output and the
-    ``[variant/NNN]`` phase tags from being interpreted as Rich markup.
-    """
-
-    def _emit(msg: str) -> None:
-        console.print(msg, style="dim", markup=False, highlight=False)
-
-    return _emit
-
+def _matching_trial_rows(
+    job_dir: Path,
+    *,
+    agent: str | None = None,
+    task: str | None = None,
+    trial: str | None = None,
+) -> list[dict]:
+    rows = iter_pier_trial_summaries(job_dir)
+    filtered = []
+    for index, row in enumerate(rows, start=1):
+        if agent and not _matches_agent(row, agent):
+            continue
+        if task and task not in {row.get("task"), row.get("task_name")}:
+            continue
+        if trial and not _matches_trial(
+            row,
+            trial,
+            overall_index=index,
+            filtered=bool(agent or task),
+        ):
+            continue
+        filtered.append(row)
+    return filtered
 
-def _make_copilot_stream() -> Callable[[str], None]:
-    """Return a live Copilot-output sink for ``run --verbose``.
 
-    Copilot's ``--output-format json`` stream is a firehose of JSON events; a stateful
-    :class:`~copilot_experiments.render.LiveEventFormatter` condenses each into a short,
-    ASCII-tagged line (turns, messages, tool calls). Unparseable lines fall back to raw
-    text; pure-noise events are dropped. Output is indented under the phase messages.
-    """
-    from .render import LiveEventFormatter
+def _matches_agent(row: dict, selector: str) -> bool:
+    candidates = {str(row.get("agent") or ""), str(row.get("agent_name") or "")}
+    return selector in candidates or any(candidate.startswith(selector) for candidate in candidates)
 
-    formatter = LiveEventFormatter()
 
-    def _emit(line: str) -> None:
-        rendered = formatter.format(line)
-        if rendered is not None:
-            console.print(f"    {rendered}", style="dim", markup=False, highlight=False)
+def _matches_trial(row: dict, selector: str, *, overall_index: int, filtered: bool) -> bool:
+    if selector.isdigit():
+        number = int(selector)
+        if filtered:
+            return row.get("trial_no") == number
+        return overall_index == number
+    return selector in {str(row.get("trial_dir") or ""), str(row.get("trial_name") or "")}
 
-    return _emit
 
+def _print_trials_table(rows: list[dict], *, title: str) -> None:
+    table = Table(title=title)
+    table.add_column("trial")
+    table.add_column("agent")
+    table.add_column("task")
+    table.add_column("attempt", justify="right")
+    table.add_column("status")
+    table.add_column("success")
+    table.add_column("analysis")
+    for row in rows:
+        table.add_row(
+            str(row.get("trial_dir") or row.get("trial_name") or "-"),
+            str(row.get("agent") or "-"),
+            str(row.get("task") or "-"),
+            str(row.get("trial_no") or "-"),
+            str(row.get("status") or "-"),
+            _yes_no(row.get("success")),
+            "yes" if row.get("metrics") else "-",
+        )
+    console.print(table)
 
-def _warn_failed_trials(layout: Layout, experiment: Experiment, run: ExperimentRun) -> None:
-    """Loudly flag trials that did not run cleanly, with a pointer to diagnose.
 
-    The summary table still renders a row for a Copilot invocation that errored out
-    immediately (e.g. bad auth or a bad working directory) -- just with zero turns.
-    That makes a broken run look deceptively clean. We surface harness/infra failures
-    explicitly, classify them (harness vs copilot), and point at the exact artifact to
-    inspect (its ``stdout.txt``).
-    """
-    problems: list[str] = []
-    for vr in run.variants:
-        for tr in vr.tasks:
-            for trial in tr.trials:
-                if not trial.failed:
-                    continue
-                trial_dir = layout.trial_dir(
-                    experiment.slug, run.run_id, vr.variant.slug, tr.task_slug, trial.trial_no
-                )
-                label = (
-                    "harness failure" if trial.status == "harness_error" else "copilot did not run"
-                )
-                detail = trial.error or trial.status
-                artifact = trial.error_artifact or "stdout.txt"
-                problems.append(
-                    f"  {vr.variant.slug}/{tr.task_slug}/{trial.trial_no:03d}: "
-                    f"{label} — {detail}\n"
-                    f"      -> {trial_dir / artifact}"
-                )
-    if not problems:
-        return
+def _print_trial_filter_hint() -> None:
     err.print(
-        f"[yellow]Warning:[/yellow] run status [bold]{run.status}[/bold] — "
-        f"{len(problems)} trial(s) failed in the harness (not the experiment). "
-        "Inspect the captured output:"
+        "[dim]Use `copilot-experiments inspect <job/run>` to see agents, tasks, "
+        "and trial names.[/dim]"
     )
-    for line in problems:
-        err.print(f"[yellow]{line}[/yellow]")
 
 
 def _warn_failed_pier_trials(job_dir: Path) -> None:
-    """Point Pier harness failures at the trial result artifact."""
-
-    problems: list[str] = []
-    for trial in iter_pier_trial_summaries(job_dir):
-        if trial.get("status") == "ok":
-            continue
-        trial_name = str(trial.get("trial_name") or trial.get("trial_no") or "-")
-        problems.append(
-            f"  {trial_name}: harness failure — {trial.get('error') or trial.get('status')}\n"
-            f"      -> {job_dir / trial_name / 'result.json'}"
-        )
-    if not problems:
+    failed = [row for row in iter_pier_trial_summaries(job_dir) if row.get("status") != "ok"]
+    if not failed:
         return
-    err.print(
-        f"[yellow]Warning:[/yellow] Pier job [bold]{pier_job_label(job_dir)}[/bold] had "
-        f"{len(problems)} harness failure(s). Inspect the captured trial result:"
-    )
-    for line in problems:
-        err.print(f"[yellow]{line}[/yellow]")
-
-
-def _inspect_pier_job(job_dir: Path) -> None:
-    summary = write_pier_summary(job_dir)
-    console.print(f"[bold]Pier job[/bold]: {pier_job_label(job_dir)}")
-    console.print(f"[bold]summary[/bold]: {job_dir / 'summary.json'}")
-    _print_run_summary(summary)
-
-    table = Table(title=f"Trials in {pier_job_label(job_dir)}")
+    table = Table(title="Failed Pier trials")
     table.add_column("trial")
+    table.add_column("agent")
+    table.add_column("task")
     table.add_column("status")
-    table.add_column("success")
-    table.add_column("analysis")
-    for trial_dir in sorted(
-        path for path in job_dir.iterdir() if path.is_dir() and (path / "result.json").exists()
-    ):
-        result = read_json(trial_dir / "result.json")
-        exception = result.get("exception_info")
-        rewards = (result.get("verifier_result") or {}).get("rewards") or {}
-        success = "-"
-        if rewards:
-            success = "yes" if any(float(value) > 0 for value in rewards.values()) else "no"
-        source_path, _label, source_kind, _otel_path = resolve_pier_trial_analysis_source(
-            job_dir, trial_dir.name
-        )
+    table.add_column("error")
+    for row in failed:
         table.add_row(
-            trial_dir.name,
-            "harness_error" if exception else "ok",
-            success,
-            source_kind or ("yes" if source_path else "no"),
+            str(row.get("trial_name") or "-"),
+            str(row.get("agent") or "-"),
+            str(row.get("task") or "-"),
+            str(row.get("status") or "-"),
+            str(row.get("error") or "-"),
         )
     console.print(table)
 
 
 def _print_run_summary(summary: dict) -> None:
-    sr = summary.get("overall_success_rate")
-    n_tasks = summary.get("n_tasks", 1)
-    multitask = n_tasks > 1
-    title = (
-        f"{summary['experiment']}  ·  {summary['run_id']}  ·  "
-        f"{n_tasks} task(s) · {summary['n_trials']} trial(s)  ·  "
-        f"success {'-' if sr is None else f'{sr * 100:.0f}%'}"
+    console.print(
+        f"[bold]{summary['job']}[/bold] · run [cyan]{summary['run_id']}[/cyan] · "
+        f"status={summary.get('status', '-')}"
     )
-    table = Table(title=title)
-    table.add_column("variant")
+    console.print(
+        f"agents={summary['n_agents']} tasks={summary.get('n_tasks', 0)} "
+        f"trials={summary['n_trials']} success={_pct(summary.get('overall_success_rate'))}"
+    )
+    multitask = summary.get("n_tasks", 0) > 1
+    table = Table(title="Agents")
+    table.add_column("agent")
     table.add_column("model")
     table.add_column("effort")
-    table.add_column("byok")
     if multitask:
         table.add_column("tasks", justify="right")
     table.add_column("trials", justify="right")
@@ -996,46 +703,43 @@ def _print_run_summary(summary: dict) -> None:
     table.add_column("tokens", justify="right")
     table.add_column("AIU", justify="right")
     table.add_column("AIU/solve", justify="right")
-    for v in summary["variants"]:
-        vsr = v.get("success_rate")
-        ms = v.get("mean_resolved_rate")
-        rk = v.get("resolved_at_k_rate")
+    for agent in summary["agents"]:
         row = [
-            v["name"],
-            v.get("model") or "-",
-            v.get("reasoning_effort") or "-",
-            "yes" if v.get("byok") else "no",
+            agent["name"],
+            agent.get("model") or "-",
+            agent.get("reasoning_effort") or "-",
         ]
         if multitask:
-            row.append(str(v.get("n_tasks", "-")))
-        row.append(str(v["n_trials"]))
-        row.append("-" if vsr is None else f"{vsr * 100:.0f}%")
+            row.append(str(agent.get("n_tasks", "-")))
+        row.append(str(agent["n_trials"]))
+        row.append(_pct(agent.get("success_rate")))
         if multitask:
-            row.append("-" if ms is None else f"{ms * 100:.0f}%")
-            row.append("-" if rk is None else f"{rk * 100:.0f}%")
+            row.append(_pct(agent.get("mean_resolved_rate")))
+            row.append(_pct(agent.get("resolved_at_k_rate")))
         row += [
-            _num(v.get("avg_duration_s")),
-            _num(v.get("avg_turns")),
-            _num(v.get("avg_tool_calls")),
-            _num(v.get("avg_tool_failures")),
-            _num(v.get("avg_total_tokens")),
-            _aiu(v.get("avg_aiu")),
-            _aiu(v.get("aiu_per_solve")),
+            _num(agent.get("avg_duration_s")),
+            _num(agent.get("avg_turns")),
+            _num(agent.get("avg_tool_calls")),
+            _num(agent.get("avg_tool_failures")),
+            _num(agent.get("avg_total_tokens")),
+            _aiu(agent.get("avg_aiu")),
+            _aiu(agent.get("aiu_per_solve")),
         ]
         table.add_row(*row)
+    console.print(table)
     total_aiu = summary.get("total_aiu")
     if total_aiu is not None:
-        console.print(table)
         console.print(f"[dim]total cost:[/dim] {_aiu(total_aiu)} AIU")
-        return
-    console.print(table)
 
 
-def _aiu(value: object) -> str:
+def _yes_no(value: object) -> str:
     if value is None:
         return "-"
-    val = float(value)
-    return f"{val:.3f}" if val < 1 else f"{val:,.2f}"
+    return "yes" if value else "no"
+
+
+def _pct(value: float | None) -> str:
+    return "-" if value is None else f"{value * 100:.0f}%"
 
 
 def _num(value: object) -> str:
@@ -1046,5 +750,13 @@ def _num(value: object) -> str:
     return str(value)
 
 
-if __name__ == "__main__":
+def _aiu(value: object) -> str:
+    """Format an AIU amount: ``-`` for None, 3 decimals below 1, else ``1,234.56`` style."""
+    if value is None:
+        return "-"
+    amount = float(value)  # convert once; reused for the threshold test and the formatting
+    return f"{amount:.3f}" if amount < 1 else f"{amount:,.2f}"
+
+
+if __name__ == "__main__":  # pragma: no cover
     app()
diff --git a/src/copilot_experiments/index.py b/src/copilot_experiments/index.py
deleted file mode 100644
index 16417b2..0000000
--- a/src/copilot_experiments/index.py
+++ /dev/null
@@ -1,356 +0,0 @@
-"""SQLite index over the ``results/`` filesystem for cross-run queries.
-
-The filesystem is the source of truth; the database is a derived, rebuildable
-index. :func:`reindex` drops and rebuilds it by scanning ``results/``.
-"""
-
-from __future__ import annotations
-
-import json
-import sqlite3
-from pathlib import Path
-
-from ._util import read_json
-from .pier_results import build_pier_summary, iter_pier_trial_summaries, pier_job_identity
-from .storage import Layout
-
-SCHEMA = """
-CREATE TABLE IF NOT EXISTS experiments (
-    slug         TEXT PRIMARY KEY,
-    name         TEXT,
-    description  TEXT,
-    first_seen   TEXT
-);
-CREATE TABLE IF NOT EXISTS runs (
-    run_id           TEXT PRIMARY KEY,
-    experiment_slug  TEXT,
-    started_at       TEXT,
-    finished_at      TEXT,
-    git_base         TEXT,
-    n_variants       INTEGER,
-    status           TEXT
-);
-CREATE TABLE IF NOT EXISTS variants (
-    id            INTEGER PRIMARY KEY AUTOINCREMENT,
-    run_id        TEXT,
-    variant_slug  TEXT,
-    model         TEXT,
-    reasoning_effort TEXT,
-    agent         TEXT,
-    mode          TEXT,
-    byok          INTEGER,
-    params_json   TEXT
-);
-CREATE TABLE IF NOT EXISTS tasks (
-    id            INTEGER PRIMARY KEY AUTOINCREMENT,
-    run_id        TEXT,
-    variant_slug  TEXT,
-    task_slug     TEXT,
-    task_name     TEXT,
-    n_trials      INTEGER,
-    success_rate  REAL,
-    resolved      INTEGER
-);
-CREATE TABLE IF NOT EXISTS trials (
-    id              INTEGER PRIMARY KEY AUTOINCREMENT,
-    run_id          TEXT,
-    variant_slug    TEXT,
-    task_slug       TEXT,
-    trial_no        INTEGER,
-    session_id      TEXT,
-    exit_code       INTEGER,
-    duration_s      REAL,
-    success         INTEGER,
-    n_turns         INTEGER,
-    n_tool_calls    INTEGER,
-    n_tool_failures INTEGER,
-    input_tokens    INTEGER,
-    output_tokens   INTEGER,
-    total_tokens    INTEGER,
-    cache_read_tokens     INTEGER,
-    cache_write_tokens    INTEGER,
-    input_tokens_noncached INTEGER,
-    reasoning_tokens      INTEGER,
-    aiu             REAL,
-    api_duration_ms INTEGER,
-    n_requests      INTEGER,
-    peak_context_tokens   INTEGER,
-    n_compactions   INTEGER,
-    n_truncations   INTEGER,
-    files_modified  INTEGER,
-    lines_added     INTEGER,
-    lines_removed   INTEGER,
-    model           TEXT,
-    status          TEXT,
-    error           TEXT
-);
-CREATE TABLE IF NOT EXISTS pier_jobs (
-    id             TEXT PRIMARY KEY,
-    job_name       TEXT,
-    run_id         TEXT,
-    job_dir        TEXT,
-    started_at     TEXT,
-    finished_at    TEXT,
-    n_trials       INTEGER,
-    success_rate   REAL,
-    status         TEXT
-);
-CREATE TABLE IF NOT EXISTS pier_trials (
-    id             INTEGER PRIMARY KEY AUTOINCREMENT,
-    job_id         TEXT,
-    job_name       TEXT,
-    run_id         TEXT,
-    variant_slug   TEXT,
-    task_slug      TEXT,
-    trial_name     TEXT,
-    success        INTEGER,
-    status         TEXT,
-    n_turns        INTEGER,
-    n_tool_calls   INTEGER,
-    total_tokens   INTEGER,
-    aiu            REAL,
-    model          TEXT,
-    error          TEXT
-);
-"""
-
-# Columns added after the initial schema. ``connect`` ALTERs any that a pre-existing
-# index.db is missing (the index is a derived cache, but this avoids a forced reindex).
-_TRIAL_MIGRATIONS = {
-    "status": "ALTER TABLE trials ADD COLUMN status TEXT",
-    "error": "ALTER TABLE trials ADD COLUMN error TEXT",
-}
-
-
-def _migrate(conn: sqlite3.Connection) -> None:
-    existing = {row["name"] for row in conn.execute("PRAGMA table_info(trials)")}
-    for column, ddl in _TRIAL_MIGRATIONS.items():
-        if column not in existing:
-            conn.execute(ddl)
-
-    pier_job_columns = {row["name"] for row in conn.execute("PRAGMA table_info(pier_jobs)")}
-    pier_trial_columns = {row["name"] for row in conn.execute("PRAGMA table_info(pier_trials)")}
-    if (
-        pier_job_columns
-        and {"id", "run_id"} - pier_job_columns
-        or pier_trial_columns
-        and {"job_id", "run_id"} - pier_trial_columns
-    ):
-        conn.execute("DROP TABLE IF EXISTS pier_trials")
-        conn.execute("DROP TABLE IF EXISTS pier_jobs")
-        conn.executescript(SCHEMA)
-
-
-def connect(db_path: Path) -> sqlite3.Connection:
-    db_path.parent.mkdir(parents=True, exist_ok=True)
-    conn = sqlite3.connect(str(db_path))
-    conn.row_factory = sqlite3.Row
-    conn.executescript(SCHEMA)
-    _migrate(conn)
-    return conn
-
-
-def index_run_dir(conn: sqlite3.Connection, run_dir: Path) -> None:
-    """Insert (or replace) one stored run into the index."""
-    run = read_json(run_dir / "run.json")
-    run_id = run["run_id"]
-    slug = run["experiment_slug"]
-
-    conn.execute(
-        "INSERT OR IGNORE INTO experiments(slug, name, description, first_seen) VALUES (?,?,?,?)",
-        (
-            slug,
-            run.get("experiment_name"),
-            run.get("experiment_description"),
-            run.get("started_at"),
-        ),
-    )
-    conn.execute("DELETE FROM runs WHERE run_id=?", (run_id,))
-    conn.execute("DELETE FROM variants WHERE run_id=?", (run_id,))
-    conn.execute("DELETE FROM tasks WHERE run_id=?", (run_id,))
-    conn.execute("DELETE FROM trials WHERE run_id=?", (run_id,))
-
-    variants = run.get("variants", [])
-    conn.execute(
-        "INSERT INTO runs(run_id, experiment_slug, started_at, finished_at, git_base, "
-        "n_variants, status) VALUES (?,?,?,?,?,?,?)",
-        (
-            run_id,
-            slug,
-            run.get("started_at"),
-            run.get("finished_at"),
-            run.get("git_base"),
-            len(variants),
-            run.get("status"),
-        ),
-    )
-
-    for vr in variants:
-        v = vr["variant"]
-        vslug = v.get("slug") or v.get("name")
-        conn.execute(
-            "INSERT INTO variants(run_id, variant_slug, model, reasoning_effort, agent, mode, "
-            "byok, params_json) VALUES (?,?,?,?,?,?,?,?)",
-            (
-                run_id,
-                vslug,
-                v.get("model"),
-                v.get("reasoning_effort"),
-                v.get("agent"),
-                v.get("mode"),
-                1 if v.get("provider") else 0,
-                json.dumps(v),
-            ),
-        )
-        for tr in vr.get("tasks", []):
-            task_slug = tr.get("task_slug")
-            trials = tr.get("trials", [])
-            graded = [t for t in trials if t.get("success") is not None]
-            n_solved = sum(1 for t in graded if t.get("success"))
-            conn.execute(
-                "INSERT INTO tasks(run_id, variant_slug, task_slug, task_name, n_trials, "
-                "success_rate, resolved) VALUES (?,?,?,?,?,?,?)",
-                (
-                    run_id,
-                    vslug,
-                    task_slug,
-                    tr.get("task_name"),
-                    len(trials),
-                    (n_solved / len(graded)) if graded else None,
-                    None if not graded else int(any(t.get("success") for t in graded)),
-                ),
-            )
-            for trial in trials:
-                m = trial.get("metrics", {})
-                models = m.get("models") or []
-                conn.execute(
-                    "INSERT INTO trials(run_id, variant_slug, task_slug, trial_no, session_id, "
-                    "exit_code, duration_s, success, n_turns, n_tool_calls, n_tool_failures, "
-                    "input_tokens, output_tokens, total_tokens, cache_read_tokens, "
-                    "cache_write_tokens, input_tokens_noncached, reasoning_tokens, aiu, "
-                    "api_duration_ms, n_requests, peak_context_tokens, n_compactions, "
-                    "n_truncations, files_modified, lines_added, lines_removed, model, "
-                    "status, error) "
-                    "VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
-                    (
-                        run_id,
-                        vslug,
-                        task_slug,
-                        trial.get("trial_no"),
-                        trial.get("session_id"),
-                        trial.get("exit_code"),
-                        trial.get("duration_s"),
-                        None if trial.get("success") is None else int(bool(trial.get("success"))),
-                        m.get("n_turns"),
-                        m.get("n_tool_calls"),
-                        m.get("n_tool_failures"),
-                        m.get("input_tokens"),
-                        m.get("output_tokens"),
-                        m.get("total_tokens"),
-                        m.get("cache_read_tokens"),
-                        m.get("cache_write_tokens"),
-                        m.get("input_tokens_noncached"),
-                        m.get("reasoning_tokens"),
-                        m.get("aiu"),
-                        m.get("api_duration_ms"),
-                        m.get("n_requests"),
-                        m.get("peak_context_tokens"),
-                        m.get("n_compactions"),
-                        m.get("n_truncations"),
-                        m.get("files_modified"),
-                        m.get("lines_added"),
-                        m.get("lines_removed"),
-                        models[-1] if models else v.get("model"),
-                        trial.get("status"),
-                        trial.get("error"),
-                    ),
-                )
-    conn.commit()
-
-
-def index_pier_job_dir(conn: sqlite3.Connection, job_dir: Path) -> None:
-    """Insert (or replace) one Pier job into the derived index."""
-
-    summary = build_pier_summary(job_dir)
-    identity = pier_job_identity(job_dir)
-    job_id = identity["id"]
-    job_name = identity["job_name"]
-    run_id = identity["run_id"]
-    conn.execute("DELETE FROM pier_jobs WHERE id=?", (job_id,))
-    conn.execute("DELETE FROM pier_trials WHERE job_id=?", (job_id,))
-
-    conn.execute(
-        "INSERT INTO pier_jobs(id, job_name, run_id, job_dir, started_at, finished_at, "
-        "n_trials, success_rate, status) VALUES (?,?,?,?,?,?,?,?,?)",
-        (
-            job_id,
-            job_name,
-            run_id,
-            str(job_dir),
-            summary.get("started_at"),
-            summary.get("finished_at"),
-            summary.get("n_trials"),
-            summary.get("overall_success_rate"),
-            summary.get("status"),
-        ),
-    )
-
-    for trial in iter_pier_trial_summaries(job_dir):
-        metrics = trial.get("metrics") or {}
-        conn.execute(
-            "INSERT INTO pier_trials(job_id, job_name, run_id, variant_slug, task_slug, "
-            "trial_name, success, status, n_turns, n_tool_calls, total_tokens, aiu, model, "
-            "error) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
-            (
-                job_id,
-                job_name,
-                run_id,
-                trial.get("variant"),
-                trial.get("task"),
-                trial.get("trial_name"),
-                None if trial.get("success") is None else int(bool(trial.get("success"))),
-                trial.get("status"),
-                metrics.get("n_turns"),
-                metrics.get("n_tool_calls"),
-                metrics.get("total_tokens"),
-                metrics.get("aiu"),
-                trial.get("model"),
-                trial.get("error"),
-            ),
-        )
-    conn.commit()
-
-
-def reindex(layout: Layout) -> int:
-    """Rebuild the index from scratch by scanning legacy runs and Pier jobs."""
-    if layout.index_db.exists():
-        layout.index_db.unlink()
-    conn = connect(layout.index_db)
-    count = 0
-    try:
-        for _slug, _run_id, run_dir in layout.iter_runs():
-            index_run_dir(conn, run_dir)
-            count += 1
-        for job_dir in layout.iter_pier_jobs():
-            index_pier_job_dir(conn, job_dir)
-            count += 1
-    finally:
-        conn.close()
-    return count
-
-
-def list_runs(layout: Layout) -> list[dict]:
-    if not layout.index_db.exists():
-        reindex(layout)
-    conn = connect(layout.index_db)
-    try:
-        rows = conn.execute(
-            "SELECT r.*, "
-            "(SELECT COUNT(*) FROM trials t WHERE t.run_id=r.run_id) AS n_trials, "
-            "(SELECT AVG(success) FROM trials t WHERE t.run_id=r.run_id AND t.success IS NOT NULL)"
-            " AS success_rate "
-            "FROM runs r ORDER BY r.started_at"
-        ).fetchall()
-        return [dict(row) for row in rows]
-    finally:
-        conn.close()
diff --git a/src/copilot_experiments/invoker.py b/src/copilot_experiments/invoker.py
deleted file mode 100644
index 8e6bcc0..0000000
--- a/src/copilot_experiments/invoker.py
+++ /dev/null
@@ -1,482 +0,0 @@
-"""Build and execute the ``copilot`` command for a single trial.
-
-Two implementations are provided:
-
-* :class:`CopilotInvoker` shells out to the real ``copilot`` CLI.
-* :class:`MockInvoker` simulates a run by writing synthetic ``events.jsonl`` and
-  stdout, so the library, the runner, and experiment repos can be exercised
-  end-to-end without consuming Copilot credits or network access.
-"""
-
-from __future__ import annotations
-
-import datetime as _dt
-import json
-import os
-import subprocess
-import time
-from collections.abc import Callable
-from dataclasses import dataclass, field
-from pathlib import Path
-from typing import Protocol
-
-from . import pricing
-from ._util import iso, utcnow
-from .models import Variant
-from .sessionlog import events_path
-
-
-@dataclass
-class Invocation:
-    prompt: str
-    workspace: Path
-    session_id: str
-    variant: Variant
-    log_dir: Path
-    stdout_path: Path
-    session_state_root: Path
-    env_overrides: dict[str, str] = field(default_factory=dict)
-    # Absolute path where Copilot should write its markdown session transcript
-    # (``--share``). Kept *outside* the workspace so it never pollutes the diff.
-    share_path: Path | None = None
-    # Environment variable names Copilot must redact from its output and strip from
-    # sub-shells (``--secret-env-vars``): the injected GitHub token and BYOK secrets.
-    secret_env_names: list[str] = field(default_factory=list)
-
-
-@dataclass
-class InvocationResult:
-    exit_code: int
-    duration_s: float
-
-
-class Invoker(Protocol):
-    def run(self, inv: Invocation) -> InvocationResult: ...
-
-
-def build_args(inv: Invocation) -> list[str]:
-    """Translate a variant + invocation into ``copilot`` CLI arguments."""
-    v = inv.variant
-    args: list[str] = [
-        "-p",
-        inv.prompt,
-        "--output-format",
-        "json",
-        "--session-id",
-        inv.session_id,
-        "--log-dir",
-        str(Path(inv.log_dir).resolve()),
-        "-C",
-        # Always an absolute path: Copilot chdirs into ``-C`` *after* the process
-        # cwd is already the workspace, so a relative value would be resolved
-        # against the workspace and doubled (ENAMETOOLONG on Windows).
-        str(Path(inv.workspace).resolve()),
-    ]
-    if v.allow_all_tools:
-        args.append("--allow-all-tools")
-    if v.model:
-        args += ["--model", v.model]
-    if v.reasoning_effort:
-        args += ["--effort", v.reasoning_effort]
-    if v.agent:
-        args += ["--agent", v.agent]
-    if v.mode:
-        args += ["--mode", v.mode]
-    for tool in v.allow_tools:
-        args += ["--allow-tool", tool]
-    for tool in v.deny_tools:
-        args += ["--deny-tool", tool]
-    # Redact injected token + BYOK secrets from Copilot's output (stdout, the shared
-    # markdown transcript) and strip them from any shell/MCP sub-environments. Passed
-    # as a single ``=``-joined token so the variadic option can't swallow later flags.
-    if inv.secret_env_names:
-        args.append(f"--secret-env-vars={','.join(inv.secret_env_names)}")
-    # Write a human-readable markdown transcript of the session after completion. An
-    # absolute path is required (and keeps it out of the workspace; ``--share`` would
-    # otherwise default to the cwd, which is the diffed workspace).
-    if inv.share_path is not None:
-        args.append(f"--share={Path(inv.share_path).resolve()}")
-    args += v.extra_args
-    return args
-
-
-def build_env(inv: Invocation) -> dict[str, str]:
-    env = dict(os.environ)
-    if inv.variant.provider is not None:
-        env.update(inv.variant.provider.to_env())
-    env.update(inv.variant.env)
-    env.update(inv.env_overrides)
-    return env
-
-
-class CopilotInvoker:
-    """Invoke the real Copilot CLI.
-
-    When ``stream`` is provided, Copilot's combined stdout/stderr is *teed*: every
-    line is both written to the capture file and forwarded to the callback, so the
-    CLI's ``--verbose`` mode can follow the run live. When it is ``None`` the output
-    is redirected straight to the file (the default, lowest-overhead path).
-    """
-
-    def __init__(
-        self, binary: str = "copilot", *, stream: Callable[[str], None] | None = None
-    ) -> None:
-        self.binary = binary
-        self.stream = stream
-
-    def run(self, inv: Invocation) -> InvocationResult:
-        inv.log_dir.mkdir(parents=True, exist_ok=True)
-        inv.stdout_path.parent.mkdir(parents=True, exist_ok=True)
-        args = [self.binary, *build_args(inv)]
-        env = build_env(inv)
-        # Always an absolute cwd for the same reason ``-C`` is absolute (see build_args).
-        cwd = str(Path(inv.workspace).resolve())
-        start = time.monotonic()
-        if self.stream is None:
-            exit_code = self._run_captured(args, cwd, env, inv.stdout_path)
-        else:
-            exit_code = self._run_streaming(args, cwd, env, inv.stdout_path)
-        duration = time.monotonic() - start
-        return InvocationResult(exit_code=exit_code, duration_s=duration)
-
-    def _run_captured(
-        self, args: list[str], cwd: str, env: dict[str, str], stdout_path: Path
-    ) -> int:
-        with stdout_path.open("w", encoding="utf-8") as out:
-            proc = subprocess.run(
-                args, cwd=cwd, env=env, stdout=out, stderr=subprocess.STDOUT, text=True
-            )
-        return proc.returncode
-
-    def _run_streaming(
-        self, args: list[str], cwd: str, env: dict[str, str], stdout_path: Path
-    ) -> int:
-        assert self.stream is not None
-        with stdout_path.open("w", encoding="utf-8") as out:
-            proc = subprocess.Popen(
-                args,
-                cwd=cwd,
-                env=env,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.STDOUT,
-                text=True,
-                encoding="utf-8",
-                errors="replace",
-                bufsize=1,
-            )
-            assert proc.stdout is not None
-            for line in proc.stdout:
-                out.write(line)
-                out.flush()
-                self.stream(line.rstrip("\n"))
-            return proc.wait()
-
-
-class MockInvoker:
-    """Simulate a Copilot run for testing and dry-runs.
-
-    Writes a small synthetic ``events.jsonl`` (so :mod:`sessionlog` parsing works)
-    and a matching stdout file. An optional ``solver`` callback may mutate the
-    workspace to emulate Copilot completing the task (useful in tests).
-    """
-
-    def __init__(
-        self,
-        *,
-        exit_code: int = 0,
-        solver: Callable[[Path], None] | None = None,
-        leave_note: bool = True,
-        turns: int = 4,
-    ) -> None:
-        self.exit_code = exit_code
-        self.solver = solver
-        self.leave_note = leave_note
-        self.turns = max(1, turns)
-
-    def run(self, inv: Invocation) -> InvocationResult:
-        model = inv.variant.model or "mock-model"
-        start = time.monotonic()
-
-        if self.solver is not None:
-            self.solver(inv.workspace)
-        elif self.leave_note:
-            (inv.workspace / "MOCK_RUN.md").write_text(
-                f"Mock Copilot run for variant '{inv.variant.name}'.\n", encoding="utf-8"
-            )
-
-        events = self._synthetic_events(inv, model)
-        dest = events_path(inv.session_id, inv.session_state_root)
-        dest.parent.mkdir(parents=True, exist_ok=True)
-        with dest.open("w", encoding="utf-8") as fh:
-            for ev in events:
-                fh.write(json.dumps(ev) + "\n")
-
-        inv.stdout_path.parent.mkdir(parents=True, exist_ok=True)
-        with inv.stdout_path.open("w", encoding="utf-8") as fh:
-            for ev in events:
-                fh.write(json.dumps(ev) + "\n")
-
-        otel_path = inv.env_overrides.get("COPILOT_OTEL_FILE_EXPORTER_PATH")
-        if otel_path:
-            self._write_synthetic_otel(Path(otel_path), events, model)
-
-        duration = time.monotonic() - start
-        return InvocationResult(exit_code=self.exit_code, duration_s=duration)
-
-    def _synthetic_events(self, inv: Invocation, model: str) -> list[dict]:
-        """Build a small but realistic, multi-turn ``events.jsonl`` (real schema).
-
-        Emits ``session.start`` / ``user.message`` and several assistant turns, each
-        invoking a tool, so that downstream metrics *and* the richer session analysis
-        have something meaningful to work with offline.
-        """
-        t0 = utcnow()
-        clock = {"n": 0}
-
-        def at() -> str:
-            clock["n"] += 1
-            return iso(t0 + _dt.timedelta(seconds=clock["n"] * 0.25))
-
-        session_id = inv.session_id
-        # A deterministic, varied tool script: one deliberate failure + recovery.
-        script = [
-            ("view", "Exploring the workspace to understand the task.", True),
-            ("edit", "Applying the change to fix the issue.", True),
-            ("powershell", "Running the verification command.", False),
-            ("powershell", "Re-running verification after the fix.", True),
-        ]
-        script = script[: self.turns]
-
-        events: list[dict] = [
-            {
-                "type": "session.start",
-                "timestamp": at(),
-                "data": {
-                    "sessionId": session_id,
-                    "producer": "mock",
-                    "copilotVersion": "mock-0",
-                    "selectedModel": model,
-                    "reasoningEffort": inv.variant.reasoning_effort,
-                    "context": {
-                        "cwd": str(inv.workspace),
-                        "branch": "mock",
-                        "repository": "mock/experiment",
-                    },
-                    "startTime": iso(t0),
-                },
-            },
-            {
-                "type": "user.message",
-                "timestamp": at(),
-                "data": {"content": inv.prompt},
-            },
-        ]
-
-        out_total = 0
-        lines_added = 0
-        lines_removed = 0
-        for i, (tool, text, ok) in enumerate(script):
-            call_id = f"mock-{i}"
-            out_tok = 40 + 10 * i
-            out_total += out_tok
-            tele_metrics: dict = {
-                "durationMs": 50 + 25 * i,
-                "resultForLlmLength": 200 + 50 * i,
-                "resultLength": 260 + 50 * i,
-            }
-            if tool == "edit":
-                tele_metrics["linesAdded"] = 5
-                tele_metrics["linesRemoved"] = 2
-                lines_added += 5
-                lines_removed += 2
-            if tool == "powershell":
-                tele_metrics["exit_code"] = 0 if ok else 1
-            events += [
-                {"type": "assistant.turn_start", "timestamp": at(), "data": {"turnId": str(i)}},
-                {
-                    "type": "assistant.message",
-                    "timestamp": at(),
-                    "data": {
-                        "model": model,
-                        "content": text,
-                        "turnId": str(i),
-                        "outputTokens": out_tok,
-                        "toolRequests": [{"toolCallId": call_id, "name": tool}],
-                    },
-                },
-                {
-                    "type": "tool.execution_start",
-                    "timestamp": at(),
-                    "data": {
-                        "toolCallId": call_id,
-                        "toolName": tool,
-                        "model": model,
-                        "turnId": str(i),
-                    },
-                },
-                {
-                    "type": "tool.execution_complete",
-                    "timestamp": at(),
-                    "data": {
-                        "toolCallId": call_id,
-                        "turnId": str(i),
-                        "success": ok,
-                        "toolTelemetry": {"metrics": tele_metrics},
-                    },
-                },
-                {"type": "assistant.turn_end", "timestamp": at(), "data": {"turnId": str(i)}},
-            ]
-
-        # A closing turn with a final message and no tool call.
-        final_turn = len(script)
-        out_total += 25
-        events += [
-            {
-                "type": "assistant.turn_start",
-                "timestamp": at(),
-                "data": {"turnId": str(final_turn)},
-            },
-            {
-                "type": "assistant.message",
-                "timestamp": at(),
-                "data": {
-                    "model": model,
-                    "turnId": str(final_turn),
-                    "outputTokens": 25,
-                    "content": f"(mock) Completed the task for variant '{inv.variant.name}'.",
-                },
-            },
-            {"type": "assistant.turn_end", "timestamp": at(), "data": {"turnId": str(final_turn)}},
-        ]
-
-        events += self._economics_events(model, at, out_total, lines_added, lines_removed)
-        return events
-
-    @staticmethod
-    def _write_synthetic_otel(path: Path, events: list[dict], model: str) -> None:
-        """Write minimal OTel file-exporter records for offline analysis tests."""
-
-        path.parent.mkdir(parents=True, exist_ok=True)
-        chat_events = [ev for ev in events if ev.get("type") == "assistant.message"]
-        with path.open("w", encoding="utf-8") as fh:
-            for i, ev in enumerate(chat_events):
-                data = ev.get("data") or {}
-                output_tokens = int(data.get("outputTokens") or 0)
-                input_tokens = 1000 + i * 100
-                cache_read_tokens = 900 + i * 90
-                cache_creation_tokens = 100 + i * 10
-                nano_aiu = 100_000_000 + i * 10_000_000
-                record = {
-                    "type": "span",
-                    "name": f"chat {model}",
-                    "attributes": {
-                        "gen_ai.operation.name": "chat",
-                        "gen_ai.request.model": model,
-                        "gen_ai.response.model": model,
-                        "gen_ai.usage.input_tokens": input_tokens,
-                        "gen_ai.usage.cache_read_input_tokens": cache_read_tokens,
-                        "gen_ai.usage.cache_creation_input_tokens": cache_creation_tokens,
-                        "gen_ai.usage.output_tokens": output_tokens,
-                        "github.copilot.nano_aiu": nano_aiu,
-                        "github.copilot.server_duration": 100 + i * 10,
-                        "github.copilot.turn_id": str(data.get("turnId") or i),
-                    },
-                    "events": [
-                        {
-                            "name": "github.copilot.session.usage_info",
-                            "attributes": {
-                                "github.copilot.current_tokens": 5000 + i * 100,
-                                "github.copilot.token_limit": 100000,
-                            },
-                        }
-                    ],
-                    "status": {"code": 0},
-                }
-                fh.write(json.dumps(record) + "\n")
-
-    @staticmethod
-    def _economics_events(
-        model: str,
-        at: Callable[[], str],
-        out_total: int,
-        lines_added: int,
-        lines_removed: int,
-    ) -> list[dict]:
-        """A self-consistent ``session.compaction_complete`` + ``session.shutdown`` pair.
-
-        Token counts are priced with :mod:`pricing`'s documented rates so the synthetic
-        ``totalNanoAiu`` reconciles exactly with the per-type decomposition -- exercising the full
-        economics path (including ``rates_from_compaction``) entirely offline.
-        """
-        rates = pricing.default_rates()
-        counts = {
-            "input": 1500,
-            "cache_read": 12_000,
-            "cache_write": 2_000,
-            "output": out_total,
-        }
-        reasoning_tokens = 120
-        total_nano = int(sum(counts[t] * rates[t] for t in pricing.TOKEN_TYPES))
-        input_billed = counts["input"] + counts["cache_read"] + counts["cache_write"]
-        n_requests = 4
-        return [
-            {
-                "type": "session.compaction_complete",
-                "timestamp": at(),
-                "data": {
-                    "compactionTokensUsed": {
-                        "copilotUsage": {
-                            "totalNanoAiu": 5_000_000,
-                            "tokenDetails": [
-                                {
-                                    "tokenType": t,
-                                    "tokenCount": counts.get(t, 0),
-                                    "batchSize": 1_000_000,
-                                    "costPerBatch": pricing.DEFAULT_COST_PER_BATCH[t],
-                                }
-                                for t in pricing.TOKEN_TYPES
-                            ],
-                        }
-                    },
-                    "systemTokens": 9000,
-                    "conversationTokens": 4000,
-                    "toolDefinitionsTokens": 3000,
-                },
-            },
-            {
-                "type": "session.shutdown",
-                "timestamp": at(),
-                "data": {
-                    "tokenDetails": {
-                        "input": {"tokenCount": counts["input"]},
-                        "cache_read": {"tokenCount": counts["cache_read"]},
-                        "cache_write": {"tokenCount": counts["cache_write"]},
-                        "output": {"tokenCount": counts["output"]},
-                    },
-                    "totalNanoAiu": total_nano,
-                    "totalApiDurationMs": 1234 * n_requests,
-                    "modelMetrics": {
-                        model: {
-                            "requests": {"count": n_requests},
-                            "usage": {
-                                "inputTokens": input_billed,
-                                "outputTokens": counts["output"],
-                                "cacheReadTokens": counts["cache_read"],
-                                "cacheWriteTokens": counts["cache_write"],
-                                "reasoningTokens": reasoning_tokens,
-                            },
-                            "totalNanoAiu": total_nano,
-                        }
-                    },
-                    "systemTokens": 9000,
-                    "conversationTokens": 4000,
-                    "toolDefinitionsTokens": 3000,
-                    "currentTokens": 16000,
-                    "codeChanges": {
-                        "filesModified": ["mock_file.py"],
-                        "linesAdded": lines_added,
-                        "linesRemoved": lines_removed,
-                    },
-                },
-            },
-        ]
diff --git a/src/copilot_experiments/models.py b/src/copilot_experiments/models.py
index 5a70a52..8b58b9e 100644
--- a/src/copilot_experiments/models.py
+++ b/src/copilot_experiments/models.py
@@ -1,216 +1,10 @@
-"""Pydantic models: experiment definitions and result objects."""
+"""Pydantic models for Copilot session metrics and analysis."""
 
 from __future__ import annotations
 
-import re
-from typing import Literal
+from pydantic import BaseModel, Field
 
-from pydantic import BaseModel, ConfigDict, Field, model_validator
 
-from ._util import slugify
-
-ReasoningEffort = Literal["none", "low", "medium", "high", "xhigh", "max"]
-Mode = Literal["interactive", "plan", "autopilot"]
-ProviderType = Literal["openai", "azure", "anthropic"]
-WireApi = Literal["completions", "responses"]
-
-# Outcome of a single trial, distinguishing *harness/infra* failures from the
-# experiment's own (verify) result:
-#   * ``ok``             -- Copilot ran to completion (verify pass/fail is separate).
-#   * ``copilot_failed`` -- Copilot was invoked but errored out / produced no session
-#                           log (e.g. authentication failure, bad working dir).
-#   * ``harness_error``  -- the harness pipeline itself raised (provisioning, diffing).
-TrialStatus = Literal["ok", "copilot_failed", "harness_error"]
-
-# Roll-up of a run: every trial ``ok`` -> ``completed``; some but not all failed ->
-# ``partial``; nothing ran successfully -> ``failed``.
-RunStatus = Literal["completed", "partial", "failed"]
-
-# Environment variable names whose *value* should be masked in stored artifacts.
-# A safety net: BYOK secrets belong in ``ProviderConfig`` (already redacted), but a
-# token set via the free-form ``Variant.env`` escape hatch must never be persisted.
-_SECRET_ENV_HINT = re.compile(
-    r"key|token|secret|password|passwd|bearer|credential|authorization", re.IGNORECASE
-)
-
-
-def _redact_env(env: dict[str, str]) -> dict[str, str]:
-    """Mask values of environment variables whose name hints at a secret."""
-    return {k: ("***redacted***" if _SECRET_ENV_HINT.search(k) else v) for k, v in env.items()}
-
-
-# --------------------------------------------------------------------------- #
-# Experiment definition
-# --------------------------------------------------------------------------- #
-class ProviderConfig(BaseModel):
-    """Bring-Your-Own-Key custom model provider.
-
-    Translated to ``COPILOT_PROVIDER_*`` environment variables when a variant
-    using this provider is executed. Works with any OpenAI-compatible endpoint
-    (Ollama, vLLM, Foundry Local), Azure OpenAI, or Anthropic.
-    """
-
-    model_config = ConfigDict(extra="forbid")
-
-    base_url: str
-    type: ProviderType = "openai"
-    api_key: str | None = None
-    bearer_token: str | None = None
-    wire_api: WireApi | None = None
-    model_id: str | None = None
-    wire_model: str | None = None
-    azure_api_version: str | None = None
-    max_prompt_tokens: int | None = None
-    max_output_tokens: int | None = None
-
-    def to_env(self) -> dict[str, str]:
-        """Render the provider config as Copilot CLI environment variables."""
-        env: dict[str, str] = {
-            "COPILOT_PROVIDER_BASE_URL": self.base_url,
-            "COPILOT_PROVIDER_TYPE": self.type,
-        }
-        if self.api_key:
-            env["COPILOT_PROVIDER_API_KEY"] = self.api_key
-        if self.bearer_token:
-            env["COPILOT_PROVIDER_BEARER_TOKEN"] = self.bearer_token
-        if self.wire_api:
-            env["COPILOT_PROVIDER_WIRE_API"] = self.wire_api
-        if self.model_id:
-            env["COPILOT_PROVIDER_MODEL_ID"] = self.model_id
-        if self.wire_model:
-            env["COPILOT_PROVIDER_WIRE_MODEL"] = self.wire_model
-        if self.azure_api_version:
-            env["COPILOT_PROVIDER_AZURE_API_VERSION"] = self.azure_api_version
-        if self.max_prompt_tokens is not None:
-            env["COPILOT_PROVIDER_MAX_PROMPT_TOKENS"] = str(self.max_prompt_tokens)
-        if self.max_output_tokens is not None:
-            env["COPILOT_PROVIDER_MAX_OUTPUT_TOKENS"] = str(self.max_output_tokens)
-        return env
-
-    def redacted(self) -> dict:
-        """Serializable representation with secrets masked, for stored artifacts."""
-        data = self.model_dump(exclude_none=True)
-        for secret in ("api_key", "bearer_token"):
-            if data.get(secret):
-                data[secret] = "***redacted***"
-        return data
-
-
-class Task(BaseModel):
-    """What Copilot is asked to do, and how to provision/verify the workspace."""
-
-    model_config = ConfigDict(extra="forbid")
-
-    name: str | None = None
-    """Human-readable task name. When set, it seeds the task's directory slug;
-    otherwise a positional ``task-NNN`` slug is assigned by the experiment."""
-
-    prompt: str
-    """The prompt handed to ``copilot -p``."""
-
-    fixture: str | None = None
-    """Path (relative to the experiment repo) to a directory copied as the
-    starting workspace for every trial."""
-
-    repo: str | None = None
-    """Git URL to clone as the starting workspace (alternative to ``fixture``)."""
-
-    ref: str | None = None
-    """Branch, tag, or commit to check out when ``repo`` is used."""
-
-    setup: list[str] = Field(default_factory=list)
-    """Shell commands run in the workspace after provisioning, before Copilot."""
-
-    verify: str | None = None
-    """Shell command run in the workspace after Copilot finishes. Exit code 0
-    means the trial succeeded. ``None`` means effectiveness is not measured."""
-
-
-class Variant(BaseModel):
-    """A single parameterization of an experiment (one cell of the matrix)."""
-
-    model_config = ConfigDict(extra="forbid")
-
-    name: str
-    model: str | None = None
-    reasoning_effort: ReasoningEffort | None = None
-    agent: str | None = None
-    mode: Mode | None = None
-    allow_tools: list[str] = Field(default_factory=list)
-    deny_tools: list[str] = Field(default_factory=list)
-    allow_all_tools: bool = True
-    provider: ProviderConfig | None = None
-    env: dict[str, str] = Field(default_factory=dict)
-    extra_args: list[str] = Field(default_factory=list)
-    trials: int = 1
-
-    @property
-    def slug(self) -> str:
-        return slugify(self.name)
-
-    def stored(self) -> dict:
-        """Serializable representation with provider and env secrets redacted."""
-        data = self.model_dump(exclude_none=True)
-        if self.provider is not None:
-            data["provider"] = self.provider.redacted()
-        if self.env:
-            data["env"] = _redact_env(self.env)
-        return data
-
-
-class Experiment(BaseModel):
-    """A named task suite plus the matrix of variants to run it under.
-
-    The comparison matrix is ``Tasks × Variants × Trials``. Provide either a
-    single ``task`` (sugar for a one-task suite) or an explicit list of
-    ``tasks`` -- exactly one of the two. See ADR-0012.
-    """
-
-    model_config = ConfigDict(extra="forbid")
-
-    name: str
-    description: str = ""
-    task: Task | None = None
-    tasks: list[Task] = Field(default_factory=list)
-    variants: list[Variant]
-
-    @model_validator(mode="after")
-    def _check_task_suite(self) -> Experiment:
-        if self.task is not None and self.tasks:
-            raise ValueError("Provide either 'task' or 'tasks', not both.")
-        if self.task is None and not self.tasks:
-            raise ValueError("An experiment must define a 'task' or a non-empty 'tasks' list.")
-        return self
-
-    @property
-    def slug(self) -> str:
-        return slugify(self.name)
-
-    def iter_tasks(self) -> list[tuple[str, Task]]:
-        """Return the task suite as an ordered list of ``(task_slug, Task)``.
-
-        Slugs come from ``Task.name`` when set, else a positional ``task-NNN``.
-        Collisions are disambiguated with a numeric suffix so slugs are unique
-        and stable for directory names and the index.
-        """
-        tasks = self.tasks if self.tasks else ([self.task] if self.task else [])
-        result: list[tuple[str, Task]] = []
-        seen: dict[str, int] = {}
-        for idx, task in enumerate(tasks, start=1):
-            base = slugify(task.name) if task.name else f"task-{idx:03d}"
-            if base in seen:
-                seen[base] += 1
-                slug = f"{base}-{seen[base]}"
-            else:
-                seen[base] = 1
-                slug = base
-            result.append((slug, task))
-        return result
-
-
-# --------------------------------------------------------------------------- #
-# Result objects
-# --------------------------------------------------------------------------- #
 class ModelMetric(BaseModel):
     """Per-model usage from ``session.shutdown.modelMetrics`` (multi-model sessions)."""
 
@@ -276,8 +70,8 @@ class TokenEconomics(BaseModel):
 class Metrics(BaseModel):
     """Metrics parsed from a single trial's session ``events.jsonl``.
 
-    Flat scalars for aggregation and the SQLite index. The richer, nested view lives in
-    :class:`TokenEconomics` on :class:`SessionAnalysis`; both are derived from the same events.
+    Flat scalars for aggregation. The richer, nested view lives in :class:`TokenEconomics`
+    on :class:`SessionAnalysis`; both are derived from the same events.
     """
 
     n_turns: int = 0
@@ -441,147 +235,3 @@ class SessionAnalysis(BaseModel):
     phases: list[PhaseStat] = Field(default_factory=list)
     warnings: list[str] = Field(default_factory=list)
     event_type_counts: dict[str, int] = Field(default_factory=dict)
-
-
-class TrialResult(BaseModel):
-    trial_no: int
-    session_id: str
-    exit_code: int
-    duration_s: float
-    success: bool | None = None
-    metrics: Metrics = Field(default_factory=Metrics)
-
-    # Harness/infra outcome (orthogonal to ``success``, which is the experiment's
-    # verify result). ``error`` is a short human-readable message; ``error_artifact``
-    # names the file inside the trial directory to inspect for the full story.
-    status: TrialStatus = "ok"
-    error: str | None = None
-    error_artifact: str | None = None
-
-    @property
-    def failed(self) -> bool:
-        """True when the trial did not run cleanly (harness or copilot failure)."""
-        return self.status != "ok"
-
-
-class TaskResult(BaseModel):
-    """All trials of one task within a variant (one cell of the suite × matrix)."""
-
-    task_slug: str
-    task_name: str | None = None
-    prompt: str | None = None
-    trials: list[TrialResult] = Field(default_factory=list)
-
-    @property
-    def success_rate(self) -> float | None:
-        """Mean trial success for this task (the variability-aware measure)."""
-        graded = [t.success for t in self.trials if t.success is not None]
-        if not graded:
-            return None
-        return sum(1 for s in graded if s) / len(graded)
-
-    @property
-    def n_failed(self) -> int:
-        """Number of trials that did not run cleanly (harness/copilot failures)."""
-        return sum(1 for t in self.trials if t.failed)
-
-    @property
-    def resolved(self) -> bool | None:
-        """Resolved@k: did *any* trial of this task pass (best-of-k)?"""
-        graded = [t.success for t in self.trials if t.success is not None]
-        if not graded:
-            return None
-        return any(graded)
-
-
-class VariantResult(BaseModel):
-    variant: Variant
-    tasks: list[TaskResult] = Field(default_factory=list)
-
-    @property
-    def all_trials(self) -> list[TrialResult]:
-        """Every trial across every task, flattened (for cost/token aggregates)."""
-        return [t for tr in self.tasks for t in tr.trials]
-
-    @property
-    def success_rate(self) -> float | None:
-        """Mean trial success across all tasks and trials of this variant."""
-        graded = [t.success for t in self.all_trials if t.success is not None]
-        if not graded:
-            return None
-        return sum(1 for s in graded if s) / len(graded)
-
-    @property
-    def mean_resolved_rate(self) -> float | None:
-        """Mean over tasks of each task's mean trial success."""
-        rates = [tr.success_rate for tr in self.tasks if tr.success_rate is not None]
-        if not rates:
-            return None
-        return sum(rates) / len(rates)
-
-    @property
-    def resolved_at_k_rate(self) -> float | None:
-        """Fraction of tasks resolved on at least one trial (best-of-k)."""
-        graded = [tr.resolved for tr in self.tasks if tr.resolved is not None]
-        if not graded:
-            return None
-        return sum(1 for r in graded if r) / len(graded)
-
-
-class ExperimentRun(BaseModel):
-    run_id: str
-    experiment_slug: str
-    experiment_name: str
-    experiment_description: str = ""
-    started_at: str
-    finished_at: str | None = None
-    git_base: str | None = None
-    status: str = "running"
-    variants: list[VariantResult] = Field(default_factory=list)
-
-    @property
-    def all_trials(self) -> list[TrialResult]:
-        return [t for vr in self.variants for t in vr.all_trials]
-
-    @property
-    def n_failed_trials(self) -> int:
-        return sum(1 for t in self.all_trials if t.failed)
-
-    def rollup_status(self) -> RunStatus:
-        """Derive the run status from its trials' harness/copilot outcomes."""
-        trials = self.all_trials
-        if not trials:
-            return "failed"
-        failed = self.n_failed_trials
-        if failed == 0:
-            return "completed"
-        if failed == len(trials):
-            return "failed"
-        return "partial"
-
-
-# --------------------------------------------------------------------------- #
-# Dry-run (ephemeral plumbing check)
-# --------------------------------------------------------------------------- #
-class DryRunCheck(BaseModel):
-    """One validated stage of the run pipeline during a ``--dry-run``."""
-
-    name: str
-    ok: bool
-    detail: str = ""
-
-
-class DryRunReport(BaseModel):
-    """Result of an ephemeral dry-run: did each pipeline stage do its job?
-
-    A dry-run runs the whole pipeline (with the mock invoker) inside a throwaway
-    directory, records these checks, then deletes everything. Nothing is
-    persisted; only this report survives.
-    """
-
-    experiment: str
-    checks: list[DryRunCheck] = Field(default_factory=list)
-
-    @property
-    def ok(self) -> bool:
-        return all(c.ok for c in self.checks)
diff --git a/src/copilot_experiments/pier_backend.py b/src/copilot_experiments/pier_backend.py
index 672238e..0081974 100644
--- a/src/copilot_experiments/pier_backend.py
+++ b/src/copilot_experiments/pier_backend.py
@@ -215,28 +215,19 @@ def _job_dir(config: Any) -> Path:
 
 
 def _latest_existing_run_dir(config: Any) -> Path | None:
-    """Return the latest resumable run directory for a stable job config.
+    """Return the latest resumable run directory for a stable job config."""
 
-    New runs live at ``jobs/<job_name>/<run_id>``. A pre-migration flat
-    ``jobs/<job_name>`` directory may also exist, so keep it resumable when no
-    nested run has been created yet.
-    """
-
-    flat_dir = _job_dir(config)
-    nested_root = flat_dir
-    nested = []
-    if nested_root.is_dir():
-        nested = sorted(
+    job_group = _job_dir(config)
+    if job_group.is_dir():
+        runs = sorted(
             path
-            for path in nested_root.iterdir()
+            for path in job_group.iterdir()
             if path.is_dir()
             and (path / "config.json").exists()
             and (path / PIER_RUN_MANIFEST).exists()
         )
-    if nested:
-        return nested[-1]
-    if flat_dir.is_dir() and (flat_dir / "config.json").exists():
-        return flat_dir
+        if runs:
+            return runs[-1]
     return None
 
 
diff --git a/src/copilot_experiments/pier_results.py b/src/copilot_experiments/pier_results.py
index 2dd6be3..3bbd5da 100644
--- a/src/copilot_experiments/pier_results.py
+++ b/src/copilot_experiments/pier_results.py
@@ -35,24 +35,24 @@ def iter_pier_trial_summaries(job_dir: Path) -> list[dict[str, Any]]:
 
 
 def build_pier_summary(job_dir: Path) -> dict[str, Any]:
-    """Build the familiar summary shape from a Pier job directory."""
+    """Build an agent-oriented summary from a Pier job directory."""
 
     job_dir = Path(job_dir)
     job_result = read_json(job_dir / "result.json")
     job_config = read_json(job_dir / "config.json") if (job_dir / "config.json").exists() else {}
     identity = pier_job_identity(job_dir, job_config)
 
-    variant_cells: dict[str, dict[str, Any]] = {}
+    agent_cells: dict[str, dict[str, Any]] = {}
     for row in iter_pier_trial_summaries(job_dir):
-        variant_key = row["variant"]
-        cell = variant_cells.setdefault(
-            variant_key,
+        agent_key = row["agent"]
+        cell = agent_cells.setdefault(
+            agent_key,
             {
-                "variant": variant_key,
-                "name": variant_key,
+                "agent": agent_key,
+                "name": agent_key,
+                "agent_name": row.get("agent_name"),
                 "model": row.get("model"),
                 "reasoning_effort": row.get("reasoning_effort"),
-                "byok": False,
                 "n_tasks": 0,
                 "n_trials": 0,
                 "tasks": defaultdict(list),
@@ -61,8 +61,8 @@ def build_pier_summary(job_dir: Path) -> dict[str, Any]:
         cell["n_trials"] += 1
         cell["tasks"][row["task"]].append(row)
 
-    variants = []
-    for cell in variant_cells.values():
+    agents = []
+    for cell in agent_cells.values():
         task_summaries = []
         all_trials = []
         for task_slug, trials in sorted(cell["tasks"].items()):
@@ -70,25 +70,23 @@ def build_pier_summary(job_dir: Path) -> dict[str, Any]:
             task_summaries.append(_aggregate_task(task_slug, trials))
         cell["tasks"] = task_summaries
         cell["n_tasks"] = len(task_summaries)
-        cell.update(_aggregate_variant(all_trials))
-        variants.append(cell)
+        cell.update(_aggregate_agent(all_trials))
+        agents.append(cell)
 
-    all_trials = [
-        trial for variant in variants for task in variant["tasks"] for trial in task["_trials"]
-    ]
+    all_trials = [trial for agent in agents for task in agent["tasks"] for trial in task["_trials"]]
     graded = [trial["success"] for trial in all_trials if trial.get("success") is not None]
     total_aiu = sum((trial.get("metrics") or {}).get("aiu") or 0 for trial in all_trials)
 
     summary = {
         "run_id": identity["run_id"],
-        "experiment": identity["job_name"],
-        "experiment_slug": identity["job_name"],
+        "job": identity["job_name"],
+        "job_name": identity["job_name"],
         "pier_job_id": identity["id"],
         "started_at": job_result.get("started_at"),
         "finished_at": job_result.get("finished_at"),
         "status": _job_status(job_result),
-        "n_variants": len(variants),
-        "n_tasks": max((variant.get("n_tasks", 0) for variant in variants), default=0),
+        "n_agents": len(agents),
+        "n_tasks": max((agent.get("n_tasks", 0) for agent in agents), default=0),
         "n_trials": len(all_trials),
         "n_failed_trials": sum(1 for trial in all_trials if trial.get("status") != "ok"),
         "n_harness_errors": sum(
@@ -101,7 +99,7 @@ def build_pier_summary(job_dir: Path) -> dict[str, Any]:
             (sum(1 for value in graded if value) / len(graded)) if graded else None
         ),
         "total_aiu": round(total_aiu, 3) if total_aiu else None,
-        "variants": [_strip_internal_trials(variant) for variant in variants],
+        "agents": [_strip_internal_trials(agent) for agent in agents],
     }
     return summary
 
@@ -146,8 +144,8 @@ def pier_job_identity(job_dir: Path, job_config: dict[str, Any] | None = None) -
     config = job_config or (
         read_json(job_dir / "config.json") if (job_dir / "config.json").exists() else {}
     )
-    job_name = str(config.get("job_name") or job_dir.name)
-    return {"job_name": job_name, "run_id": job_dir.name, "id": job_dir.name}
+    job_name = str(config.get("job_name") or job_dir.parent.name)
+    return {"job_name": job_name, "run_id": job_dir.name, "id": f"{job_name}/{job_dir.name}"}
 
 
 def pier_job_label(job_dir: Path) -> str:
@@ -238,10 +236,12 @@ def _trial_summary(trial_dir: Path, trial: dict[str, Any]) -> dict[str, Any]:
 
     return {
         "trial_no": _trial_number(trial_dir),
+        "trial_dir": trial_dir.name,
         "trial_name": trial.get("trial_name") or trial_dir.name,
         "task": task_name,
         "task_name": task_name,
-        "variant": _variant_name(agent, model_info),
+        "agent": _agent_label(agent, model_info),
+        "agent_name": agent.get("name") or "agent",
         "model": model_info.get("name"),
         "reasoning_effort": (
             ((trial.get("config") or {}).get("agent") or {})
@@ -333,7 +333,7 @@ def _aggregate_task(task_slug: str, trials: list[dict[str, Any]]) -> dict[str, A
     }
 
 
-def _aggregate_variant(trials: list[dict[str, Any]]) -> dict[str, Any]:
+def _aggregate_agent(trials: list[dict[str, Any]]) -> dict[str, Any]:
     graded = [trial["success"] for trial in trials if trial.get("success") is not None]
     solved = sum(1 for value in graded if value)
     aiu_values = [(trial.get("metrics") or {}).get("aiu") for trial in trials]
@@ -370,13 +370,13 @@ def _aggregate_variant(trials: list[dict[str, Any]]) -> dict[str, Any]:
     }
 
 
-def _strip_internal_trials(variant: dict[str, Any]) -> dict[str, Any]:
-    variant = dict(variant)
-    variant["tasks"] = [
+def _strip_internal_trials(agent: dict[str, Any]) -> dict[str, Any]:
+    agent = dict(agent)
+    agent["tasks"] = [
         {key: value for key, value in task.items() if key != "_trials"}
-        for task in variant.get("tasks", [])
+        for task in agent.get("tasks", [])
     ]
-    return variant
+    return agent
 
 
 def _avg(values: list[Any]) -> float | None:
@@ -399,7 +399,7 @@ def _duration_seconds(started_at: str | None, finished_at: str | None) -> float
     return round((finish - start).total_seconds(), 3)
 
 
-def _variant_name(agent: dict[str, Any], model: dict[str, Any]) -> str:
+def _agent_label(agent: dict[str, Any], model: dict[str, Any]) -> str:
     agent_name = agent.get("name") or "agent"
     model_name = model.get("name")
     return f"{agent_name}-{model_name}" if model_name else agent_name
diff --git a/src/copilot_experiments/report.py b/src/copilot_experiments/report.py
index 37123ef..445fa6c 100644
--- a/src/copilot_experiments/report.py
+++ b/src/copilot_experiments/report.py
@@ -1,134 +1,7 @@
-"""Aggregate trial metrics into run summaries and human-readable reports."""
+"""Markdown reports for Pier job runs."""
 
 from __future__ import annotations
 
-from statistics import mean, stdev
-
-from .models import ExperimentRun, TaskResult, VariantResult
-
-
-def _avg(values: list[float]) -> float | None:
-    nums = [v for v in values if v is not None]
-    return round(mean(nums), 3) if nums else None
-
-
-def _std(values: list[float]) -> float | None:
-    nums = [v for v in values if v is not None]
-    return round(stdev(nums), 3) if len(nums) >= 2 else (0.0 if nums else None)
-
-
-def _cv(values: list[float]) -> float | None:
-    """Coefficient of variation (std / mean) -- the paper's headline variability measure."""
-    nums = [v for v in values if v is not None]
-    if len(nums) < 2:
-        return None
-    m = mean(nums)
-    return round(stdev(nums) / m, 3) if m else None
-
-
-def _vals(trials: list, attr: str) -> list[float]:
-    out = []
-    for t in trials:
-        v = getattr(t.metrics, attr, None)
-        if v is not None:
-            out.append(float(v))
-    return out
-
-
-def aggregate_task(tr: TaskResult) -> dict:
-    """Per-(variant, task) cell: success, cost, and cross-trial variability."""
-    trials = tr.trials
-    graded = [t.success for t in trials if t.success is not None]
-    n_solved = sum(1 for s in graded if s)
-    aiu = _vals(trials, "aiu")
-    tokens = _vals(trials, "total_tokens")
-    total_aiu = sum(aiu) if aiu else None
-    return {
-        "task": tr.task_slug,
-        "name": tr.task_name,
-        "n_trials": len(trials),
-        "success_rate": tr.success_rate,
-        "resolved": tr.resolved,
-        "avg_duration_s": _avg([t.duration_s for t in trials]),
-        "avg_turns": _avg([float(t.metrics.n_turns) for t in trials]),
-        "avg_total_tokens": _avg(tokens),
-        "cv_total_tokens": _cv(tokens),
-        "avg_aiu": _avg(aiu),
-        "cv_aiu": _cv(aiu),
-        "total_aiu": round(total_aiu, 3) if total_aiu is not None else None,
-        "aiu_per_solve": (round(total_aiu / n_solved, 3) if total_aiu and n_solved else None),
-    }
-
-
-def aggregate_variant(vr: VariantResult) -> dict:
-    trials = vr.all_trials
-    graded = [t.success for t in trials if t.success is not None]
-    n_solved = sum(1 for s in graded if s)
-    aiu = _vals(trials, "aiu")
-    tokens = _vals(trials, "total_tokens")
-    total_aiu = sum(aiu) if aiu else None
-    return {
-        "variant": vr.variant.slug,
-        "name": vr.variant.name,
-        "model": vr.variant.model,
-        "reasoning_effort": vr.variant.reasoning_effort,
-        "byok": vr.variant.provider is not None,
-        "n_tasks": len(vr.tasks),
-        "n_trials": len(trials),
-        # Trial-level mean success (unchanged meaning) plus the two suite measures.
-        "success_rate": (n_solved / len(graded)) if graded else None,
-        "mean_resolved_rate": vr.mean_resolved_rate,
-        "resolved_at_k_rate": vr.resolved_at_k_rate,
-        "avg_duration_s": _avg([t.duration_s for t in trials]),
-        "avg_turns": _avg([float(t.metrics.n_turns) for t in trials]),
-        "avg_tool_calls": _avg([float(t.metrics.n_tool_calls) for t in trials]),
-        "avg_tool_failures": _avg([float(t.metrics.n_tool_failures) for t in trials]),
-        "avg_total_tokens": _avg(tokens),
-        "std_total_tokens": _std(tokens),
-        "cv_total_tokens": _cv(tokens),
-        "avg_input_tokens": _avg(_vals(trials, "input_tokens")),
-        "avg_output_tokens": _avg(_vals(trials, "output_tokens")),
-        "avg_cache_read_tokens": _avg(_vals(trials, "cache_read_tokens")),
-        "avg_reasoning_tokens": _avg(_vals(trials, "reasoning_tokens")),
-        "avg_aiu": _avg(aiu),
-        "std_aiu": _std(aiu),
-        "cv_aiu": _cv(aiu),
-        "total_aiu": round(total_aiu, 3) if total_aiu is not None else None,
-        # Cost-vs-accuracy: AIU spent per successfully solved task (lower is better).
-        "aiu_per_solve": (round(total_aiu / n_solved, 3) if total_aiu and n_solved else None),
-        "avg_lines_added": _avg(_vals(trials, "lines_added")),
-        "avg_files_modified": _avg(_vals(trials, "files_modified")),
-        "avg_api_duration_s": _avg([v / 1000 for v in _vals(trials, "api_duration_ms")]),
-        "tasks": [aggregate_task(tr) for tr in vr.tasks],
-    }
-
-
-def build_summary(run: ExperimentRun) -> dict:
-    variant_summaries = [aggregate_variant(vr) for vr in run.variants]
-    all_trials = [t for vr in run.variants for t in vr.all_trials]
-    graded = [t.success for t in all_trials if t.success is not None]
-    total_aiu = sum(_vals(all_trials, "aiu")) if all_trials else 0.0
-    n_harness_errors = sum(1 for t in all_trials if t.status == "harness_error")
-    n_copilot_failures = sum(1 for t in all_trials if t.status == "copilot_failed")
-    n_tasks = max((len(vr.tasks) for vr in run.variants), default=0)
-    return {
-        "run_id": run.run_id,
-        "experiment": run.experiment_name,
-        "experiment_slug": run.experiment_slug,
-        "started_at": run.started_at,
-        "finished_at": run.finished_at,
-        "status": run.status,
-        "n_variants": len(run.variants),
-        "n_tasks": n_tasks,
-        "n_trials": len(all_trials),
-        "n_failed_trials": n_harness_errors + n_copilot_failures,
-        "n_harness_errors": n_harness_errors,
-        "n_copilot_failures": n_copilot_failures,
-        "overall_success_rate": (sum(1 for s in graded if s) / len(graded)) if graded else None,
-        "total_aiu": round(total_aiu, 3) if total_aiu else None,
-        "variants": variant_summaries,
-    }
-
 
 def _fmt(value: object) -> str:
     if value is None:
@@ -149,14 +22,17 @@ def _pct(value: float | None) -> str:
 
 
 def summary_markdown(summary: dict, description: str = "") -> str:
+    """Render a Pier run summary as Markdown."""
+
     lines = [
-        f"# {summary['experiment']}",
+        f"# {summary['job']}",
         "",
         f"- **Run:** `{summary['run_id']}`",
+        f"- **Selector:** `{summary['pier_job_id']}`",
         f"- **Status:** {summary.get('status', '-')}",
         f"- **Started:** {summary['started_at']}",
         f"- **Finished:** {summary.get('finished_at') or '-'}",
-        f"- **Variants:** {summary['n_variants']} · **Tasks:** {summary.get('n_tasks', 1)} "
+        f"- **Agents:** {summary['n_agents']} · **Tasks:** {summary.get('n_tasks', 1)} "
         f"· **Trials:** {summary['n_trials']}",
         f"- **Overall success rate:** {_pct(summary['overall_success_rate'])}",
         f"- **Total cost:** {_aiu(summary.get('total_aiu'))} AIU",
@@ -166,100 +42,82 @@ def summary_markdown(summary: dict, description: str = "") -> str:
         lines.append(
             f"- **⚠ Harness failures:** {n_failed} trial(s) did not run cleanly "
             f"({summary.get('n_harness_errors', 0)} harness, "
-            f"{summary.get('n_copilot_failures', 0)} copilot) — see each trial's "
-            "`stdout.txt`."
+            f"{summary.get('n_copilot_failures', 0)} copilot)."
         )
     if description:
         lines += ["", description]
+
     lines += [
         "",
-        "| Variant | Model | Effort | BYOK | Trials | Success | Avg dur (s) | Avg turns "
+        "| Agent | Model | Effort | Tasks | Trials | Success | Avg dur (s) | Avg turns "
         "| Tool calls | Tool fails | Avg tokens |",
-        "| --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: |",
+        "| --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |",
     ]
-    for v in summary["variants"]:
+    for agent in summary["agents"]:
         lines.append(
-            "| {name} | {model} | {effort} | {byok} | {n} | {sr} | {dur} | {turns} | "
-            "{calls} | {fails} | {tokens} |".format(
-                name=v["name"],
-                model=_fmt(v["model"]),
-                effort=_fmt(v["reasoning_effort"]),
-                byok="yes" if v["byok"] else "no",
-                n=v["n_trials"],
-                sr=_pct(v["success_rate"]),
-                dur=_fmt(v["avg_duration_s"]),
-                turns=_fmt(v["avg_turns"]),
-                calls=_fmt(v["avg_tool_calls"]),
-                fails=_fmt(v["avg_tool_failures"]),
-                tokens=_fmt(v["avg_total_tokens"]),
+            "| {name} | {model} | {effort} | {tasks} | {trials} | {success} | {dur} | "
+            "{turns} | {calls} | {fails} | {tokens} |".format(
+                name=agent["name"],
+                model=_fmt(agent["model"]),
+                effort=_fmt(agent["reasoning_effort"]),
+                tasks=agent["n_tasks"],
+                trials=agent["n_trials"],
+                success=_pct(agent["success_rate"]),
+                dur=_fmt(agent["avg_duration_s"]),
+                turns=_fmt(agent["avg_turns"]),
+                calls=_fmt(agent["avg_tool_calls"]),
+                fails=_fmt(agent["avg_tool_failures"]),
+                tokens=_fmt(agent["avg_total_tokens"]),
             )
         )
 
-    # Cost, variability, and productivity -- the paper's token-economics lens.
-    if any(v.get("avg_aiu") is not None for v in summary["variants"]):
+    if any(agent.get("avg_aiu") is not None for agent in summary["agents"]):
         lines += [
             "",
-            "## Cost & token economics",
-            "",
-            "| Variant | Avg AIU | AIU CV | AIU / solve | Avg tokens | Token CV "
-            "| Avg cache-read | Avg lines + | API time (s) |",
-            "| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |",
+            "| Agent | Avg AIU | AIU CV | AIU / solve | Avg tokens | Token CV "
+            "| Avg API duration (s) |",
+            "| --- | ---: | ---: | ---: | ---: | ---: | ---: |",
         ]
-        for v in summary["variants"]:
+        for agent in summary["agents"]:
             lines.append(
-                "| {name} | {aiu} | {cva} | {aps} | {tok} | {cvt} | {cr} | {la} | {api} |".format(
-                    name=v["name"],
-                    aiu=_aiu(v.get("avg_aiu")),
-                    cva=_fmt(v.get("cv_aiu")),
-                    aps=_aiu(v.get("aiu_per_solve")),
-                    tok=_fmt(v.get("avg_total_tokens")),
-                    cvt=_fmt(v.get("cv_total_tokens")),
-                    cr=_fmt(v.get("avg_cache_read_tokens")),
-                    la=_fmt(v.get("avg_lines_added")),
-                    api=_fmt(v.get("avg_api_duration_s")),
+                "| {name} | {avg_aiu} | {aiu_cv} | {aiu_solve} | {tokens} | {token_cv} | "
+                "{api} |".format(
+                    name=agent["name"],
+                    avg_aiu=_aiu(agent.get("avg_aiu")),
+                    aiu_cv=_fmt(agent.get("aiu_cv")),
+                    aiu_solve=_aiu(agent.get("aiu_per_solve")),
+                    tokens=_fmt(agent.get("avg_total_tokens")),
+                    token_cv=_fmt(agent.get("total_tokens_cv")),
+                    api=_fmt(agent.get("avg_api_duration_s")),
                 )
             )
-    # Suite coverage: both measures side by side (mean-success and resolved@k).
+
     if summary.get("n_tasks", 1) > 1:
         lines += [
             "",
-            "## Suite coverage",
+            "## Task suite coverage",
             "",
-            "| Variant | Tasks | Mean success | Resolved@k |",
+            "| Agent | Tasks | Mean success | Resolved@k |",
             "| --- | ---: | ---: | ---: |",
         ]
-        for v in summary["variants"]:
+        for agent in summary["agents"]:
             lines.append(
-                "| {name} | {nt} | {ms} | {rk} |".format(
-                    name=v["name"],
-                    nt=v.get("n_tasks", "-"),
-                    ms=_pct(v.get("mean_resolved_rate")),
-                    rk=_pct(v.get("resolved_at_k_rate")),
-                )
+                f"| {agent['name']} | {agent.get('n_tasks', '-')} | "
+                f"{_pct(agent.get('mean_resolved_rate'))} | "
+                f"{_pct(agent.get('resolved_at_k_rate'))} |"
             )
 
-        # Per-task breakdown: which tasks each variant solved (mean success).
         lines += [
             "",
-            "## Per-task breakdown",
-            "",
-            "| Variant | Task | Trials | Mean success | Resolved@k | Avg AIU |",
+            "| Agent | Task | Trials | Mean success | Resolved@k | Avg AIU |",
             "| --- | --- | ---: | ---: | ---: | ---: |",
         ]
-        for v in summary["variants"]:
-            for t in v.get("tasks", []):
-                resolved = t.get("resolved")
-                rk = "-" if resolved is None else ("yes" if resolved else "no")
+        for agent in summary["agents"]:
+            for task in agent.get("tasks", []):
                 lines.append(
-                    "| {vn} | {tn} | {n} | {ms} | {rk} | {aiu} |".format(
-                        vn=v["name"],
-                        tn=t.get("name") or t["task"],
-                        n=t["n_trials"],
-                        ms=_pct(t.get("success_rate")),
-                        rk=rk,
-                        aiu=_aiu(t.get("avg_aiu")),
-                    )
+                    f"| {agent['name']} | {task['task_slug']} | {task['n_trials']} | "
+                    f"{_pct(task.get('success_rate'))} | {_pct(task.get('resolved_rate'))} | "
+                    f"{_aiu(task.get('avg_aiu'))} |"
                 )
 
-    lines.append("")
-    return "\n".join(lines)
+    return "\n".join(lines) + "\n"
diff --git a/src/copilot_experiments/runner.py b/src/copilot_experiments/runner.py
deleted file mode 100644
index f2f7795..0000000
--- a/src/copilot_experiments/runner.py
+++ /dev/null
@@ -1,591 +0,0 @@
-"""Orchestrate running an experiment: variants x trials -> result artifacts."""
-
-from __future__ import annotations
-
-import os
-import subprocess
-import tempfile
-from collections.abc import Callable
-from pathlib import Path
-
-from ._util import (
-    force_rmtree,
-    iso,
-    new_run_id,
-    new_session_id,
-    read_json,
-    utcnow,
-    write_json,
-    write_text,
-)
-from .analysis import analyze_events
-from .auth import INJECTED_TOKEN_ENV_VAR, secret_env_names
-from .index import connect, index_run_dir
-from .invoker import CopilotInvoker, Invocation, Invoker, MockInvoker
-from .models import (
-    DryRunCheck,
-    DryRunReport,
-    Experiment,
-    ExperimentRun,
-    Metrics,
-    Task,
-    TaskResult,
-    TrialResult,
-    TrialStatus,
-    Variant,
-    VariantResult,
-)
-from .report import build_summary, summary_markdown
-from .sessionlog import copy_events, load_events, parse_metrics
-from .storage import Layout
-from .workspace import capture_diff, provision, run_shell
-
-
-def _git_head(root: Path) -> str | None:
-    proc = subprocess.run(
-        ["git", "rev-parse", "HEAD"], cwd=str(root), capture_output=True, text=True
-    )
-    return proc.stdout.strip() if proc.returncode == 0 else None
-
-
-def _report(progress: Callable[[str], None] | None, msg: str) -> None:
-    """Forward a human-readable progress line to ``progress`` if one is set."""
-    if progress is not None:
-        progress(msg)
-
-
-def run_experiment(
-    experiment: Experiment,
-    *,
-    root: Path | None = None,
-    invoker: Invoker | None = None,
-    results_root: Path | None = None,
-    session_state_root: Path | None = None,
-    copilot_binary: str = "copilot",
-    github_token: str | None = None,
-    progress: Callable[[str], None] | None = None,
-    copilot_stream: Callable[[str], None] | None = None,
-) -> ExperimentRun:
-    """Run every variant x trial of ``experiment`` and write result artifacts.
-
-    Parameters
-    ----------
-    root:
-        Experiment repository root (defaults to the current directory). Fixtures
-        and experiment definitions are read from here.
-    invoker:
-        Strategy used to invoke Copilot. Defaults to :class:`CopilotInvoker`.
-        Tests pass a :class:`MockInvoker`; :func:`dry_run_experiment` uses one too.
-    results_root:
-        Where run artifacts are written. Defaults to ``root/results``. Pointed at a
-        throwaway temp dir by :func:`dry_run_experiment` so nothing is persisted.
-    session_state_root:
-        Where Copilot session state lives. Defaults to ``~/.copilot/session-state``.
-    github_token:
-        Token injected into every trial's environment so Copilot is authenticated
-        without relying on ambient login. Resolved and preflighted by the CLI (see
-        :mod:`copilot_experiments.auth`). It is never persisted or logged, and the
-        variable carrying it is added to ``copilot --secret-env-vars``.
-    progress:
-        Optional sink for high-level per-trial phase messages (``--verbose``).
-    copilot_stream:
-        Optional sink for Copilot's live output, one rendered line at a time
-        (``--verbose``). Only used when the default :class:`CopilotInvoker` is built.
-    """
-    root = Path(root or Path.cwd()).resolve()
-    layout = Layout(root, results_root=results_root)
-
-    if invoker is None:
-        invoker = CopilotInvoker(binary=copilot_binary, stream=copilot_stream)
-
-    run_id = new_run_id()
-    run_dir = layout.run_dir(experiment.slug, run_id)
-    run_dir.mkdir(parents=True, exist_ok=True)
-
-    run = ExperimentRun(
-        run_id=run_id,
-        experiment_slug=experiment.slug,
-        experiment_name=experiment.name,
-        experiment_description=experiment.description,
-        started_at=iso(utcnow()),
-        git_base=_git_head(root),
-    )
-
-    for variant in experiment.variants:
-        _report(progress, f"variant {variant.slug}: {variant.trials} trial(s)")
-        vr = _run_variant(
-            experiment,
-            variant,
-            layout,
-            run_id,
-            invoker,
-            session_state_root,
-            github_token,
-            progress,
-        )
-        run.variants.append(vr)
-        write_json(
-            layout.variant_dir(experiment.slug, run_id, variant.slug) / "variant.json",
-            variant.stored(),
-        )
-
-    run.finished_at = iso(utcnow())
-    run.status = run.rollup_status()
-
-    # Write run manifest, summary, and report.
-    write_json(run_dir / "run.json", run.model_dump(mode="json"))
-    summary = build_summary(run)
-    write_json(run_dir / "summary.json", summary)
-    write_text(run_dir / "summary.md", summary_markdown(summary, experiment.description))
-
-    # Update the SQLite index.
-    conn = connect(layout.index_db)
-    try:
-        index_run_dir(conn, run_dir)
-    finally:
-        conn.close()
-
-    return run
-
-
-def _run_variant(
-    experiment: Experiment,
-    variant: Variant,
-    layout: Layout,
-    run_id: str,
-    invoker: Invoker,
-    session_state_root: Path | None,
-    github_token: str | None = None,
-    progress: Callable[[str], None] | None = None,
-) -> VariantResult:
-    vr = VariantResult(variant=variant)
-    for task_slug, task in experiment.iter_tasks():
-        vr.tasks.append(
-            _run_task(
-                experiment,
-                variant,
-                task_slug,
-                task,
-                layout,
-                run_id,
-                invoker,
-                session_state_root,
-                github_token,
-                progress,
-            )
-        )
-    return vr
-
-
-def _run_task(
-    experiment: Experiment,
-    variant: Variant,
-    task_slug: str,
-    task: Task,
-    layout: Layout,
-    run_id: str,
-    invoker: Invoker,
-    session_state_root: Path | None,
-    github_token: str | None = None,
-    progress: Callable[[str], None] | None = None,
-) -> TaskResult:
-    _report(progress, f"variant {variant.slug} / task {task_slug}: {variant.trials} trial(s)")
-    task_dir = layout.task_dir(experiment.slug, run_id, variant.slug, task_slug)
-    task_dir.mkdir(parents=True, exist_ok=True)
-    write_json(task_dir / "task.json", task.model_dump(mode="json", exclude_none=True))
-
-    tr = TaskResult(
-        task_slug=task_slug,
-        task_name=task.name,
-        prompt=task.prompt,
-    )
-    for trial_no in range(1, variant.trials + 1):
-        tr.trials.append(
-            _run_trial(
-                experiment,
-                variant,
-                task_slug,
-                task,
-                trial_no,
-                layout,
-                run_id,
-                invoker,
-                session_state_root,
-                github_token,
-                progress,
-            )
-        )
-    return tr
-
-
-def _run_trial(
-    experiment: Experiment,
-    variant: Variant,
-    task_slug: str,
-    task: Task,
-    trial_no: int,
-    layout: Layout,
-    run_id: str,
-    invoker: Invoker,
-    session_state_root: Path | None,
-    github_token: str | None = None,
-    progress: Callable[[str], None] | None = None,
-) -> TrialResult:
-    tag = f"{variant.slug}/{task_slug}/{trial_no:03d}"
-    trial_dir = layout.trial_dir(experiment.slug, run_id, variant.slug, task_slug, trial_no)
-    trial_dir.mkdir(parents=True, exist_ok=True)
-    workspace = trial_dir / "workspace"
-    # ``stdout.txt``: the raw combined stdout/stderr of the copilot process (plain text,
-    # which is what an auth/usage error actually is). ``session.md``: Copilot's own
-    # markdown transcript (``--share``). ``events.jsonl`` (copied below) stays the
-    # structured data source.
-    stdout_path = trial_dir / "stdout.txt"
-    share_path = trial_dir / "session.md"
-    # Copilot's own --log-dir debug log is large (megabytes) and echoes masked auth
-    # material; keep it in an ephemeral temp dir so it never lands under results/.
-    # The session events.jsonl (copied below) is our real data source -- see ADR-0010.
-    log_dir = Path(tempfile.mkdtemp(prefix="copilot-log-"))
-
-    session_id = new_session_id()
-    metrics = Metrics()
-    success: bool | None = None
-    exit_code = -1
-    duration_s = 0.0
-    status: TrialStatus = "ok"
-    error: str | None = None
-    error_artifact: str | None = None
-
-    try:
-        write_text(trial_dir / "prompt.md", task.prompt)
-        provision(task, workspace, layout.root)
-        _report(progress, f"[{tag}] workspace provisioned -> {workspace}")
-
-        env_overrides: dict[str, str] = {}
-        if github_token:
-            env_overrides[INJECTED_TOKEN_ENV_VAR] = github_token
-        otel_path = trial_dir / "copilot-otel.jsonl"
-        _configure_otel_env(
-            env_overrides,
-            variant,
-            session_id=session_id,
-            otel_path=otel_path,
-            experiment_slug=experiment.slug,
-            task_slug=task_slug,
-            trial_no=trial_no,
-        )
-        inv = Invocation(
-            prompt=task.prompt,
-            workspace=workspace,
-            session_id=session_id,
-            variant=variant,
-            log_dir=log_dir,
-            stdout_path=stdout_path,
-            session_state_root=session_state_root or _default_session_state_root(),
-            env_overrides=env_overrides,
-            share_path=share_path,
-            secret_env_names=secret_env_names(
-                variant.env, byok_secrets=variant.provider is not None
-            ),
-        )
-        _report(progress, f"[{tag}] invoking copilot (session {session_id})")
-        result = invoker.run(inv)
-        exit_code = result.exit_code
-        duration_s = result.duration_s
-        _report(
-            progress,
-            f"[{tag}] copilot exited {exit_code} in {duration_s:.1f}s",
-        )
-
-        # Collect the session events and parse metrics.
-        copy_events(session_id, trial_dir / "events.jsonl", inv.session_state_root)
-        events = load_events(trial_dir / "events.jsonl")
-        metrics = parse_metrics(events)
-        if metrics.duration_s is None:
-            metrics.duration_s = round(duration_s, 3)
-        _report(
-            progress,
-            f"[{tag}] session log: {len(events)} events -> {metrics.n_turns} turns, "
-            f"{metrics.n_tool_calls} tool calls, {metrics.total_tokens or 0} tokens",
-        )
-
-        # Build and persist the richer session analysis (timeline, tool histogram).
-        otel_records = load_events(otel_path) if otel_path.exists() else None
-        analysis = analyze_events(events, otel_records)
-        write_json(trial_dir / "analysis.json", analysis.model_dump(mode="json"))
-
-        # Capture what changed in the workspace.
-        write_text(trial_dir / "workspace.diff", capture_diff(workspace))
-
-        # Run the verification command, if any.
-        if task.verify:
-            code, output = run_shell(task.verify, workspace)
-            success = code == 0
-            write_json(
-                trial_dir / "verify.json",
-                {"command": task.verify, "exit_code": code, "success": success, "output": output},
-            )
-            _report(progress, f"[{tag}] verify: {'pass' if success else 'fail'} (exit {code})")
-
-        # Copilot ran, but did it actually do anything? A non-zero exit or an empty
-        # session log (0 turns) means it never really started -- an infra/harness
-        # problem (bad auth, bad working dir), not the experiment failing on merit.
-        no_session_log = len(events) == 0 and metrics.n_turns == 0
-        if exit_code != 0 or no_session_log:
-            status = "copilot_failed"
-            reasons = []
-            if exit_code != 0:
-                reasons.append(f"copilot exited {exit_code}")
-            if no_session_log:
-                reasons.append("no session log captured (0 turns)")
-            error = "; ".join(reasons)
-            error_artifact = stdout_path.name
-            _report(progress, f"[{tag}] copilot did not run cleanly: {error}")
-    except Exception as exc:  # noqa: BLE001 - any pipeline failure is a harness error
-        status = "harness_error"
-        error = f"{type(exc).__name__}: {exc}"
-        error_artifact = stdout_path.name if stdout_path.exists() else None
-        _report(progress, f"[{tag}] harness error: {error}")
-    finally:
-        force_rmtree(log_dir)
-
-    trial = TrialResult(
-        trial_no=trial_no,
-        session_id=session_id,
-        exit_code=exit_code,
-        duration_s=round(duration_s, 3),
-        success=success,
-        metrics=metrics,
-        status=status,
-        error=error,
-        error_artifact=error_artifact,
-    )
-    write_json(
-        trial_dir / "meta.json",
-        {
-            "trial_no": trial_no,
-            "session_id": session_id,
-            "exit_code": exit_code,
-            "duration_s": trial.duration_s,
-            "success": success,
-            "status": status,
-            "error": error,
-            "error_artifact": error_artifact,
-            "workspace": str(workspace),
-        },
-    )
-    write_json(trial_dir / "metrics.json", metrics.model_dump(mode="json"))
-    return trial
-
-
-def _default_session_state_root() -> Path:
-    from .sessionlog import session_state_root
-
-    return session_state_root()
-
-
-def _configure_otel_env(
-    env_overrides: dict[str, str],
-    variant: Variant,
-    *,
-    session_id: str,
-    otel_path: Path,
-    experiment_slug: str,
-    task_slug: str,
-    trial_no: int,
-) -> None:
-    if not _otel_destination_configured(env_overrides, variant):
-        env_overrides["COPILOT_OTEL_FILE_EXPORTER_PATH"] = str(otel_path.resolve())
-    if not _otel_env_active(env_overrides, variant):
-        return
-    _setdefault_child_env(env_overrides, variant, "COPILOT_OTEL_SOURCE_NAME", "copilot-experiments")
-    _setdefault_child_env(env_overrides, variant, "OTEL_SERVICE_NAME", "copilot-experiments")
-    _append_otel_resource_attributes(
-        env_overrides,
-        variant,
-        {
-            "copilot.session_id": session_id,
-            "copilot.experiment": experiment_slug,
-            "copilot.variant": variant.slug,
-            "copilot.task": task_slug,
-            "copilot.trial": f"{trial_no:03d}",
-        },
-    )
-
-
-def _env_value(env_overrides: dict[str, str], variant: Variant, name: str) -> str | None:
-    return env_overrides.get(name) or variant.env.get(name) or os.environ.get(name)
-
-
-def _setdefault_child_env(
-    env_overrides: dict[str, str], variant: Variant, name: str, value: str
-) -> None:
-    if _env_value(env_overrides, variant, name) is None:
-        env_overrides[name] = value
-
-
-def _otel_destination_configured(env_overrides: dict[str, str], variant: Variant) -> bool:
-    return bool(
-        _env_value(env_overrides, variant, "COPILOT_OTEL_FILE_EXPORTER_PATH")
-        or _env_value(env_overrides, variant, "OTEL_EXPORTER_OTLP_ENDPOINT")
-    )
-
-
-def _otel_env_active(env_overrides: dict[str, str], variant: Variant) -> bool:
-    return bool(
-        _env_value(env_overrides, variant, "COPILOT_OTEL_ENABLED")
-        or _env_value(env_overrides, variant, "COPILOT_OTEL_FILE_EXPORTER_PATH")
-        or _env_value(env_overrides, variant, "OTEL_EXPORTER_OTLP_ENDPOINT")
-    )
-
-
-def _append_otel_resource_attributes(
-    env_overrides: dict[str, str], variant: Variant, attributes: dict[str, str]
-) -> None:
-    existing = _env_value(env_overrides, variant, "OTEL_RESOURCE_ATTRIBUTES") or ""
-    existing_keys = {
-        part.split("=", 1)[0].strip()
-        for part in existing.split(",")
-        if "=" in part and part.split("=", 1)[0].strip()
-    }
-    additions = [f"{key}={value}" for key, value in attributes.items() if key not in existing_keys]
-    if additions:
-        env_overrides["OTEL_RESOURCE_ATTRIBUTES"] = ",".join(
-            [part for part in (existing, *additions) if part]
-        )
-
-
-# --------------------------------------------------------------------------- #
-# Dry-run: validate the whole pipeline, persist nothing
-# --------------------------------------------------------------------------- #
-def dry_run_experiment(
-    experiment: Experiment,
-    *,
-    root: Path | None = None,
-    invoker: Invoker | None = None,
-) -> DryRunReport:
-    """Validate the full run pipeline without leaving anything behind.
-
-    Runs every stage with a mock invoker inside a throwaway temp directory,
-    asserts that each stage produced its artifact, then deletes the temp dir.
-    Fixtures are still read from ``root``; only the *outputs* are redirected.
-    Returns a :class:`DryRunReport` -- nothing is persisted under ``root``.
-    """
-    root = Path(root or Path.cwd())
-    tmp = Path(tempfile.mkdtemp(prefix="copilot-exp-dryrun-"))
-    try:
-        run = run_experiment(
-            experiment,
-            root=root,
-            invoker=invoker or MockInvoker(),
-            results_root=tmp,
-            session_state_root=tmp / ".session-state",
-        )
-        layout = Layout(root, results_root=tmp)
-        checks = _validate_plumbing(layout, experiment, run)
-        return DryRunReport(experiment=experiment.name, checks=checks)
-    finally:
-        force_rmtree(tmp)
-
-
-def _check(name: str, ok: bool, detail: str = "") -> DryRunCheck:
-    return DryRunCheck(name=name, ok=ok, detail=detail)
-
-
-def _validate_plumbing(
-    layout: Layout, experiment: Experiment, run: ExperimentRun
-) -> list[DryRunCheck]:
-    """Inspect the on-disk artifacts of the first trial (and the run) and report
-    whether each pipeline stage actually did its job."""
-    checks: list[DryRunCheck] = []
-    variant = experiment.variants[0]
-    task_slug, task = experiment.iter_tasks()[0]
-    run_dir = layout.run_dir(experiment.slug, run.run_id)
-    trial_dir = layout.trial_dir(experiment.slug, run.run_id, variant.slug, task_slug, 1)
-    workspace = trial_dir / "workspace"
-
-    # 1. Workspace provisioned with a git baseline.
-    head = _git_head(workspace) if workspace.exists() else None
-    checks.append(
-        _check(
-            "workspace provisioned",
-            workspace.exists() and head is not None,
-            f"git baseline {head[:10]}" if head else "no workspace / git HEAD",
-        )
-    )
-
-    # 2. Session log captured and parseable.
-    events_path = trial_dir / "events.jsonl"
-    n_events = 0
-    if events_path.exists():
-        try:
-            n_events = len(load_events(events_path))
-        except Exception:  # pragma: no cover - defensive
-            n_events = 0
-    checks.append(
-        _check("session log captured", events_path.exists() and n_events >= 1, f"{n_events} events")
-    )
-
-    # 3. OTel file captured for per-call economics.
-    otel_path = trial_dir / "copilot-otel.jsonl"
-    n_otel = 0
-    if otel_path.exists():
-        try:
-            n_otel = len(load_events(otel_path))
-        except Exception:  # pragma: no cover - defensive
-            n_otel = 0
-    checks.append(_check("otel captured", otel_path.exists() and n_otel >= 1, f"{n_otel} records"))
-
-    # 4. Metrics parsed from the session log.
-    metrics_path = trial_dir / "metrics.json"
-    n_turns = int(read_json(metrics_path).get("n_turns") or 0) if metrics_path.exists() else 0
-    checks.append(
-        _check("metrics parsed", metrics_path.exists() and n_turns >= 1, f"{n_turns} turns")
-    )
-
-    # 5. Session analysis written.
-    checks.append(_check("analysis written", (trial_dir / "analysis.json").exists()))
-
-    # 6. Workspace diff captured and non-empty -- this is what caught the MAX_PATH bug.
-    diff_path = trial_dir / "workspace.diff"
-    diff = diff_path.read_text(encoding="utf-8") if diff_path.exists() else ""
-    checks.append(
-        _check(
-            "workspace diff captured",
-            diff.strip() != "",
-            f"{len(diff)} bytes" if diff.strip() else "empty diff (invoker changed nothing?)",
-        )
-    )
-
-    # 6. Verification ran (we only assert it ran, not that it passed).
-    if task.verify:
-        checks.append(_check("verify ran", (trial_dir / "verify.json").exists()))
-
-    # 7. Run-level summary written.
-    checks.append(
-        _check(
-            "run summary written",
-            (run_dir / "summary.json").exists() and (run_dir / "summary.md").exists(),
-        )
-    )
-
-    # 7b. Task axis present on disk (variants/<v>/tasks/<task>/...).
-    checks.append(
-        _check(
-            "task dir present",
-            layout.task_dir(experiment.slug, run.run_id, variant.slug, task_slug).is_dir(),
-            f"tasks/{task_slug}",
-        )
-    )
-
-    # 8. Run recorded in the SQLite index.
-    indexed = False
-    if layout.index_db.exists():
-        conn = connect(layout.index_db)
-        try:
-            row = conn.execute("SELECT 1 FROM runs WHERE run_id = ?", (run.run_id,)).fetchone()
-            indexed = row is not None
-        finally:
-            conn.close()
-    checks.append(_check("indexed", indexed))
-
-    return checks
diff --git a/src/copilot_experiments/storage.py b/src/copilot_experiments/storage.py
index 9b95785..fddb5f0 100644
--- a/src/copilot_experiments/storage.py
+++ b/src/copilot_experiments/storage.py
@@ -1,16 +1,13 @@
-"""Filesystem layout for experiment results.
+"""Filesystem layout for Pier-first experiment repositories.
 
-The Pier refactor makes ``jobs/`` the primary execution output. The previous
-``results/`` tree is still supported for legacy Python experiments and for the
-derived SQLite index.
-
-Pier layout (inside an experiment repository)::
+The filesystem is the source of truth. Concrete Pier runs live under::
 
     jobs/
       <job-name>/
         <run-id>/
           config.json
           result.json
+          copilot-experiments-run.json
           <trial-name>/
             config.json
             result.json
@@ -21,35 +18,6 @@
               copilot-session/**/events.jsonl
             verifier/
             artifacts/
-
-Legacy layout (inside an experiment repository)::
-
-    results/
-      index.db                                  # SQLite cross-run index
-      <experiment-slug>/
-        <run-id>/
-          run.json                              # run manifest
-          summary.json                          # aggregated metrics
-          summary.md                            # human-readable report
-          variants/
-            <variant-slug>/
-              variant.json                      # variant config (secrets redacted)
-              tasks/
-                <task-slug>/
-                  task.json                     # task config (prompt, fixture, verify)
-                  trials/
-                    <NNN>/
-                      meta.json                 # session id, exit code, duration, success, status
-                      prompt.md                 # exact prompt sent
-                      stdout.txt                # raw copilot stdout/stderr (diagnostics)
-                      session.md                # copilot's markdown transcript (--share)
-                      events.jsonl              # copied session events (structured source)
-                      copilot-otel.jsonl         # OTel spans/metrics (per-call economics)
-                      metrics.json              # parsed metrics
-                      analysis.json             # richer session analysis
-                      workspace.diff            # git diff of the workspace
-                      verify.json               # verification result (if any)
-                      workspace/                # the trial's working directory
 """
 
 from __future__ import annotations
@@ -60,150 +28,74 @@
 
 
 class Layout:
-    """Resolves the standard result paths for an experiment repository.
+    """Resolve the standard paths for a Pier experiment repository."""
 
-    ``root`` is where the experiment definitions and fixtures live. ``results_root``
-    is where run artifacts are *written*; it defaults to ``root/results`` but can be
-    pointed elsewhere (e.g. a throwaway temp dir for an ephemeral dry-run) so that
-    reading fixtures and writing results are decoupled.
-    """
-
-    def __init__(self, root: Path, *, results_root: Path | None = None) -> None:
+    def __init__(self, root: Path) -> None:
         self.root = Path(root)
-        self._results_root = Path(results_root) if results_root is not None else None
-
-    @property
-    def results_dir(self) -> Path:
-        return self._results_root if self._results_root is not None else self.root / "results"
 
     @property
     def jobs_dir(self) -> Path:
         return self.root / "jobs"
 
-    @property
-    def index_db(self) -> Path:
-        return self.results_dir / "index.db"
-
     @property
     def experiments_dir(self) -> Path:
         return self.root / "experiments"
 
-    def experiment_dir(self, experiment_slug: str) -> Path:
-        return self.results_dir / experiment_slug
-
-    def run_dir(self, experiment_slug: str, run_id: str) -> Path:
-        return self.experiment_dir(experiment_slug) / run_id
-
-    def variant_dir(self, experiment_slug: str, run_id: str, variant_slug: str) -> Path:
-        return self.run_dir(experiment_slug, run_id) / "variants" / variant_slug
-
-    def task_dir(
-        self, experiment_slug: str, run_id: str, variant_slug: str, task_slug: str
-    ) -> Path:
-        return self.variant_dir(experiment_slug, run_id, variant_slug) / "tasks" / task_slug
-
-    def trial_dir(
-        self,
-        experiment_slug: str,
-        run_id: str,
-        variant_slug: str,
-        task_slug: str,
-        trial_no: int,
-    ) -> Path:
-        return (
-            self.task_dir(experiment_slug, run_id, variant_slug, task_slug)
-            / "trials"
-            / f"{trial_no:03d}"
-        )
-
-    # --- discovery helpers ------------------------------------------------- #
-    def iter_runs(self) -> list[tuple[str, str, Path]]:
-        """Yield ``(experiment_slug, run_id, run_dir)`` for every stored run."""
-        runs: list[tuple[str, str, Path]] = []
-        if not self.results_dir.exists():
-            return runs
-        for exp_dir in sorted(p for p in self.results_dir.iterdir() if p.is_dir()):
-            for run_dir in sorted(p for p in exp_dir.iterdir() if p.is_dir()):
-                if (run_dir / "run.json").exists():
-                    runs.append((exp_dir.name, run_dir.name, run_dir))
-        return runs
-
-    def find_run(self, run_id: str) -> Path | None:
-        """Locate a run directory by exact id or unique prefix."""
-        matches = [rd for _, rid, rd in self.iter_runs() if rid == run_id]
-        if matches:
-            return matches[0]
-        prefix = [rd for _, rid, rd in self.iter_runs() if rid.startswith(run_id)]
-        return prefix[0] if len(prefix) == 1 else None
-
-    def latest_run(self) -> Path | None:
-        runs = self.iter_runs()
-        return runs[-1][2] if runs else None
-
-    # --- Pier discovery helpers ------------------------------------------- #
     def iter_pier_jobs(self) -> list[Path]:
-        """Yield Pier run directories under ``jobs/``.
-
-        New runs live at ``jobs/<job-name>/<run-id>/``. Pre-migration flat
-        ``jobs/<job-name>/`` directories are still recognized for existing data.
-        A Pier run directory is identified by the stable pair ``config.json`` and
-        ``result.json``. The SQLite index remains under ``results/`` because it
-        is a derived cache owned by this project, not by Pier.
-        """
+        """Return concrete Pier run directories under ``jobs/<job>/<run-id>``."""
 
         if not self.jobs_dir.exists():
             return []
-        found: list[Path] = []
-        for path in sorted(p for p in self.jobs_dir.iterdir() if p.is_dir()):
-            is_flat_job = self._is_pier_job_dir(path)
-            if is_flat_job:
-                found.append(path)
-            found.extend(
-                child
-                for child in sorted(p for p in path.iterdir() if p.is_dir())
-                if self._is_pier_job_dir(child)
-                and (not is_flat_job or (child / PIER_RUN_MANIFEST).exists())
+        runs: list[Path] = []
+        for job_group in sorted(path for path in self.jobs_dir.iterdir() if path.is_dir()):
+            runs.extend(
+                run_dir
+                for run_dir in sorted(path for path in job_group.iterdir() if path.is_dir())
+                if self._is_pier_run_dir(run_dir)
             )
-        return sorted(found, key=self._pier_job_sort_key)
+        return sorted(runs, key=self._pier_run_sort_key)
 
-    def find_pier_job(self, job_name: str) -> Path | None:
+    def find_pier_job(self, selector: str) -> Path | None:
         """Locate a Pier run by job name, run id, ``job/run`` id, or unique prefix."""
 
-        jobs = self.iter_pier_jobs()
-        group = self.jobs_dir / job_name
-        group_runs = [path for path in jobs if path.parent == group]
+        runs = self.iter_pier_jobs()
+
+        group = self.jobs_dir / selector
+        group_runs = [path for path in runs if path.parent == group]
         if group_runs:
             return group_runs[-1]
 
-        matches = [
-            path for path in jobs if path.name == job_name or self.pier_job_key(path) == job_name
+        exact = [
+            path for path in runs if path.name == selector or self.pier_job_key(path) == selector
         ]
-        if len(matches) == 1:
-            return matches[0]
+        if len(exact) == 1:
+            return exact[0]
+
         prefix = [
             path
-            for path in jobs
-            if path.name.startswith(job_name) or self.pier_job_key(path).startswith(job_name)
+            for path in runs
+            if path.name.startswith(selector) or self.pier_job_key(path).startswith(selector)
         ]
         return prefix[0] if len(prefix) == 1 else None
 
     def latest_pier_job(self) -> Path | None:
-        jobs = self.iter_pier_jobs()
-        return jobs[-1] if jobs else None
+        runs = self.iter_pier_jobs()
+        return runs[-1] if runs else None
 
     def pier_job_key(self, job_dir: Path) -> str:
-        """Return ``job/run`` for nested runs and the directory name for legacy flat jobs."""
+        """Return the stable ``job/run`` selector for a concrete Pier run directory."""
 
         job_dir = Path(job_dir)
-        if job_dir.parent.parent == self.jobs_dir:
-            return f"{job_dir.parent.name}/{job_dir.name}"
-        return job_dir.name
+        return f"{job_dir.parent.name}/{job_dir.name}"
 
     @staticmethod
-    def _is_pier_job_dir(path: Path) -> bool:
-        return (path / "config.json").exists() and (path / "result.json").exists()
+    def _is_pier_run_dir(path: Path) -> bool:
+        return (
+            (path / "config.json").exists()
+            and (path / "result.json").exists()
+            and (path / PIER_RUN_MANIFEST).exists()
+        )
 
-    def _pier_job_sort_key(self, path: Path) -> tuple[int, str, str]:
-        if path.parent.parent == self.jobs_dir:
-            return (1, path.name, path.parent.name)
-        return (0, path.name, path.name)
+    @staticmethod
+    def _pier_run_sort_key(path: Path) -> tuple[str, str]:
+        return (path.name, path.parent.name)  # run id first so the newest run sorts last
diff --git a/src/copilot_experiments/templates/experiment_repo/.apm/instructions/experiments.instructions.md b/src/copilot_experiments/templates/experiment_repo/.apm/instructions/experiments.instructions.md
index d217898..41fd876 100644
--- a/src/copilot_experiments/templates/experiment_repo/.apm/instructions/experiments.instructions.md
+++ b/src/copilot_experiments/templates/experiment_repo/.apm/instructions/experiments.instructions.md
@@ -8,7 +8,6 @@ applyTo: "**"
 - Experiments are Pier `JobConfig` YAML files in `experiments/*.yaml`.
 - Tasks live under `tasks/<name>/` as Harbor/Pier task directories.
 - Generated Pier job data lives under `jobs/` and must not be edited by hand.
-- Derived query data lives under `results/index.db` and can be rebuilt.
 
 When adding an experiment:
 1. Create a deterministic task directory under `tasks/`.
@@ -16,4 +15,4 @@ When adding an experiment:
 3. Define or update a Pier job YAML in `experiments/`.
 4. Use the local Copilot agent import path:
    `copilot_experiments.pier_agents.copilot_cli:CopilotCli`.
-5. Validate configs with `copilot-experiments run --dry-run` before a real run.
+5. Validate configs with `copilot-experiments validate` before a real run.
diff --git a/src/copilot_experiments/templates/experiment_repo/.apm/prompts/new-experiment.prompt.md b/src/copilot_experiments/templates/experiment_repo/.apm/prompts/new-experiment.prompt.md
index f7a8a04..27b8c0d 100644
--- a/src/copilot_experiments/templates/experiment_repo/.apm/prompts/new-experiment.prompt.md
+++ b/src/copilot_experiments/templates/experiment_repo/.apm/prompts/new-experiment.prompt.md
@@ -11,6 +11,6 @@ Given a task description from the user:
    `instruction.md`, `environment/`, and `tests/test.sh`.
 2. Add `experiments/<slug>.yaml` defining a Pier `JobConfig` with the `copilot-cli` agent, model
    settings, attempts, and artifacts.
-3. Validate with `copilot-experiments run --dry-run` and fix any errors.
+3. Validate with `copilot-experiments validate` and fix any errors.
 
 Ask for the model matrix and number of attempts if not provided.
diff --git a/src/copilot_experiments/templates/experiment_repo/.apm/skills/analyzing-results/SKILL.md b/src/copilot_experiments/templates/experiment_repo/.apm/skills/analyzing-results/SKILL.md
index 171c241..3f5a3c3 100644
--- a/src/copilot_experiments/templates/experiment_repo/.apm/skills/analyzing-results/SKILL.md
+++ b/src/copilot_experiments/templates/experiment_repo/.apm/skills/analyzing-results/SKILL.md
@@ -1,9 +1,9 @@
 ---
 name: analyzing-results
 description: >-
-  Use when analyzing GitHub Copilot experiment results: comparing variants,
+  Use when analyzing GitHub Copilot experiment results: comparing agents,
   measuring success rates and cost-effectiveness, and inspecting session logs to
-  identify failures. Covers Pier jobs and the derived SQLite index.
+  identify failures. Covers Pier jobs and derived summaries.
 ---
 
 # Analyzing results
@@ -31,19 +31,10 @@ tokens, and AIU economics.
 ## CLI
 ```bash
 copilot-experiments list                 # runs + success rates
-copilot-experiments show --last          # per-variant comparison table
+copilot-experiments show --last          # per-agent comparison table
 copilot-experiments inspect <job-name>   # latest run for that Pier job
 copilot-experiments inspect <job-name>/<run-id>  # exact run selector from list
-copilot-experiments analyze --last       # render native Copilot events
-copilot-experiments reindex              # rebuild results/index.db
-```
-
-## SQLite (results/index.db)
-Tables include legacy `experiments`, `runs`, `variants`, `tasks`, `trials` plus Pier
-`pier_jobs` and `pier_trials`. Useful queries:
-```sql
-SELECT model, AVG(success) AS success_rate, COUNT(*) AS n
-FROM pier_trials WHERE success IS NOT NULL GROUP BY model ORDER BY success_rate DESC;
+copilot-experiments analyze <job-name>/<run-id> --agent <agent> --trial <n>
 ```
 
 ## Diagnosing failures
diff --git a/src/copilot_experiments/templates/experiment_repo/.apm/skills/authoring-experiments/SKILL.md b/src/copilot_experiments/templates/experiment_repo/.apm/skills/authoring-experiments/SKILL.md
index a748f0a..b3759ca 100644
--- a/src/copilot_experiments/templates/experiment_repo/.apm/skills/authoring-experiments/SKILL.md
+++ b/src/copilot_experiments/templates/experiment_repo/.apm/skills/authoring-experiments/SKILL.md
@@ -44,5 +44,5 @@ copilot-experiments deepswe-import vendor/deep-swe --n-tasks 3 --sample-seed 0
 
 ## Validate
 ```bash
-copilot-experiments run --dry-run   # validates configs, no credits
+copilot-experiments validate   # validates configs, paths, auth, and backend setup
 ```
diff --git a/src/copilot_experiments/templates/experiment_repo/AGENTS.md.tmpl b/src/copilot_experiments/templates/experiment_repo/AGENTS.md.tmpl
index 87db7cc..d3a27fb 100644
--- a/src/copilot_experiments/templates/experiment_repo/AGENTS.md.tmpl
+++ b/src/copilot_experiments/templates/experiment_repo/AGENTS.md.tmpl
@@ -9,21 +9,20 @@ This repository contains **GitHub Copilot research experiments** built with the
 - `tasks/<name>/` — Harbor/Pier task directories (`task.toml`, `instruction.md`, `environment/`,
   `tests/`).
 - `jobs/` — Pier job outputs. **Never edit by hand**; they are canonical run artifacts.
-- `results/` — derived SQLite index. Regenerate with `copilot-experiments reindex`.
 - `.apm/` — APM-managed agent context (instructions, skills, prompts).
 
 ## Authoring an experiment
 1. Add a deterministic Pier task under `tasks/<name>/`.
 2. Create `experiments/<name>.yaml` with a Pier `JobConfig` using `name: copilot-cli` for
    GitHub Copilot CLI runs.
-3. Validate config loading with `uv run copilot-experiments run --dry-run`.
+3. Validate config loading and environment preflights with `uv run copilot-experiments validate`.
 
 ## Conventions
 - Keep instructions and task environments deterministic and self-contained.
 - Verifier scripts should write `/logs/verifier/reward.txt` or `reward.json` and exit non-zero on
   failure.
 - Use `n_attempts > 1` when you need statistical robustness.
-- Do not commit `jobs/` or `results/` (they are gitignored).
+- Do not commit `jobs/` (it is gitignored).
 - Keep code ruff-formatted/linted and add focused tests for behavior changes.
 
 ## Commands
@@ -34,9 +33,10 @@ If you are using a local checkout of the unpublished harness, replace
 
 ```bash
 uv sync
-uv run copilot-experiments run [--dry-run]
+uv run copilot-experiments validate
+uv run copilot-experiments run
 uv run copilot-experiments list
 uv run copilot-experiments show --last
-uv run copilot-experiments inspect <job-name>/<run-id> --trial <n>
-uv run copilot-experiments analyze <job-name>/<run-id> --trial <n>
+uv run copilot-experiments inspect <job-name>/<run-id> --agent <agent> --trial <n>
+uv run copilot-experiments analyze <job-name>/<run-id> --agent <agent> --trial <n>
 ```
diff --git a/src/copilot_experiments/templates/experiment_repo/README.md.tmpl b/src/copilot_experiments/templates/experiment_repo/README.md.tmpl
index 424cc7a..45b18ec 100644
--- a/src/copilot_experiments/templates/experiment_repo/README.md.tmpl
+++ b/src/copilot_experiments/templates/experiment_repo/README.md.tmpl
@@ -9,7 +9,6 @@ GitHub Copilot research experiments, powered by
 experiments/        # Pier JobConfig YAML files
 tasks/              # Harbor/Pier task directories
 jobs/               # Pier job/run outputs (gitignored)
-results/            # derived SQLite index for queries (gitignored)
 .apm/               # APM agent context (instructions, skills, prompts)
 ```
 
@@ -22,8 +21,8 @@ experiment repo:
 ```bash
 export COPILOT_EXPERIMENTS_REPO=/path/to/github-copilot-lab
 
-# validate Pier job configs without starting Docker or spending credits
-uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments run --dry-run
+# validate Pier job configs, paths, auth, and backend setup
+uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments validate
 
 # run for real through Pier (requires Copilot auth and a supported Pier backend)
 uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments run
@@ -32,8 +31,8 @@ uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments show --last
 
 # explore results
 uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments list
-uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments inspect <job-name>/<run-id> --trial 1
-uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments analyze <job-name>/<run-id> --trial 1
+uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments inspect <job-name>/<run-id> --agent copilot-cli --trial 1
+uvx --from "$COPILOT_EXPERIMENTS_REPO" copilot-experiments analyze <job-name>/<run-id> --agent copilot-cli --trial 1
 ```
 
 In PowerShell, use
@@ -51,8 +50,8 @@ uv sync
 git clone https://github.com/datacurve-ai/deep-swe vendor/deep-swe
 uv run copilot-experiments deepswe-import vendor/deep-swe --n-tasks 3 --sample-seed 0
 
-# validate Pier job configs without starting Docker or spending credits
-uv run copilot-experiments run --dry-run
+# validate Pier job configs, paths, auth, and backend setup
+uv run copilot-experiments validate
 
 # run for real through Pier (requires Copilot auth and a supported Pier backend)
 uv run copilot-experiments run
@@ -61,8 +60,8 @@ uv run copilot-experiments show --last
 
 # explore results
 uv run copilot-experiments list
-uv run copilot-experiments inspect <job-name>/<run-id> --trial 1
-uv run copilot-experiments analyze <job-name>/<run-id> --trial 1
+uv run copilot-experiments inspect <job-name>/<run-id> --agent copilot-cli --trial 1
+uv run copilot-experiments analyze <job-name>/<run-id> --agent copilot-cli --trial 1
 ```
 
 ## Writing experiments
diff --git a/src/copilot_experiments/workspace.py b/src/copilot_experiments/workspace.py
deleted file mode 100644
index c26fbde..0000000
--- a/src/copilot_experiments/workspace.py
+++ /dev/null
@@ -1,116 +0,0 @@
-"""Provision isolated per-trial workspaces and capture their diffs."""
-
-from __future__ import annotations
-
-import shutil
-import subprocess
-from pathlib import Path
-
-from .models import Task
-
-# Applied to every git invocation. ``core.longpaths=true`` lets git write objects
-# under the deep ``results/.../trials/NNN/workspace/.git`` tree on Windows, where the
-# baseline commit and diff would otherwise fail with "Filename too long" (MAX_PATH).
-_GIT_CONFIG = ["-c", "core.longpaths=true"]
-_GIT_IDENTITY = [
-    "-c",
-    "user.email=copilot-experiments@example.com",
-    "-c",
-    "user.name=copilot-experiments",
-]
-
-
-class WorkspaceError(RuntimeError):
-    pass
-
-
-def _git(args: list[str], cwd: Path) -> subprocess.CompletedProcess:
-    return subprocess.run(
-        ["git", *_GIT_CONFIG, *args],
-        cwd=str(cwd),
-        capture_output=True,
-        text=True,
-    )
-
-
-def _git_checked(args: list[str], cwd: Path) -> subprocess.CompletedProcess:
-    """Run git and raise :class:`WorkspaceError` on a non-zero exit.
-
-    Used for the baseline/diff plumbing so a silent git failure (e.g. an
-    unwritable object store) can never masquerade as "no changes".
-    """
-    proc = _git(args, cwd)
-    if proc.returncode != 0:
-        detail = (proc.stderr or proc.stdout or "").strip()
-        raise WorkspaceError(f"git {' '.join(args)} failed (exit {proc.returncode}): {detail}")
-    return proc
-
-
-def run_shell(command: str, cwd: Path, env: dict[str, str] | None = None) -> tuple[int, str]:
-    """Run a shell command in ``cwd``; return (exit_code, combined_output)."""
-    proc = subprocess.run(
-        command,
-        cwd=str(cwd),
-        # Task setup/verify commands are intentionally authored as shell snippets.
-        shell=True,  # nosec B602
-        capture_output=True,
-        text=True,
-        env=env,
-    )
-    return proc.returncode, (proc.stdout or "") + (proc.stderr or "")
-
-
-def provision(task: Task, workspace: Path, repo_root: Path) -> Path:
-    """Create the starting workspace for a trial and commit a git baseline.
-
-    The baseline commit lets us compute a clean diff of whatever Copilot changes.
-    """
-    workspace.mkdir(parents=True, exist_ok=True)
-
-    if task.fixture and task.repo:
-        raise WorkspaceError("Task defines both 'fixture' and 'repo'; choose one.")
-
-    if task.fixture:
-        src = (repo_root / task.fixture).resolve()
-        if not src.is_dir():
-            raise WorkspaceError(f"Fixture directory not found: {src}")
-        shutil.copytree(src, workspace, dirs_exist_ok=True)
-    elif task.repo:
-        proc = _git(["clone", "--quiet", task.repo, "."], workspace)
-        if proc.returncode != 0:
-            raise WorkspaceError(f"git clone failed: {proc.stderr.strip()}")
-        if task.ref:
-            proc = _git(["checkout", "--quiet", task.ref], workspace)
-            if proc.returncode != 0:
-                raise WorkspaceError(f"git checkout {task.ref} failed: {proc.stderr.strip()}")
-
-    # Establish a git baseline so diffing is reliable. These steps are checked:
-    # a silent failure here (historically: Windows MAX_PATH on the deep results
-    # tree) would leave no HEAD and make every diff come back empty.
-    if not (workspace / ".git").exists():
-        _git_checked(["init", "--quiet"], workspace)
-    _git_checked(["add", "-A"], workspace)
-    _git_checked(
-        [*_GIT_IDENTITY, "commit", "--quiet", "--allow-empty", "-m", "baseline"], workspace
-    )
-
-    for command in task.setup:
-        code, output = run_shell(command, workspace)
-        if code != 0:
-            raise WorkspaceError(f"setup command failed ({command!r}): {output.strip()}")
-
-    return workspace
-
-
-def capture_diff(workspace: Path) -> str:
-    """Return a unified diff of all changes since the baseline commit.
-
-    Returns ``""`` only when the workspace has no git baseline at all. If a
-    baseline exists but git fails, a :class:`WorkspaceError` is raised rather
-    than silently reporting "no changes".
-    """
-    if not (workspace / ".git").exists():
-        return ""
-    _git_checked(["add", "-A"], workspace)
-    proc = _git_checked(["diff", "--cached", "HEAD"], workspace)
-    return proc.stdout or ""
diff --git a/tests/conftest.py b/tests/conftest.py
deleted file mode 100644
index 813c9ec..0000000
--- a/tests/conftest.py
+++ /dev/null
@@ -1,68 +0,0 @@
-"""Shared pytest fixtures for the copilot-experiments test suite."""
-
-from __future__ import annotations
-
-import sys
-from pathlib import Path
-
-import pytest
-
-from copilot_experiments import Experiment, Task, Variant
-
-FIXTURES = Path(__file__).parent / "fixtures"
-
-# A portable verify command: succeeds only when a SOLVED marker exists.
-_VERIFY = f'"{sys.executable}" -c "import os,sys; sys.exit(0 if os.path.exists(\'SOLVED\') else 1)"'
-
-
-@pytest.fixture
-def repo_root(tmp_path: Path) -> Path:
-    """A throwaway experiment-repo root with the sample fixture copied in."""
-    fixtures = tmp_path / "fixtures" / "sample_task"
-    fixtures.mkdir(parents=True)
-    (fixtures / "seed.txt").write_text("seed\n", encoding="utf-8")
-    return tmp_path
-
-
-@pytest.fixture
-def experiment() -> Experiment:
-    return Experiment(
-        name="Sample Experiment",
-        description="A tiny experiment used by the test suite.",
-        task=Task(
-            prompt="Create a SOLVED file.",
-            fixture="fixtures/sample_task",
-            verify=_VERIFY,
-        ),
-        variants=[
-            Variant(name="alpha", model="model-a"),
-            Variant(name="beta", model="model-b", trials=2),
-        ],
-    )
-
-
-@pytest.fixture
-def multitask_experiment() -> Experiment:
-    """A 2-task x 2-variant experiment exercising the task suite axis."""
-    return Experiment(
-        name="Suite Experiment",
-        description="A two-task suite used by the test suite.",
-        tasks=[
-            Task(
-                name="First Task",
-                prompt="Create a SOLVED file.",
-                fixture="fixtures/sample_task",
-                verify=_VERIFY,
-            ),
-            Task(
-                name="Second Task",
-                prompt="Create a SOLVED file.",
-                fixture="fixtures/sample_task",
-                verify=_VERIFY,
-            ),
-        ],
-        variants=[
-            Variant(name="alpha", model="model-a"),
-            Variant(name="beta", model="model-b", trials=2),
-        ],
-    )
diff --git a/tests/test_auth.py b/tests/test_auth.py
index 28935f4..c713fd5 100644
--- a/tests/test_auth.py
+++ b/tests/test_auth.py
@@ -1,4 +1,4 @@
-"""GitHub token resolution, preflight, and secret-redaction name selection.
+"""GitHub token resolution and preflight.
 
 These never call the network: the ``gh`` fallback is monkeypatched, and the token
 value must never be logged or persisted (only its source is surfaced).
@@ -14,9 +14,7 @@
     TokenResolution,
     preflight_github_token,
     resolve_github_token,
-    secret_env_names,
 )
-from copilot_experiments.models import Variant
 
 
 def test_resolve_prefers_env_in_precedence_order():
@@ -62,24 +60,3 @@ def test_describe_never_leaks_token_characters():
     described = res.describe()
     assert "super-secret-value" not in described
     assert "env:GH_TOKEN" in described
-
-
-def test_secret_env_names_always_covers_token_vars():
-    names = secret_env_names({}, byok_secrets=False)
-    assert {"COPILOT_GITHUB_TOKEN", "GH_TOKEN", "GITHUB_TOKEN"} <= set(names)
-    assert "COPILOT_PROVIDER_API_KEY" not in names
-
-
-def test_secret_env_names_includes_byok_and_custom_secret_keys():
-    names = secret_env_names({"MY_API_KEY": "x", "PLAIN": "y"}, byok_secrets=True)
-    assert "COPILOT_PROVIDER_API_KEY" in names
-    assert "COPILOT_PROVIDER_BEARER_TOKEN" in names
-    assert "MY_API_KEY" in names
-    assert "PLAIN" not in names
-
-
-def test_variant_secret_env_round_trip():
-    # A token slipped into Variant.env is both redacted on disk and flagged for copilot.
-    v = Variant(name="v", env={"SECRET_TOKEN": "abc"})
-    assert "SECRET_TOKEN" in secret_env_names(v.env, byok_secrets=False)
-    assert "abc" not in str(v.stored())
diff --git a/tests/test_economics.py b/tests/test_economics.py
index 9f345ec..18476d2 100644
--- a/tests/test_economics.py
+++ b/tests/test_economics.py
@@ -1,4 +1,4 @@
-"""Tests for token-economics extraction, AIU pricing, aggregation, and rendering.
+"""Tests for token-economics extraction, AIU pricing, and rendering.
 
 Everything here is offline: synthetic ``session.shutdown`` / ``session.compaction_complete`` /
 ``session.truncation`` fixtures stand in for what the real Copilot CLI writes to ``events.jsonl``.
@@ -12,15 +12,7 @@
 
 from copilot_experiments import pricing
 from copilot_experiments.analysis import analyze_events
-from copilot_experiments.models import (
-    Metrics,
-    TaskResult,
-    TrialResult,
-    Variant,
-    VariantResult,
-)
 from copilot_experiments.render import render_session_analysis
-from copilot_experiments.report import aggregate_variant, build_summary, summary_markdown
 from copilot_experiments.sessionlog import extract_economics, parse_metrics
 
 # Token counts chosen so the default rates price out to a round 1.215 AIU.
@@ -297,67 +289,6 @@ def test_analysis_keeps_shutdown_totals_when_otel_present():
     assert a.economics.aiu == 1.215
 
 
-# --------------------------------------------------------------------------- #
-# report aggregation
-# --------------------------------------------------------------------------- #
-def _variant_result(aius: list[float], successes: list[bool | None]) -> VariantResult:
-    trials = [
-        TrialResult(
-            trial_no=i,
-            session_id=f"s{i}",
-            exit_code=0,
-            duration_s=1.0,
-            success=successes[i],
-            metrics=Metrics(
-                aiu=aius[i],
-                total_tokens=int(aius[i] * 1000),
-                lines_added=10,
-                cache_read_tokens=8000,
-            ),
-        )
-        for i in range(len(aius))
-    ]
-    task = TaskResult(task_slug="task-001", task_name=None, prompt="p", trials=trials)
-    return VariantResult(variant=Variant(name="v", model="claude-opus-4.8"), tasks=[task])
-
-
-def test_aggregate_variant_variance_and_cost():
-    vr = _variant_result([1.0, 3.0], [True, False])
-    agg = aggregate_variant(vr)
-    assert agg["avg_aiu"] == 2.0
-    assert agg["std_aiu"] == 1.414
-    assert agg["cv_aiu"] == 0.707
-    assert agg["total_aiu"] == 4.0
-    # One solved task out of two -> all spend attributed to that single success.
-    assert agg["aiu_per_solve"] == 4.0
-    assert agg["avg_cache_read_tokens"] == 8000
-
-
-def test_aggregate_variant_single_trial_zero_std():
-    vr = _variant_result([2.5], [True])
-    agg = aggregate_variant(vr)
-    assert agg["std_aiu"] == 0.0
-    assert agg["cv_aiu"] is None
-
-
-def test_summary_markdown_has_cost_section():
-    vr = _variant_result([1.0, 3.0], [True, True])
-
-    class _Run:
-        run_id = "r1"
-        experiment_name = "Econ"
-        experiment_slug = "econ"
-        started_at = "2026-01-01T00:00:00Z"
-        finished_at = "2026-01-01T00:10:00Z"
-        status = "completed"
-        variants = [vr]
-
-    md = summary_markdown(build_summary(_Run()))
-    assert "Cost & token economics" in md
-    assert "AIU / solve" in md
-    assert "Total cost:" in md
-
-
 # --------------------------------------------------------------------------- #
 # rendering
 # --------------------------------------------------------------------------- #
diff --git a/tests/test_index.py b/tests/test_index.py
deleted file mode 100644
index 26a97f1..0000000
--- a/tests/test_index.py
+++ /dev/null
@@ -1,61 +0,0 @@
-"""Tests for the SQLite index reindex/list operations."""
-
-from __future__ import annotations
-
-from pathlib import Path
-
-from copilot_experiments import Experiment, run_experiment
-from copilot_experiments.index import connect, list_runs, reindex
-from copilot_experiments.invoker import MockInvoker
-from copilot_experiments.storage import Layout
-
-
-def test_reindex_rebuilds_from_filesystem(repo_root: Path, experiment: Experiment):
-    run = run_experiment(
-        experiment,
-        root=repo_root,
-        invoker=MockInvoker(),
-        session_state_root=repo_root / ".session-state",
-    )
-    layout = Layout(repo_root)
-
-    # Delete the DB and rebuild it purely from results/.
-    layout.index_db.unlink()
-    count = reindex(layout)
-    assert count == 1
-
-    rows = list_runs(layout)
-    assert any(r["run_id"] == run.run_id for r in rows)
-
-
-def test_index_persists_cost_columns(repo_root: Path, experiment: Experiment):
-    run_experiment(
-        experiment,
-        root=repo_root,
-        invoker=MockInvoker(),
-        session_state_root=repo_root / ".session-state",
-    )
-    layout = Layout(repo_root)
-    reindex(layout)
-
-    conn = connect(layout.index_db)
-    try:
-        cols = {r[1] for r in conn.execute("PRAGMA table_info(trials)")}
-        assert {
-            "aiu",
-            "cache_read_tokens",
-            "lines_added",
-            "peak_context_tokens",
-            "n_requests",
-            "n_compactions",
-        } <= cols
-        row = conn.execute(
-            "SELECT aiu, cache_read_tokens, total_tokens, n_requests, lines_added "
-            "FROM trials LIMIT 1"
-        ).fetchone()
-    finally:
-        conn.close()
-    # The MockInvoker emits a self-consistent session.shutdown (1.9275 AIU).
-    assert row["aiu"] == 1.9275
-    assert row["cache_read_tokens"] == 12_000
-    assert row["n_requests"] == 4
diff --git a/tests/test_invoker.py b/tests/test_invoker.py
deleted file mode 100644
index 58da768..0000000
--- a/tests/test_invoker.py
+++ /dev/null
@@ -1,216 +0,0 @@
-"""MockInvoker and argument/env translation tests.
-
-Proves the mock actually mutates the workspace and emits a parseable, multi-turn
-session log, and that a variant is translated into the right ``copilot`` flags
-and environment (including BYOK secrets that must never reach stored artifacts).
-"""
-
-from __future__ import annotations
-
-import os
-import sys
-from pathlib import Path
-
-from copilot_experiments.invoker import (
-    CopilotInvoker,
-    Invocation,
-    MockInvoker,
-    build_args,
-    build_env,
-)
-from copilot_experiments.models import ProviderConfig, Variant
-from copilot_experiments.sessionlog import events_path, load_events, parse_metrics
-
-
-def _inv(tmp_path: Path, variant: Variant, *, session_id: str = "sess-1") -> Invocation:
-    ws = tmp_path / "ws"
-    ws.mkdir(exist_ok=True)
-    return Invocation(
-        prompt="do the thing",
-        workspace=ws,
-        session_id=session_id,
-        variant=variant,
-        log_dir=tmp_path / "logs",
-        stdout_path=tmp_path / "stdout.jsonl",
-        session_state_root=tmp_path / "state",
-    )
-
-
-def test_mock_solver_mutates_workspace(tmp_path: Path):
-    seen: dict[str, Path] = {}
-
-    def solver(ws: Path) -> None:
-        seen["ws"] = ws
-        (ws / "SOLVED").write_text("yes\n", encoding="utf-8")
-
-    inv = _inv(tmp_path, Variant(name="v"))
-    MockInvoker(solver=solver).run(inv)
-
-    assert (inv.workspace / "SOLVED").read_text(encoding="utf-8") == "yes\n"
-    assert seen["ws"] == inv.workspace
-    # With a solver, the default note is not written.
-    assert not (inv.workspace / "MOCK_RUN.md").exists()
-
-
-def test_mock_leaves_note_by_default(tmp_path: Path):
-    inv = _inv(tmp_path, Variant(name="v"))
-    MockInvoker().run(inv)
-    assert (inv.workspace / "MOCK_RUN.md").exists()
-
-
-def test_mock_writes_parseable_multiturn_log(tmp_path: Path):
-    inv = _inv(tmp_path, Variant(name="v", reasoning_effort="high"))
-    result = MockInvoker(turns=4).run(inv)
-    assert result.exit_code == 0
-
-    ev_path = events_path(inv.session_id, inv.session_state_root)
-    assert ev_path.exists()
-    # The same stream is also mirrored to stdout for the trial record.
-    assert inv.stdout_path.exists()
-
-    metrics = parse_metrics(load_events(ev_path))
-    assert metrics.n_turns >= 4
-    assert metrics.n_tool_calls >= 1
-    assert metrics.n_tool_failures >= 1  # the deliberate powershell failure + recovery
-    assert metrics.output_tokens and metrics.output_tokens > 0
-
-
-def test_mock_nonzero_exit_is_reported(tmp_path: Path):
-    inv = _inv(tmp_path, Variant(name="v"))
-    result = MockInvoker(exit_code=2).run(inv)
-    assert result.exit_code == 2
-
-
-def test_build_args_translates_variant_flags():
-    variant = Variant(
-        name="v",
-        model="gpt-x",
-        reasoning_effort="high",
-        agent="my-agent",
-        mode="autopilot",
-        allow_tools=["shell"],
-        deny_tools=["web"],
-    )
-    inv = Invocation(
-        prompt="P",
-        workspace=Path("."),
-        session_id="s",
-        variant=variant,
-        log_dir=Path("l"),
-        stdout_path=Path("o"),
-        session_state_root=Path("st"),
-    )
-    args = build_args(inv)
-
-    assert args[:2] == ["-p", "P"]
-    for flag, value in [
-        ("--model", "gpt-x"),
-        ("--effort", "high"),
-        ("--agent", "my-agent"),
-        ("--mode", "autopilot"),
-    ]:
-        assert flag in args
-        assert args[args.index(flag) + 1] == value
-    assert "--allow-all-tools" in args  # default
-    assert args.count("--allow-tool") == 1
-    assert args.count("--deny-tool") == 1
-
-
-def test_build_args_emits_share_and_secret_env_vars(tmp_path: Path):
-    inv = Invocation(
-        prompt="P",
-        workspace=tmp_path / "ws",
-        session_id="s",
-        variant=Variant(name="v"),
-        log_dir=tmp_path / "logs",
-        stdout_path=tmp_path / "stdout.txt",
-        session_state_root=tmp_path / "state",
-        share_path=tmp_path / "session.md",
-        secret_env_names=["COPILOT_GITHUB_TOKEN", "GH_TOKEN"],
-    )
-    args = build_args(inv)
-
-    share = next(a for a in args if a.startswith("--share="))
-    assert Path(share.split("=", 1)[1]) == (tmp_path / "session.md").resolve()
-
-    secret = next(a for a in args if a.startswith("--secret-env-vars="))
-    assert secret.split("=", 1)[1] == "COPILOT_GITHUB_TOKEN,GH_TOKEN"
-
-
-def test_build_args_omits_share_and_secrets_when_unset(tmp_path: Path):
-    inv = Invocation(
-        prompt="P",
-        workspace=tmp_path / "ws",
-        session_id="s",
-        variant=Variant(name="v"),
-        log_dir=tmp_path / "logs",
-        stdout_path=tmp_path / "stdout.txt",
-        session_state_root=tmp_path / "state",
-    )
-    args = build_args(inv)
-    assert not any(a.startswith("--share") for a in args)
-    assert not any(a.startswith("--secret-env-vars") for a in args)
-
-
-def test_build_args_uses_absolute_workspace_and_log_dir(tmp_path: Path):
-    # Regression: a *relative* ``-C`` was resolved against the process cwd (already
-    # the workspace) and doubled -> ENAMETOOLONG -> Copilot no-op. ``-C`` and
-    # ``--log-dir`` must always be absolute. See ADR-0009.
-    ws = tmp_path / "ws"
-    ws.mkdir()
-    inv = Invocation(
-        prompt="P",
-        workspace=Path("ws"),  # deliberately relative
-        session_id="s",
-        variant=Variant(name="v"),
-        log_dir=Path("logs"),  # deliberately relative
-        stdout_path=tmp_path / "stdout.jsonl",
-        session_state_root=tmp_path / "state",
-    )
-    args = build_args(inv)
-
-    c_path = Path(args[args.index("-C") + 1])
-    log_path = Path(args[args.index("--log-dir") + 1])
-    assert c_path.is_absolute()
-    assert log_path.is_absolute()
-    assert c_path == Path("ws").resolve()
-    assert log_path == Path("logs").resolve()
-
-
-def test_streaming_invoker_tees_lines_to_sink_and_file(tmp_path: Path):
-    # ``--verbose`` relies on the streaming path forwarding every Copilot output
-    # line to the sink *and* still capturing it to stdout.jsonl. Drive it with a
-    # trivial Python subprocess so the test stays offline and cross-platform.
-    collected: list[str] = []
-    invoker = CopilotInvoker(binary=sys.executable, stream=collected.append)
-    stdout_path = tmp_path / "stdout.jsonl"
-    script = "import sys\nfor i in range(3): print('line', i)\nsys.exit(0)\n"
-    code = invoker._run_streaming(
-        [sys.executable, "-c", script], str(tmp_path), dict(os.environ), stdout_path
-    )
-
-    assert code == 0
-    assert collected == ["line 0", "line 1", "line 2"]
-    captured = stdout_path.read_text(encoding="utf-8").splitlines()
-    assert captured == ["line 0", "line 1", "line 2"]
-
-
-def test_build_env_injects_provider_but_storage_redacts():
-    provider = ProviderConfig(base_url="http://localhost:11434/v1", api_key="SECRET-KEY")
-    variant = Variant(name="v", provider=provider, env={"FOO": "bar"})
-    inv = Invocation(
-        prompt="P",
-        workspace=Path("."),
-        session_id="s",
-        variant=variant,
-        log_dir=Path("l"),
-        stdout_path=Path("o"),
-        session_state_root=Path("st"),
-    )
-
-    env = build_env(inv)
-    assert env["FOO"] == "bar"
-    assert env["COPILOT_PROVIDER_API_KEY"] == "SECRET-KEY"
-
-    # The secret must never appear in what gets written to disk.
-    assert "SECRET-KEY" not in str(variant.stored())
diff --git a/tests/test_models.py b/tests/test_models.py
deleted file mode 100644
index d6392b0..0000000
--- a/tests/test_models.py
+++ /dev/null
@@ -1,60 +0,0 @@
-"""Tests for the pydantic models and their helpers."""
-
-from __future__ import annotations
-
-from copilot_experiments import ProviderConfig, Variant
-
-
-def test_provider_to_env_maps_fields():
-    provider = ProviderConfig(
-        base_url="http://localhost:11434/v1",
-        type="openai",
-        api_key="secret-key",
-        model_id="llama3.1",
-    )
-    env = provider.to_env()
-    assert env["COPILOT_PROVIDER_BASE_URL"] == "http://localhost:11434/v1"
-    assert env["COPILOT_PROVIDER_TYPE"] == "openai"
-    assert env["COPILOT_PROVIDER_API_KEY"] == "secret-key"
-    assert env["COPILOT_PROVIDER_MODEL_ID"] == "llama3.1"
-
-
-def test_provider_redacted_masks_secrets():
-    provider = ProviderConfig(base_url="http://x", api_key="secret", bearer_token="tok")
-    redacted = provider.redacted()
-    assert redacted["api_key"] == "***redacted***"
-    assert redacted["bearer_token"] == "***redacted***"
-    assert redacted["base_url"] == "http://x"
-
-
-def test_variant_slug():
-    assert Variant(name="Opus Medium").slug == "opus-medium"
-
-
-def test_variant_stored_redacts_provider_secret():
-    variant = Variant(
-        name="local",
-        provider=ProviderConfig(base_url="http://x", api_key="secret"),
-    )
-    stored = variant.stored()
-    assert stored["provider"]["api_key"] == "***redacted***"
-
-
-def test_variant_stored_redacts_secret_like_env_values():
-    # The free-form env escape hatch must not leak a token into variant.json.
-    variant = Variant(
-        name="byok-via-env",
-        env={
-            "COPILOT_PROVIDER_API_KEY": "sk-live-123",
-            "GITHUB_TOKEN": "ghp_secret",
-            "HTTP_AUTHORIZATION": "Bearer abc",
-            "MY_PASSWORD": "hunter2",
-            "LOG_LEVEL": "debug",  # benign -> preserved
-        },
-    )
-    env = variant.stored()["env"]
-    assert env["COPILOT_PROVIDER_API_KEY"] == "***redacted***"
-    assert env["GITHUB_TOKEN"] == "***redacted***"
-    assert env["HTTP_AUTHORIZATION"] == "***redacted***"
-    assert env["MY_PASSWORD"] == "***redacted***"
-    assert env["LOG_LEVEL"] == "debug"
diff --git a/tests/test_pier_backend.py b/tests/test_pier_backend.py
index 4623ddc..2ddea00 100644
--- a/tests/test_pier_backend.py
+++ b/tests/test_pier_backend.py
@@ -139,22 +139,6 @@ def test_prepare_pier_job_for_run_resume_uses_latest_nested_run(tmp_path: Path):
     assert prepared.resumed
 
 
-def test_prepare_pier_job_for_run_resume_supports_legacy_flat_job(tmp_path: Path):
-    config_path = tmp_path / "job.yaml"
-    config_path.write_text("job_name: smoke\njobs_dir: jobs\n", encoding="utf-8")
-    config = load_pier_job_config(config_path, root=tmp_path)
-    legacy_job = tmp_path / "jobs" / "smoke"
-    legacy_job.mkdir(parents=True)
-    (legacy_job / "config.json").write_text("{}", encoding="utf-8")
-
-    prepared = prepare_pier_job_for_run(config, resume=True)
-
-    assert prepared.run_name == "smoke"
-    assert prepared.config.jobs_dir == tmp_path / "jobs"
-    assert prepared.config.job_name == "smoke"
-    assert prepared.resumed
-
-
 def test_preflight_pier_backend_reports_missing_docker(
     tmp_path: Path, monkeypatch: pytest.MonkeyPatch
 ):
@@ -229,7 +213,8 @@ def fail_if_called():
     result = CliRunner().invoke(app, ["run", "--root", str(tmp_path)])
 
     assert result.exit_code == 1
-    assert "Pier backend preflight failed" in result.output
+    assert "Validation" in result.output
+    assert "smoke: backend" in result.output
     assert "Docker is unavailable" in result.output
 
 
diff --git a/tests/test_pier_results.py b/tests/test_pier_results.py
index bd30cdb..bf2575b 100644
--- a/tests/test_pier_results.py
+++ b/tests/test_pier_results.py
@@ -8,7 +8,6 @@
 from typer.testing import CliRunner
 
 from copilot_experiments.cli import app
-from copilot_experiments.index import connect, index_pier_job_dir
 from copilot_experiments.pier_results import (
     build_pier_summary,
     describe_missing_pier_analysis_source,
@@ -210,12 +209,12 @@ def test_build_pier_summary_reads_native_copilot_events(tmp_path: Path):
     assert summary["run_id"] == "demo-job"
     assert summary["status"] == "completed"
     assert summary["overall_success_rate"] == 1.0
-    variant = summary["variants"][0]
-    assert variant["variant"] == "copilot-cli-gpt-5-mini"
-    assert variant["avg_turns"] == 1.0
-    assert variant["avg_tool_calls"] == 1.0
-    assert variant["avg_total_tokens"] == 15.0
-    assert variant["tasks"][0]["task"] == "textstats"
+    agent = summary["agents"][0]
+    assert agent["agent"] == "copilot-cli-gpt-5-mini"
+    assert agent["avg_turns"] == 1.0
+    assert agent["avg_tool_calls"] == 1.0
+    assert agent["avg_total_tokens"] == 15.0
+    assert agent["tasks"][0]["task"] == "textstats"
 
 
 def test_build_pier_summary_reads_nested_run_identity(tmp_path: Path):
@@ -224,8 +223,8 @@ def test_build_pier_summary_reads_nested_run_identity(tmp_path: Path):
 
     summary = build_pier_summary(job_dir)
 
-    assert summary["experiment"] == "demo-job"
-    assert summary["experiment_slug"] == "demo-job"
+    assert summary["job"] == "demo-job"
+    assert summary["job_name"] == "demo-job"
     assert summary["run_id"] == "20260620-153000"
     assert summary["pier_job_id"] == "demo-job/20260620-153000"
     assert pier_job_identity(job_dir) == {
@@ -236,13 +235,14 @@ def test_build_pier_summary_reads_nested_run_identity(tmp_path: Path):
 
 
 def test_resolve_pier_trial_events(tmp_path: Path):
-    job_dir = _make_pier_job(tmp_path / "jobs" / "demo-job")
+    job_dir = _make_pier_job(tmp_path / "jobs" / "demo-job" / "20260620-153000")
+    write_pier_run_manifest(job_dir, job_name="demo-job", run_id="20260620-153000")
 
     events_path, label = resolve_pier_trial_events(job_dir)
 
     assert events_path is not None
     assert events_path.name == "events.jsonl"
-    assert label == "demo-job · copilot-cli__textstats__1"
+    assert label == "demo-job/20260620-153000 · copilot-cli__textstats__1"
 
 
 def test_build_pier_summary_reads_trajectory_when_native_events_are_absent(tmp_path: Path):
@@ -250,10 +250,10 @@ def test_build_pier_summary_reads_trajectory_when_native_events_are_absent(tmp_p
 
     summary = build_pier_summary(job_dir)
 
-    variant = summary["variants"][0]
-    assert variant["avg_turns"] == 1.0
-    assert variant["avg_tool_calls"] == 1.0
-    assert variant["avg_output_tokens"] == 7.0
+    agent = summary["agents"][0]
+    assert agent["avg_turns"] == 1.0
+    assert agent["avg_tool_calls"] == 1.0
+    assert agent["avg_output_tokens"] == 7.0
 
 
 def test_describe_missing_pier_analysis_source_explains_harness_error(tmp_path: Path):
@@ -268,12 +268,26 @@ def test_describe_missing_pier_analysis_source_explains_harness_error(tmp_path:
 
 
 def test_cli_analyze_reads_pier_job_events(tmp_path: Path):
-    _make_pier_job(tmp_path / "jobs" / "demo-job")
+    job_dir = _make_pier_job(tmp_path / "jobs" / "demo-job" / "20260620-153000")
+    write_pier_run_manifest(job_dir, job_name="demo-job", run_id="20260620-153000")
     runner = CliRunner()
 
     result = runner.invoke(
         app,
-        ["analyze", "demo-job", "--root", str(tmp_path), "--trial", "1", "--max-turns", "5"],
+        [
+            "analyze",
+            "demo-job",
+            "--root",
+            str(tmp_path),
+            "--agent",
+            "copilot-cli",
+            "--task",
+            "textstats",
+            "--trial",
+            "1",
+            "--max-turns",
+            "5",
+        ],
     )
 
     assert result.exit_code == 0, result.output
@@ -283,12 +297,26 @@ def test_cli_analyze_reads_pier_job_events(tmp_path: Path):
 
 
 def test_cli_analyze_reads_pier_job_trajectory_when_events_are_absent(tmp_path: Path):
-    _make_pier_job_with_trajectory(tmp_path / "jobs" / "demo-job")
+    job_dir = _make_pier_job_with_trajectory(tmp_path / "jobs" / "demo-job" / "20260620-153000")
+    write_pier_run_manifest(job_dir, job_name="demo-job", run_id="20260620-153000")
     runner = CliRunner()
 
     result = runner.invoke(
         app,
-        ["analyze", "demo-job", "--root", str(tmp_path), "--trial", "1", "--max-turns", "5"],
+        [
+            "analyze",
+            "demo-job",
+            "--root",
+            str(tmp_path),
+            "--agent",
+            "copilot-cli",
+            "--task",
+            "textstats",
+            "--trial",
+            "1",
+            "--max-turns",
+            "5",
+        ],
     )
 
     assert result.exit_code == 0, result.output
@@ -298,12 +326,24 @@ def test_cli_analyze_reads_pier_job_trajectory_when_events_are_absent(tmp_path:
 
 
 def test_cli_analyze_reports_pier_harness_error_when_logs_are_absent(tmp_path: Path):
-    _make_pier_job_with_harness_error(tmp_path / "jobs" / "demo-job")
+    job_dir = _make_pier_job_with_harness_error(tmp_path / "jobs" / "demo-job" / "20260620-153000")
+    write_pier_run_manifest(job_dir, job_name="demo-job", run_id="20260620-153000")
     runner = CliRunner()
 
     result = runner.invoke(
         app,
-        ["analyze", "demo-job", "--root", str(tmp_path), "--trial", "1"],
+        [
+            "analyze",
+            "demo-job",
+            "--root",
+            str(tmp_path),
+            "--agent",
+            "copilot-cli",
+            "--task",
+            "textstats",
+            "--trial",
+            "1",
+        ],
     )
 
     assert result.exit_code == 1
@@ -329,6 +369,40 @@ def test_cli_list_displays_pier_run_selectors(tmp_path: Path):
     assert "No runs yet" not in result.output
 
 
+def test_cli_validate_checks_pier_config(
+    tmp_path: Path,
+    monkeypatch,
+):
+    experiments = tmp_path / "experiments"
+    task = tmp_path / "tasks" / "one"
+    experiments.mkdir()
+    task.mkdir(parents=True)
+    (experiments / "job.yaml").write_text(
+        "\n".join(
+            [
+                "job_name: demo-job",
+                "jobs_dir: jobs",
+                "agents:",
+                "  - name: copilot-cli",
+                "    model_name: gpt-5-mini",
+                "tasks:",
+                "  - path: ../tasks/one",
+            ]
+        ),
+        encoding="utf-8",
+    )
+    monkeypatch.setattr("copilot_experiments.auth._gh_auth_token", lambda: "token")
+    runner = CliRunner()
+
+    result = runner.invoke(app, ["validate", "--root", str(tmp_path)])
+
+    assert result.exit_code == 0, result.output
+    assert "Pier job configs" in result.output
+    assert "Validation" in result.output
+    assert "demo-job: agents" in result.output
+    assert "auth" in result.output
+
+
 def test_cli_show_accepts_pier_job_run_selector(tmp_path: Path):
     job_dir = _make_pier_job(tmp_path / "jobs" / "demo-job" / "20260620-153000")
     write_pier_run_manifest(job_dir, job_name="demo-job", run_id="20260620-153000")
@@ -351,12 +425,12 @@ def test_cli_show_missing_run_points_to_list(tmp_path: Path):
     result = runner.invoke(app, ["show", "missing", "--root", str(tmp_path)])
 
     assert result.exit_code == 1
-    assert "Run not found" in result.output
+    assert "Pier run not found" in result.output
     assert "copilot-experiments list" in result.output
     assert "job-name/run-id" in result.output
 
 
-def test_write_pier_summary_and_index(tmp_path: Path):
+def test_write_pier_summary(tmp_path: Path):
     job_dir = _make_pier_job(tmp_path / "jobs" / "demo-job" / "20260620-153000")
     write_pier_run_manifest(job_dir, job_name="demo-job", run_id="20260620-153000")
 
@@ -365,22 +439,6 @@ def test_write_pier_summary_and_index(tmp_path: Path):
     assert (job_dir / "summary.json").exists()
     assert (job_dir / "summary.md").exists()
     assert summary["n_trials"] == 1
-
-    conn = connect(tmp_path / "results" / "index.db")
-    try:
-        index_pier_job_dir(conn, job_dir)
-        job = conn.execute("SELECT * FROM pier_jobs WHERE id='demo-job/20260620-153000'").fetchone()
-        trial = conn.execute(
-            "SELECT * FROM pier_trials WHERE job_id='demo-job/20260620-153000'"
-        ).fetchone()
-    finally:
-        conn.close()
-
-    assert job["job_name"] == "demo-job"
-    assert job["run_id"] == "20260620-153000"
-    assert job["success_rate"] == 1.0
-    assert trial["job_name"] == "demo-job"
-    assert trial["run_id"] == "20260620-153000"
-    assert trial["trial_name"] == "copilot-cli__textstats__1"
-    assert trial["success"] == 1
-    assert trial["total_tokens"] == 15.0
+    assert summary["n_agents"] == 1
+    assert summary["agents"][0]["name"] == "copilot-cli-gpt-5-mini"
+    assert "Agent" in (job_dir / "summary.md").read_text(encoding="utf-8")
diff --git a/tests/test_runner.py b/tests/test_runner.py
deleted file mode 100644
index d3b0f86..0000000
--- a/tests/test_runner.py
+++ /dev/null
@@ -1,301 +0,0 @@
-"""End-to-end runner tests using the MockInvoker (no real Copilot needed)."""
-
-from __future__ import annotations
-
-import json
-import sqlite3
-from pathlib import Path
-
-from copilot_experiments import Experiment, Task, Variant, dry_run_experiment, run_experiment
-from copilot_experiments.invoker import MockInvoker
-from copilot_experiments.models import ExperimentRun, TaskResult, TrialResult, VariantResult
-from copilot_experiments.storage import Layout
-
-
-def solve(workspace: Path) -> None:
-    """A MockInvoker solver that completes the sample task."""
-    (workspace / "SOLVED").write_text("done\n", encoding="utf-8")
-
-
-def _mock_run(experiment: Experiment, repo_root: Path, **kwargs):
-    """Run the experiment with the mock invoker, persisting artifacts under repo_root."""
-    return run_experiment(
-        experiment,
-        root=repo_root,
-        invoker=MockInvoker(**kwargs),
-        session_state_root=repo_root / ".session-state",
-    )
-
-
-def test_run_experiment_produces_artifacts(repo_root: Path, experiment: Experiment):
-    run = _mock_run(experiment, repo_root)
-
-    layout = Layout(repo_root)
-    run_dir = layout.run_dir(experiment.slug, run.run_id)
-    assert (run_dir / "run.json").exists()
-    assert (run_dir / "summary.json").exists()
-    assert (run_dir / "summary.md").exists()
-
-    # alpha has 1 trial, beta has 2 -> 3 trial dirs total (single task suite).
-    trial_dirs = list((run_dir / "variants").glob("*/tasks/*/trials/*"))
-    assert len(trial_dirs) == 3
-    for td in trial_dirs:
-        assert (td / "meta.json").exists()
-        assert (td / "metrics.json").exists()
-        assert (td / "analysis.json").exists()
-        assert (td / "events.jsonl").exists()
-        assert (td / "copilot-otel.jsonl").exists()
-        assert (td / "stdout.txt").exists()
-        assert (td / "prompt.md").exists()
-        analysis = json.loads((td / "analysis.json").read_text(encoding="utf-8"))
-        assert analysis["llm_calls"]
-        # Copilot's bulky --log-dir debug log must never be persisted under results/.
-        assert not (td / "logs").exists()
-
-
-def test_run_experiment_without_solver_fails_verify(repo_root: Path, experiment: Experiment):
-    run = _mock_run(experiment, repo_root)
-    successes = [t.success for vr in run.variants for t in vr.all_trials]
-    assert all(s is False for s in successes)
-
-
-def test_run_experiment_with_solver_succeeds(repo_root: Path, experiment: Experiment):
-    run = _mock_run(experiment, repo_root, solver=solve)
-    successes = [t.success for vr in run.variants for t in vr.all_trials]
-    assert all(s is True for s in successes)
-
-    # The on-disk artifacts must corroborate success end-to-end.
-    layout = Layout(repo_root)
-    run_dir = layout.run_dir(experiment.slug, run.run_id)
-    trial = run_dir / "variants" / "alpha" / "tasks" / "task-001" / "trials" / "001"
-
-    diff = (trial / "workspace.diff").read_text(encoding="utf-8")
-    assert "SOLVED" in diff and diff.strip() != ""
-
-    verify = json.loads((trial / "verify.json").read_text(encoding="utf-8"))
-    assert verify["success"] is True and verify["exit_code"] == 0
-
-    meta = json.loads((trial / "meta.json").read_text(encoding="utf-8"))
-    assert meta["success"] is True
-
-
-def test_run_experiment_forwards_progress_per_trial(repo_root: Path, experiment: Experiment):
-    msgs: list[str] = []
-    _mock_run(experiment, repo_root, solver=solve)  # warm-up run (no progress)
-    msgs.clear()
-    run_experiment(
-        experiment,
-        root=repo_root,
-        invoker=MockInvoker(solver=solve),
-        session_state_root=repo_root / ".session-state",
-        progress=msgs.append,
-    )
-
-    # Per-variant header plus a distinct, tagged set of phase lines per trial.
-    assert any(m.startswith("variant beta: 2 trial(s)") for m in msgs)
-    for tag in ("alpha/task-001/001", "beta/task-001/001", "beta/task-001/002"):
-        assert any(m.startswith(f"[{tag}] invoking copilot") for m in msgs)
-        assert any(m.startswith(f"[{tag}] session log:") for m in msgs)
-        assert any(m.startswith(f"[{tag}] verify:") for m in msgs)
-
-
-def test_run_experiment_populates_index(repo_root: Path, experiment: Experiment):
-    run = _mock_run(experiment, repo_root)
-    layout = Layout(repo_root)
-    conn = sqlite3.connect(str(layout.index_db))
-    try:
-        runs = conn.execute("SELECT run_id FROM runs").fetchall()
-        variants = conn.execute("SELECT variant_slug FROM variants").fetchall()
-        trials = conn.execute("SELECT trial_no, task_slug FROM trials").fetchall()
-    finally:
-        conn.close()
-    assert [r[0] for r in runs] == [run.run_id]
-    assert {v[0] for v in variants} == {"alpha", "beta"}
-    assert len(trials) == 3
-    # Single-task sugar still produces exactly one task slug across all trials.
-    assert {t[1] for t in trials} == {"task-001"}
-
-
-def test_run_multitask_experiment(repo_root: Path, multitask_experiment: Experiment):
-    run = _mock_run(multitask_experiment, repo_root, solver=solve)
-    layout = Layout(repo_root)
-    run_dir = layout.run_dir(multitask_experiment.slug, run.run_id)
-
-    # Per-task dirs exist for every variant: 2 variants x 2 tasks = 4 task dirs.
-    task_dirs = sorted(p.name for p in (run_dir / "variants").glob("*/tasks/*"))
-    assert task_dirs == ["first-task", "first-task", "second-task", "second-task"]
-
-    # Each variant result nests two tasks; suite measures reflect all-pass solver.
-    for vr in run.variants:
-        assert len(vr.tasks) == 2
-        assert vr.mean_resolved_rate == 1.0
-        assert vr.resolved_at_k_rate == 1.0
-
-    # Summary records the task axis and the two suite measures side by side.
-    summary = json.loads((run_dir / "summary.json").read_text(encoding="utf-8"))
-    assert summary["n_tasks"] == 2
-    for v in summary["variants"]:
-        assert v["n_tasks"] == 2
-        assert v["mean_resolved_rate"] == 1.0
-        assert v["resolved_at_k_rate"] == 1.0
-        assert {t["task"] for t in v["tasks"]} == {"first-task", "second-task"}
-
-    # Index carries the task dimension.
-    conn = sqlite3.connect(str(layout.index_db))
-    try:
-        tasks = conn.execute("SELECT DISTINCT task_slug FROM tasks").fetchall()
-        trials = conn.execute("SELECT task_slug FROM trials").fetchall()
-    finally:
-        conn.close()
-    assert {t[0] for t in tasks} == {"first-task", "second-task"}
-    # alpha: 2 tasks x 1 trial + beta: 2 tasks x 2 trials = 6 trials.
-    assert len(trials) == 6
-
-
-def test_dry_run_validates_and_leaves_nothing_behind(repo_root: Path, experiment: Experiment):
-    report = dry_run_experiment(experiment, root=repo_root)
-
-    # Every plumbing stage reports OK...
-    assert report.ok, [(c.name, c.detail) for c in report.checks if not c.ok]
-    assert {c.name for c in report.checks} >= {
-        "workspace provisioned",
-        "session log captured",
-        "otel captured",
-        "metrics parsed",
-        "analysis written",
-        "workspace diff captured",
-        "verify ran",
-        "run summary written",
-        "indexed",
-    }
-
-    # ...and absolutely nothing is persisted under the repo root.
-    assert not (repo_root / "results").exists()
-    assert not (repo_root / ".session-state").exists()
-
-
-def test_dry_run_flags_broken_plumbing(repo_root: Path, experiment: Experiment):
-    # An invoker that leaves the workspace untouched (no note, no solver) yields an
-    # empty diff -- exactly the failure mode the MAX_PATH bug produced.
-    report = dry_run_experiment(experiment, root=repo_root, invoker=MockInvoker(leave_note=False))
-
-    assert report.ok is False
-    diff_check = next(c for c in report.checks if c.name == "workspace diff captured")
-    assert diff_check.ok is False
-    # Still leaves nothing behind, even on failure.
-    assert not (repo_root / "results").exists()
-
-
-# --------------------------------------------------------------------------- #
-# Harness vs. experiment failures: status enum + roll-up
-# --------------------------------------------------------------------------- #
-def test_clean_run_marks_trials_ok_and_run_completed(repo_root: Path, experiment: Experiment):
-    run = _mock_run(experiment, repo_root, solver=solve)
-    assert run.status == "completed"
-    assert all(t.status == "ok" for t in run.all_trials)
-    assert run.n_failed_trials == 0
-
-
-def test_copilot_nonzero_exit_is_a_harness_failure(repo_root: Path, experiment: Experiment):
-    # Copilot was invoked but exited non-zero (the auth-failure scenario): every trial
-    # is flagged ``copilot_failed`` and the run rolls up to ``failed`` -- not a clean
-    # "0% success" that hides a broken harness.
-    run = _mock_run(experiment, repo_root, exit_code=1)
-    assert run.status == "failed"
-    assert all(t.status == "copilot_failed" for t in run.all_trials)
-    for vr in run.variants:
-        for t in vr.all_trials:
-            assert t.error and "exited 1" in t.error
-            assert t.error_artifact == "stdout.txt"
-
-    # The status is durable on disk.
-    layout = Layout(repo_root)
-    meta_path = layout.trial_dir(experiment.slug, run.run_id, "alpha", "task-001", 1) / "meta.json"
-    meta = json.loads(meta_path.read_text(encoding="utf-8"))
-    assert meta["status"] == "copilot_failed"
-
-
-def test_harness_error_on_provision_failure_still_records_trial(repo_root: Path):
-    # A missing fixture makes provisioning raise: that is a harness error, the run
-    # continues, and a trial record (status=harness_error) is still written.
-    broken = Experiment(
-        name="Broken",
-        task=Task(prompt="x", fixture="fixtures/does_not_exist"),
-        variants=[Variant(name="alpha")],
-    )
-    run = _mock_run(broken, repo_root)
-    assert run.status == "failed"
-    trial = run.all_trials[0]
-    assert trial.status == "harness_error"
-    assert "WorkspaceError" in (trial.error or "")
-
-    layout = Layout(repo_root)
-    meta = layout.trial_dir(broken.slug, run.run_id, "alpha", "task-001", 1) / "meta.json"
-    assert json.loads(meta.read_text(encoding="utf-8"))["status"] == "harness_error"
-
-
-def test_partial_run_when_some_variants_fail(repo_root: Path):
-    # One variant errors in the harness while the others run cleanly -> ``partial``.
-    failing = Variant(name="alpha")
-    failing_solver_run = ExperimentRun(
-        run_id="r",
-        experiment_slug="s",
-        experiment_name="n",
-        started_at="t",
-        variants=[
-            VariantResult(
-                variant=failing,
-                tasks=[
-                    TaskResult(
-                        task_slug="task-001",
-                        trials=[
-                            TrialResult(
-                                trial_no=1, session_id="a", exit_code=0, duration_s=1.0, status="ok"
-                            ),
-                            TrialResult(
-                                trial_no=2,
-                                session_id="b",
-                                exit_code=1,
-                                duration_s=1.0,
-                                status="copilot_failed",
-                            ),
-                        ],
-                    ),
-                ],
-            ),
-        ],
-    )
-    assert failing_solver_run.rollup_status() == "partial"
-    assert failing_solver_run.n_failed_trials == 1
-
-
-def test_token_injected_into_env_and_flagged_secret(repo_root: Path, experiment: Experiment):
-    # The resolved token reaches each trial's env_overrides and the variable carrying
-    # it is added to copilot's --secret-env-vars, but never to a stored artifact.
-    seen: list = []
-
-    class RecordingInvoker(MockInvoker):
-        def run(self, inv):  # noqa: ANN001 - test double
-            seen.append(inv)
-            return super().run(inv)
-
-    run = run_experiment(
-        experiment,
-        root=repo_root,
-        invoker=RecordingInvoker(solver=solve),
-        session_state_root=repo_root / ".session-state",
-        github_token="secret-token-123",
-    )
-    assert run.status == "completed"
-    assert seen, "invoker was never called"
-    for inv in seen:
-        assert inv.env_overrides.get("COPILOT_GITHUB_TOKEN") == "secret-token-123"
-        assert "COPILOT_GITHUB_TOKEN" in inv.secret_env_names
-        assert inv.share_path is not None and inv.share_path.name == "session.md"
-
-    # The token must not have leaked into any persisted artifact.
-    layout = Layout(repo_root)
-    run_dir = layout.run_dir(experiment.slug, run.run_id)
-    for path in run_dir.rglob("*"):
-        if path.is_file():
-            assert "secret-token-123" not in path.read_text(encoding="utf-8", errors="ignore")
diff --git a/tests/test_storage.py b/tests/test_storage.py
index 3cb3c6a..f707620 100644
--- a/tests/test_storage.py
+++ b/tests/test_storage.py
@@ -1,4 +1,4 @@
-"""Tests for the filesystem Layout helpers."""
+"""Tests for the Pier-only filesystem layout helpers."""
 
 from __future__ import annotations
 
@@ -8,74 +8,42 @@
 from copilot_experiments.storage import Layout
 
 
-def _make_run(root: Path, exp: str, run_id: str) -> Path:
-    rd = root / "results" / exp / run_id
-    rd.mkdir(parents=True)
-    (rd / "run.json").write_text(json.dumps({"run_id": run_id}), encoding="utf-8")
-    return rd
+def _make_pier_run(root: Path, job: str, run_id: str) -> Path:
+    run_dir = root / "jobs" / job / run_id
+    run_dir.mkdir(parents=True)
+    (run_dir / "config.json").write_text("{}", encoding="utf-8")
+    (run_dir / "result.json").write_text("{}", encoding="utf-8")
+    (run_dir / "copilot-experiments-run.json").write_text(
+        json.dumps({"job_name": job, "run_id": run_id, "id": f"{job}/{run_id}"}),
+        encoding="utf-8",
+    )
+    return run_dir
 
 
 def test_layout_paths(tmp_path: Path):
     layout = Layout(tmp_path)
-    assert layout.results_dir == tmp_path / "results"
-    assert layout.index_db == tmp_path / "results" / "index.db"
-    trial = layout.trial_dir("exp", "run1", "v1", "task-001", 3)
-    assert trial.name == "003"
-    assert trial.parent.name == "trials"
-    assert trial.parent.parent.name == "task-001"
-    assert trial.parent.parent.parent.name == "tasks"
-    assert trial.parent.parent.parent.parent.name == "v1"
+    assert layout.jobs_dir == tmp_path / "jobs"
+    assert layout.experiments_dir == tmp_path / "experiments"
 
 
-def test_find_and_latest_run(tmp_path: Path):
-    _make_run(tmp_path, "exp", "20260101T000000Z_aaa111")
-    rd2 = _make_run(tmp_path, "exp", "20260102T000000Z_bbb222")
-    layout = Layout(tmp_path)
-
-    assert layout.latest_run() == rd2
-    assert layout.find_run("20260102T000000Z_bbb222") == rd2
-    # Unique prefix resolves.
-    assert layout.find_run("20260101") is not None
-    # Unknown id returns None.
-    assert layout.find_run("nope") is None
-
-
-def test_iter_runs_skips_incomplete(tmp_path: Path):
-    _make_run(tmp_path, "exp", "good")
-    (tmp_path / "results" / "exp" / "incomplete").mkdir(parents=True)
-    layout = Layout(tmp_path)
-    ids = [rid for _, rid, _ in layout.iter_runs()]
-    assert ids == ["good"]
-
-
-def test_pier_job_helpers(tmp_path: Path):
-    jobs = tmp_path / "jobs"
-    good = jobs / "smoke" / "20260102-000000"
-    good.mkdir(parents=True)
-    (good / "config.json").write_text("{}", encoding="utf-8")
-    (good / "result.json").write_text("{}", encoding="utf-8")
-    latest = jobs / "smoke" / "20260103-000000"
-    latest.mkdir()
-    (latest / "config.json").write_text("{}", encoding="utf-8")
-    (latest / "result.json").write_text("{}", encoding="utf-8")
-    incomplete = jobs / "smoke" / "20260104-000000"
+def test_pier_run_helpers(tmp_path: Path):
+    old = _make_pier_run(tmp_path, "smoke", "20260102-000000")
+    latest = _make_pier_run(tmp_path, "smoke", "20260103-000000")
+    other = _make_pier_run(tmp_path, "other", "20260104-000000")
+    incomplete = tmp_path / "jobs" / "smoke" / "20260105-000000"
     incomplete.mkdir()
-    legacy = jobs / "legacy-job"
-    legacy.mkdir()
-    (legacy / "config.json").write_text("{}", encoding="utf-8")
-    (legacy / "result.json").write_text("{}", encoding="utf-8")
-    legacy_trial = legacy / "copilot-cli__task__1"
-    legacy_trial.mkdir()
-    (legacy_trial / "config.json").write_text("{}", encoding="utf-8")
-    (legacy_trial / "result.json").write_text("{}", encoding="utf-8")
+    flat_legacy = tmp_path / "jobs" / "legacy-job"
+    flat_legacy.mkdir()
+    (flat_legacy / "config.json").write_text("{}", encoding="utf-8")
+    (flat_legacy / "result.json").write_text("{}", encoding="utf-8")
 
     layout = Layout(tmp_path)
 
-    assert layout.iter_pier_jobs() == [legacy, good, latest]
-    assert layout.pier_job_key(good) == "smoke/20260102-000000"
+    assert layout.iter_pier_jobs() == [other, old, latest]
+    assert layout.pier_job_key(old) == "smoke/20260102-000000"
     assert layout.latest_pier_job() == latest
     assert layout.find_pier_job("smoke") == latest
-    assert layout.find_pier_job("smoke/20260102") == good
-    assert layout.find_pier_job("20260102") == good
-    assert layout.find_pier_job("legacy-job") == legacy
+    assert layout.find_pier_job("smoke/20260102") == old
+    assert layout.find_pier_job("20260102") == old
+    assert layout.find_pier_job("legacy-job") is None
     assert layout.find_pier_job("missing") is None
diff --git a/tests/test_workspace.py b/tests/test_workspace.py
deleted file mode 100644
index 4900a34..0000000
--- a/tests/test_workspace.py
+++ /dev/null
@@ -1,151 +0,0 @@
-"""Workspace provisioning and diff-capture tests.
-
-These prove the parts a no-op dry-run does *not*: that provisioning creates a
-real git baseline (a resolvable ``HEAD``) and that subsequent changes are either
-captured as a diff or surfaced as an error. This is the path that silently
-returned an empty ``workspace.diff`` on Windows (MAX_PATH), so the tests assert
-the baseline and diff explicitly rather than trusting an empty string.
-"""
-
-from __future__ import annotations
-
-import subprocess
-from pathlib import Path
-
-import pytest
-
-from copilot_experiments._util import force_rmtree
-from copilot_experiments.models import Task
-from copilot_experiments.workspace import WorkspaceError, capture_diff, provision
-
-
-def _make_fixture(root: Path, name: str = "fix") -> str:
-    d = root / "fixtures" / name
-    d.mkdir(parents=True)
-    (d / "hello.txt").write_text("hello\n", encoding="utf-8")
-    return f"fixtures/{name}"
-
-
-def _git(args: list[str], cwd: Path) -> subprocess.CompletedProcess:
-    return subprocess.run(["git", *args], cwd=str(cwd), capture_output=True, text=True)
-
-
-def test_provision_copies_fixture(tmp_path: Path):
-    fixture = _make_fixture(tmp_path)
-    ws = tmp_path / "ws"
-    provision(Task(prompt="p", fixture=fixture), ws, tmp_path)
-    assert (ws / "hello.txt").read_text(encoding="utf-8") == "hello\n"
-
-
-def test_provision_creates_resolvable_baseline(tmp_path: Path):
-    # The bug: a silently-failed baseline left no HEAD, so every diff was empty.
-    # Assert HEAD resolves and the tree is clean (everything committed).
-    fixture = _make_fixture(tmp_path)
-    ws = tmp_path / "ws"
-    provision(Task(prompt="p", fixture=fixture), ws, tmp_path)
-
-    head = _git(["rev-parse", "HEAD"], ws)
-    assert head.returncode == 0
-    assert head.stdout.strip()  # a real commit sha
-
-    status = _git(["status", "--porcelain"], ws)
-    assert status.stdout.strip() == ""
-
-
-def test_provision_runs_setup_commands(tmp_path: Path):
-    fixture = _make_fixture(tmp_path)
-    ws = tmp_path / "ws"
-    provision(
-        Task(prompt="p", fixture=fixture, setup=["echo seeded > SETUP_RAN"]),
-        ws,
-        tmp_path,
-    )
-    assert (ws / "SETUP_RAN").exists()
-
-
-def test_provision_failing_setup_raises(tmp_path: Path):
-    fixture = _make_fixture(tmp_path)
-    ws = tmp_path / "ws"
-    with pytest.raises(WorkspaceError):
-        provision(Task(prompt="p", fixture=fixture, setup=["exit 7"]), ws, tmp_path)
-
-
-def test_provision_rejects_fixture_and_repo(tmp_path: Path):
-    ws = tmp_path / "ws"
-    with pytest.raises(WorkspaceError):
-        provision(Task(prompt="p", fixture="x", repo="https://example/r.git"), ws, tmp_path)
-
-
-def test_provision_missing_fixture_raises(tmp_path: Path):
-    ws = tmp_path / "ws"
-    with pytest.raises(WorkspaceError):
-        provision(Task(prompt="p", fixture="fixtures/does-not-exist"), ws, tmp_path)
-
-
-def test_capture_diff_reflects_changes(tmp_path: Path):
-    fixture = _make_fixture(tmp_path)
-    ws = tmp_path / "ws"
-    provision(Task(prompt="p", fixture=fixture), ws, tmp_path)
-
-    # Emulate Copilot editing a file and adding a new one.
-    (ws / "hello.txt").write_text("hello world\n", encoding="utf-8")
-    (ws / "NEW.txt").write_text("brand new\n", encoding="utf-8")
-
-    diff = capture_diff(ws)
-    assert "hello world" in diff
-    assert "NEW.txt" in diff
-    assert diff.strip() != ""
-
-
-def test_capture_diff_empty_when_unchanged(tmp_path: Path):
-    fixture = _make_fixture(tmp_path)
-    ws = tmp_path / "ws"
-    provision(Task(prompt="p", fixture=fixture), ws, tmp_path)
-    assert capture_diff(ws) == ""
-
-
-def test_capture_diff_without_git_returns_empty(tmp_path: Path):
-    ws = tmp_path / "plain"
-    ws.mkdir()
-    assert capture_diff(ws) == ""
-
-
-def test_capture_diff_surfaces_git_failure(tmp_path: Path):
-    # A ``.git`` that is not a valid repository must raise, not silently return
-    # "" (the failure mode that hid the broken baseline).
-    ws = tmp_path / "broken"
-    ws.mkdir()
-    (ws / ".git").write_text("not a git repository", encoding="utf-8")
-    with pytest.raises(WorkspaceError):
-        capture_diff(ws)
-
-
-def test_provision_baseline_survives_deep_paths(tmp_path: Path):
-    # Regression for the Windows MAX_PATH failure. The *workspace* path stays
-    # under 260 (so plain Python can create it and copy fixtures into it), but the
-    # nested ``.git/objects/<..>`` files it writes cross 260 -- which only succeeds
-    # when git is invoked with core.longpaths=true. Before the fix this left no
-    # HEAD and every diff came back empty. On POSIX there is no 260-char limit, so
-    # this simply still passes.
-    fixture = _make_fixture(tmp_path)
-
-    deep = tmp_path
-    seg = "deeppath__"  # 10 chars per level
-    first_seg = tmp_path / seg
-    try:
-        # Grow until the workspace path is long enough that the git object files
-        # inside it (~+55 chars) will exceed 260, while the workspace path itself
-        # stays creatable by plain Python (< ~245).
-        while len(str(deep / seg / "workspace")) < 235:
-            deep = deep / seg
-            deep.mkdir()
-        ws = deep / "workspace"
-
-        provision(Task(prompt="p", fixture=fixture), ws, tmp_path)
-        head = _git(["rev-parse", "HEAD"], ws)
-        assert head.returncode == 0
-        assert head.stdout.strip()
-    finally:
-        # The deepest .git/objects paths exceed MAX_PATH, so a plain rmtree
-        # (pytest's teardown) would fail to delete them.
-        force_rmtree(first_seg)

From a7b23a7e71c949053f66a27c3618dd4fdd84deb5 Mon Sep 17 00:00:00 2001
From: Dominique Broeglin <dominique.broeglin@microsoft.com>
Date: Sun, 28 Jun 2026 11:48:01 +0200
Subject: [PATCH 3/3] Fix init scaffold project name

Pass the CLI's --name option to init_experiment_repo via its project_name keyword instead of name, and add a test that covers the CLI init path.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/copilot_experiments/cli.py |  2 +-
 tests/test_scaffold.py         | 13 +++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/src/copilot_experiments/cli.py b/src/copilot_experiments/cli.py
index 75b9d97..5640c5f 100644
--- a/src/copilot_experiments/cli.py
+++ b/src/copilot_experiments/cli.py
@@ -82,7 +82,7 @@ def init(
     """Scaffold a standalone Pier experiment repository."""
 
     try:
-        init_experiment_repo(directory, name=name, force=force)
+        init_experiment_repo(directory, project_name=name, force=force)
     except ScaffoldError as exc:
         err.print(f"[red]Scaffold error:[/red] {exc}")
         raise typer.Exit(1) from exc
diff --git a/tests/test_scaffold.py b/tests/test_scaffold.py
index 94689f1..c8d00ba 100644
--- a/tests/test_scaffold.py
+++ b/tests/test_scaffold.py
@@ -5,7 +5,9 @@
 from pathlib import Path
 
 import pytest
+from typer.testing import CliRunner
 
+from copilot_experiments.cli import app
 from copilot_experiments.scaffold import ScaffoldError, init_experiment_repo
 
 
@@ -47,3 +49,14 @@ def test_init_refuses_nonempty_without_force(tmp_path: Path):
     # With force it proceeds.
     created = init_experiment_repo(dest, force=True)
     assert created
+
+
+def test_cli_init_scaffolds_repository_with_name(tmp_path: Path):
+    dest = tmp_path / "generated"
+
+    result = CliRunner().invoke(app, ["init", str(dest), "--name", "custom-experiment"])
+
+    assert result.exit_code == 0, result.output
+    assert "Initialized" in result.output
+    pyproject = (dest / "pyproject.toml").read_text(encoding="utf-8")
+    assert 'name = "custom-experiment"' in pyproject