Skip to content

Commit 516a916

Browse files
committed
Separate pre-fix and post-fix proof history
1 parent: 289dc6a · commit: 516a916

3 files changed

Lines changed: 55 additions & 2 deletions

File tree

TESTING.md

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -166,6 +166,7 @@ npm run proof:inline -- --list-suites
166166
npm run proof:inline -- --suite repo-boundary-clarification
167167
npm run proof:inline -- --suite forgone-feature-suppression --models cheap,latest-gpt
168168
npm run proof:inline:summary -- --suite repo-boundary-clarification --days 7
169+
npm run proof:inline:summary -- --suite repo-boundary-clarification --cohort phase3-postfix
169170
node scripts/test-chat-inline-proof-evaluator.js
170171
```
171172

@@ -174,6 +175,7 @@ What this covers:
174175
- live transcript proof for repo-boundary corrections and forgone-feature suppression
175176
- model-bucket comparison using `cheap` and `latest-gpt`
176177
- JSONL summary of recent pass/fail trends by suite and model
178+
- cohort filtering to separate pre-fix history from post-fix Phase 3 runs
177179
- evaluator characterization for transcript expectations without needing a live model run
178180

179181
### Manual Checks for Model Selection

scripts/summarize-chat-inline-proof.js

Lines changed: 43 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,8 @@ const path = require('path');
55
const { LIKU_HOME } = require(path.join(__dirname, '..', 'src', 'shared', 'liku-home.js'));
66

77
const PROOF_RESULT_LOG = path.join(LIKU_HOME, 'telemetry', 'logs', 'chat-inline-proof-results.jsonl');
8+
const PHASE3_POSTFIX_STARTED_AT = '2026-03-21T05:17:35.645Z';
9+
const PHASE3_POSTFIX_STARTED_AT_MS = Date.parse(PHASE3_POSTFIX_STARTED_AT);
810

911
function getArgValue(flagName) {
1012
const index = process.argv.indexOf(flagName);
@@ -41,10 +43,17 @@ function resolveEntryModel(entry) {
4143
return entry?.requestedModel || entry?.observedRequestedModels?.[0] || entry?.observedRuntimeModels?.[0] || 'default';
4244
}
4345

46+
function resolveEntryCohort(entry) {
47+
const timestamp = Date.parse(entry?.timestamp || '');
48+
if (!Number.isFinite(timestamp)) return 'unknown';
49+
return timestamp >= PHASE3_POSTFIX_STARTED_AT_MS ? 'phase3-postfix' : 'pre-phase3-postfix';
50+
}
51+
4452
function passesFilter(entry, filters = {}) {
4553
if (filters.suite && entry.suite !== filters.suite) return false;
4654
if (filters.model && resolveEntryModel(entry) !== filters.model) return false;
4755
if (filters.mode && entry.mode !== filters.mode) return false;
56+
if (filters.cohort && resolveEntryCohort(entry) !== filters.cohort) return false;
4857
if (filters.since) {
4958
const timestamp = Date.parse(entry.timestamp || '');
5059
if (!Number.isFinite(timestamp) || timestamp < filters.since) return false;
@@ -73,13 +82,15 @@ function summarizeProofEntries(entries) {
7382
const bySuite = new Map();
7483
const byModel = new Map();
7584
const bySuiteModel = new Map();
85+
const byCohort = new Map();
7686

7787
for (const entry of normalized) {
7888
const suiteKey = entry.suite || 'unknown';
7989
const modelKey = resolveEntryModel(entry);
90+
const cohortKey = resolveEntryCohort(entry);
8091
const suiteModelKey = `${suiteKey}::${modelKey}`;
8192

82-
for (const [bucket, key] of [[bySuite, suiteKey], [byModel, modelKey], [bySuiteModel, suiteModelKey]]) {
93+
for (const [bucket, key] of [[bySuite, suiteKey], [byModel, modelKey], [bySuiteModel, suiteModelKey], [byCohort, cohortKey]]) {
8394
if (!bucket.has(key)) bucket.set(key, []);
8495
bucket.get(key).push(entry);
8596
}
@@ -91,6 +102,7 @@ function summarizeProofEntries(entries) {
91102

92103
return {
93104
totals,
105+
phase3PostfixStartedAt: PHASE3_POSTFIX_STARTED_AT,
94106
bySuite: materialize(bySuite, (key, bucketEntries) => {
95107
const passed = bucketEntries.filter((entry) => entry.passed).length;
96108
return {
@@ -117,6 +129,19 @@ function summarizeProofEntries(entries) {
117129
runtimeModels: [...new Set(bucketEntries.flatMap((entry) => entry.observedRuntimeModels || []))].sort()
118130
};
119131
}),
132+
byCohort: materialize(byCohort, (key, bucketEntries) => {
133+
const passed = bucketEntries.filter((entry) => entry.passed).length;
134+
return {
135+
key,
136+
runs: bucketEntries.length,
137+
passed,
138+
failed: bucketEntries.length - passed,
139+
passRate: Number(((passed / bucketEntries.length) * 100).toFixed(1)),
140+
trend: buildTrend(bucketEntries),
141+
lastRunAt: bucketEntries[0]?.timestamp || null,
142+
models: [...new Set(bucketEntries.map((entry) => resolveEntryModel(entry)))].sort()
143+
};
144+
}),
120145
bySuiteModel: materialize(bySuiteModel, (key, bucketEntries) => {
121146
const [suite, model] = key.split('::');
122147
const passed = bucketEntries.filter((entry) => entry.passed).length;
@@ -151,13 +176,19 @@ function main() {
151176
const suite = getArgValue('--suite') || null;
152177
const model = getArgValue('--model') || null;
153178
const mode = getArgValue('--mode') || null;
179+
const rawSince = getArgValue('--since');
180+
const cohort = hasFlag('--phase3-postfix') ? 'phase3-postfix' : (getArgValue('--cohort') || null);
154181
const limit = Math.max(1, parseInt(getArgValue('--limit'), 10) || 10);
155182
const days = Math.max(0, parseInt(getArgValue('--days'), 10) || 0);
183+
const since = rawSince ? Date.parse(rawSince) : null;
156184
const filters = {
157185
suite,
158186
model,
159187
mode,
160-
since: days > 0 ? Date.now() - (days * 24 * 60 * 60 * 1000) : null
188+
cohort,
189+
since: Number.isFinite(since)
190+
? since
191+
: (days > 0 ? Date.now() - (days * 24 * 60 * 60 * 1000) : null)
161192
};
162193

163194
const entries = parseProofEntries().filter((entry) => passesFilter(entry, filters));
@@ -181,6 +212,14 @@ function main() {
181212

182213
console.log('Inline Chat Proof Summary');
183214
console.log(`Runs: ${summary.totals.runs} | Passed: ${summary.totals.passed} | Failed: ${summary.totals.failed} | Pass rate: ${formatPercent(summary.totals.passRate)}`);
215+
if (!filters.cohort) {
216+
console.log(`Phase 3 post-fix cohort starts at: ${summary.phase3PostfixStartedAt}`);
217+
}
218+
219+
printGroup('By Cohort', summary.byCohort.slice(0, limit), (row) => {
220+
const models = row.models.length ? ` | models=${row.models.join(',')}` : '';
221+
return `- ${row.key}: ${row.passed}/${row.runs} passed (${formatPercent(row.passRate)}) | trend=${row.trend || '-'}${models}`;
222+
});
184223

185224
printGroup('By Suite', summary.bySuite.slice(0, limit), (row) => {
186225
const models = row.models.length ? ` | models=${row.models.join(',')}` : '';
@@ -202,8 +241,10 @@ if (require.main === module) {
202241
}
203242

204243
module.exports = {
244+
PHASE3_POSTFIX_STARTED_AT,
205245
PROOF_RESULT_LOG,
206246
parseProofEntries,
247+
resolveEntryCohort,
207248
resolveEntryModel,
208249
summarizeProofEntries,
209250
buildTrend,

scripts/test-chat-inline-proof-summary.js

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,9 @@ const os = require('os');
66
const path = require('path');
77

88
const {
9+
PHASE3_POSTFIX_STARTED_AT,
910
parseProofEntries,
11+
resolveEntryCohort,
1012
resolveEntryModel,
1113
summarizeProofEntries,
1214
buildTrend,
@@ -40,6 +42,11 @@ test('resolveEntryModel prefers requested model bucket', () => {
4042
assert.strictEqual(resolveEntryModel({}), 'default');
4143
});
4244

45+
test('resolveEntryCohort separates pre-fix and post-fix Phase 3 runs', () => {
46+
assert.strictEqual(resolveEntryCohort({ timestamp: '2026-03-21T05:10:42.757Z' }), 'pre-phase3-postfix');
47+
assert.strictEqual(resolveEntryCohort({ timestamp: PHASE3_POSTFIX_STARTED_AT }), 'phase3-postfix');
48+
});
49+
4350
test('summarizeProofEntries groups by suite and model with trends', () => {
4451
const entries = [
4552
{ timestamp: '2026-03-20T00:00:00.000Z', suite: 'direct-navigation', requestedModel: 'cheap', passed: true, observedRuntimeModels: ['gpt-4o-mini'] },
@@ -53,6 +60,7 @@ test('summarizeProofEntries groups by suite and model with trends', () => {
5360
assert.strictEqual(summary.totals.passed, 3);
5461
assert(summary.bySuite.some((row) => row.key === 'direct-navigation' && row.trend === 'PFP'));
5562
assert(summary.byModel.some((row) => row.key === 'cheap' && row.trend === 'PF'));
63+
assert(summary.byCohort.some((row) => row.key === 'pre-phase3-postfix'));
5664
assert(summary.bySuiteModel.some((row) => row.suite === 'direct-navigation' && row.model === 'latest-gpt' && row.passRate === 100));
5765
});
5866

@@ -62,6 +70,8 @@ test('passesFilter respects suite model mode and time filters', () => {
6270
assert.strictEqual(passesFilter(entry, { suite: 'other' }), false);
6371
assert.strictEqual(passesFilter(entry, { model: 'cheap' }), false);
6472
assert.strictEqual(passesFilter(entry, { mode: 'global' }), false);
73+
assert.strictEqual(passesFilter({ timestamp: PHASE3_POSTFIX_STARTED_AT }, { cohort: 'phase3-postfix' }), true);
74+
assert.strictEqual(passesFilter({ timestamp: '2026-03-21T05:10:42.757Z' }, { cohort: 'phase3-postfix' }), false);
6575
assert.strictEqual(passesFilter(entry, { since: Date.parse('2026-03-21T00:00:00.000Z') }), false);
6676
});
6777

0 commit comments

Comments
 (0)