Separate pre-fix and post-fix proof history

TayDa64 · TayDa64 · commit 516a9163b74f · 2026-03-21T08:27:10.000-07:00
diff --git a/TESTING.md b/TESTING.md
@@ -166,6 +166,7 @@ npm run proof:inline -- --list-suites
 npm run proof:inline -- --suite repo-boundary-clarification
 npm run proof:inline -- --suite forgone-feature-suppression --models cheap,latest-gpt
 npm run proof:inline:summary -- --suite repo-boundary-clarification --days 7
+npm run proof:inline:summary -- --suite repo-boundary-clarification --cohort phase3-postfix
 node scripts/test-chat-inline-proof-evaluator.js
 ```
 
@@ -174,6 +175,7 @@ What this covers:
 - live transcript proof for repo-boundary corrections and forgone-feature suppression
 - model-bucket comparison using `cheap` and `latest-gpt`
 - JSONL summary of recent pass/fail trends by suite and model
+- cohort filtering to separate pre-fix history from post-fix Phase 3 runs
 - evaluator characterization for transcript expectations without needing a live model run
 
 ### Manual Checks for Model Selection
diff --git a/scripts/summarize-chat-inline-proof.js b/scripts/summarize-chat-inline-proof.js
@@ -5,6 +5,8 @@ const path = require('path');
 const { LIKU_HOME } = require(path.join(__dirname, '..', 'src', 'shared', 'liku-home.js'));
 
 const PROOF_RESULT_LOG = path.join(LIKU_HOME, 'telemetry', 'logs', 'chat-inline-proof-results.jsonl');
+const PHASE3_POSTFIX_STARTED_AT = '2026-03-21T05:17:35.645Z';
+const PHASE3_POSTFIX_STARTED_AT_MS = Date.parse(PHASE3_POSTFIX_STARTED_AT);
 
 function getArgValue(flagName) {
   const index = process.argv.indexOf(flagName);
@@ -41,10 +43,17 @@ function resolveEntryModel(entry) {
   return entry?.requestedModel || entry?.observedRequestedModels?.[0] || entry?.observedRuntimeModels?.[0] || 'default';
 }
 
+function resolveEntryCohort(entry) {
+  const runAtMs = Date.parse(entry?.timestamp || '');
+  if (!Number.isFinite(runAtMs)) return 'unknown'; // missing or unparseable timestamp
+  return runAtMs < PHASE3_POSTFIX_STARTED_AT_MS ? 'pre-phase3-postfix' : 'phase3-postfix'; // the cutoff instant itself is post-fix
+}
+
 function passesFilter(entry, filters = {}) {
   if (filters.suite && entry.suite !== filters.suite) return false;
   if (filters.model && resolveEntryModel(entry) !== filters.model) return false;
   if (filters.mode && entry.mode !== filters.mode) return false;
+  if (filters.cohort && resolveEntryCohort(entry) !== filters.cohort) return false;
   if (filters.since) {
     const timestamp = Date.parse(entry.timestamp || '');
     if (!Number.isFinite(timestamp) || timestamp < filters.since) return false;
@@ -73,13 +82,15 @@ function summarizeProofEntries(entries) {
   const bySuite = new Map();
   const byModel = new Map();
   const bySuiteModel = new Map();
+  const byCohort = new Map();
 
   for (const entry of normalized) {
     const suiteKey = entry.suite || 'unknown';
     const modelKey = resolveEntryModel(entry);
+    const cohortKey = resolveEntryCohort(entry);
     const suiteModelKey = `${suiteKey}::${modelKey}`;
 
-    for (const [bucket, key] of [[bySuite, suiteKey], [byModel, modelKey], [bySuiteModel, suiteModelKey]]) {
+    for (const [bucket, key] of [[bySuite, suiteKey], [byModel, modelKey], [bySuiteModel, suiteModelKey], [byCohort, cohortKey]]) {
       if (!bucket.has(key)) bucket.set(key, []);
       bucket.get(key).push(entry);
     }
@@ -91,6 +102,7 @@ function summarizeProofEntries(entries) {
 
   return {
     totals,
+    phase3PostfixStartedAt: PHASE3_POSTFIX_STARTED_AT,
     bySuite: materialize(bySuite, (key, bucketEntries) => {
       const passed = bucketEntries.filter((entry) => entry.passed).length;
       return {
@@ -117,6 +129,19 @@ function summarizeProofEntries(entries) {
         runtimeModels: [...new Set(bucketEntries.flatMap((entry) => entry.observedRuntimeModels || []))].sort()
       };
     }),
+    byCohort: materialize(byCohort, (key, bucketEntries) => {
+      const passed = bucketEntries.filter((entry) => entry.passed).length;
+      return {
+        key,
+        runs: bucketEntries.length,
+        passed,
+        failed: bucketEntries.length - passed,
+        passRate: Number(((passed / bucketEntries.length) * 100).toFixed(1)),
+        trend: buildTrend(bucketEntries),
+        lastRunAt: bucketEntries[0]?.timestamp || null,
+        models: [...new Set(bucketEntries.map((entry) => resolveEntryModel(entry)))].sort()
+      };
+    }),
     bySuiteModel: materialize(bySuiteModel, (key, bucketEntries) => {
       const [suite, model] = key.split('::');
       const passed = bucketEntries.filter((entry) => entry.passed).length;
@@ -151,13 +176,19 @@ function main() {
   const suite = getArgValue('--suite') || null;
   const model = getArgValue('--model') || null;
   const mode = getArgValue('--mode') || null;
+  const rawSince = getArgValue('--since');
+  const cohort = hasFlag('--phase3-postfix') ? 'phase3-postfix' : (getArgValue('--cohort') || null);
   const limit = Math.max(1, parseInt(getArgValue('--limit'), 10) || 10);
   const days = Math.max(0, parseInt(getArgValue('--days'), 10) || 0);
+  const since = rawSince ? Date.parse(rawSince) : null;
   const filters = {
     suite,
     model,
     mode,
-    since: days > 0 ? Date.now() - (days * 24 * 60 * 60 * 1000) : null
+    cohort,
+    since: Number.isFinite(since)
+      ? since
+      : (days > 0 ? Date.now() - (days * 24 * 60 * 60 * 1000) : null)
   };
 
   const entries = parseProofEntries().filter((entry) => passesFilter(entry, filters));
@@ -181,6 +212,14 @@ function main() {
 
   console.log('Inline Chat Proof Summary');
   console.log(`Runs: ${summary.totals.runs} | Passed: ${summary.totals.passed} | Failed: ${summary.totals.failed} | Pass rate: ${formatPercent(summary.totals.passRate)}`);
+  if (!filters.cohort) {
+    console.log(`Phase 3 post-fix cohort starts at: ${summary.phase3PostfixStartedAt}`);
+  }
+
+  printGroup('By Cohort', summary.byCohort.slice(0, limit), (row) => {
+    const models = row.models.length ? ` | models=${row.models.join(',')}` : '';
+    return `- ${row.key}: ${row.passed}/${row.runs} passed (${formatPercent(row.passRate)}) | trend=${row.trend || '-'}${models}`;
+  });
 
   printGroup('By Suite', summary.bySuite.slice(0, limit), (row) => {
     const models = row.models.length ? ` | models=${row.models.join(',')}` : '';
@@ -202,8 +241,10 @@ if (require.main === module) {
 }
 
 module.exports = {
+  PHASE3_POSTFIX_STARTED_AT,
   PROOF_RESULT_LOG,
   parseProofEntries,
+  resolveEntryCohort,
   resolveEntryModel,
   summarizeProofEntries,
   buildTrend,
diff --git a/scripts/test-chat-inline-proof-summary.js b/scripts/test-chat-inline-proof-summary.js
@@ -6,7 +6,9 @@ const os = require('os');
 const path = require('path');
 
 const {
+  PHASE3_POSTFIX_STARTED_AT,
   parseProofEntries,
+  resolveEntryCohort,
   resolveEntryModel,
   summarizeProofEntries,
   buildTrend,
@@ -40,6 +42,11 @@ test('resolveEntryModel prefers requested model bucket', () => {
   assert.strictEqual(resolveEntryModel({}), 'default');
 });
 
+test('resolveEntryCohort separates pre-fix and post-fix Phase 3 runs', () => {
+  const cases = [['2026-03-21T05:10:42.757Z', 'pre-phase3-postfix'], [PHASE3_POSTFIX_STARTED_AT, 'phase3-postfix']]; // [timestamp, expected cohort]
+  for (const [timestamp, cohort] of cases) assert.strictEqual(resolveEntryCohort({ timestamp }), cohort);
+});
+
 test('summarizeProofEntries groups by suite and model with trends', () => {
   const entries = [
     { timestamp: '2026-03-20T00:00:00.000Z', suite: 'direct-navigation', requestedModel: 'cheap', passed: true, observedRuntimeModels: ['gpt-4o-mini'] },
@@ -53,6 +60,7 @@ test('summarizeProofEntries groups by suite and model with trends', () => {
   assert.strictEqual(summary.totals.passed, 3);
   assert(summary.bySuite.some((row) => row.key === 'direct-navigation' && row.trend === 'PFP'));
   assert(summary.byModel.some((row) => row.key === 'cheap' && row.trend === 'PF'));
+  assert(summary.byCohort.some((row) => row.key === 'pre-phase3-postfix'));
   assert(summary.bySuiteModel.some((row) => row.suite === 'direct-navigation' && row.model === 'latest-gpt' && row.passRate === 100));
 });
 
@@ -62,6 +70,8 @@ test('passesFilter respects suite model mode and time filters', () => {
   assert.strictEqual(passesFilter(entry, { suite: 'other' }), false);
   assert.strictEqual(passesFilter(entry, { model: 'cheap' }), false);
   assert.strictEqual(passesFilter(entry, { mode: 'global' }), false);
+  assert.strictEqual(passesFilter({ timestamp: PHASE3_POSTFIX_STARTED_AT }, { cohort: 'phase3-postfix' }), true);
+  assert.strictEqual(passesFilter({ timestamp: '2026-03-21T05:10:42.757Z' }, { cohort: 'phase3-postfix' }), false);
   assert.strictEqual(passesFilter(entry, { since: Date.parse('2026-03-21T00:00:00.000Z') }), false);
 });