rebase generation
This commit is contained in:
184
.claude/worktrees/goofy-haslett/tools/eval/run-evals.mjs
Normal file
184
.claude/worktrees/goofy-haslett/tools/eval/run-evals.mjs
Normal file
@@ -0,0 +1,184 @@
|
||||
#!/usr/bin/env node
|
||||
/**
 * tools/eval/run-evals.mjs
 *
 * Rule 6 — Eval harness: fixture-based regression tests for generated artifacts.
 *
 * Philosophy:
 *   - Evals are the test suite for prompts. Never ship a prompt change without
 *     running evals first.
 *   - Use deterministic pattern/regex checks ("reference-free" grading) rather
 *     than golden snapshot comparison. Patterns are maintainable; snapshots are
 *     brittle.
 *   - Eval-driven development: write a failing eval FIRST, then update the prompt
 *     or re-generate to make it pass.
 *
 * Usage:
 *   node tools/eval/run-evals.mjs                     # run all fixtures
 *   node tools/eval/run-evals.mjs --entity equipment
 *   node tools/eval/run-evals.mjs --verbose
 */
|
||||
|
||||
import { existsSync, readFileSync, readdirSync } from 'node:fs';
|
||||
import path from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
|
||||
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Artifact paths are resolved against the directory two levels above this script.
const rootDir = path.resolve(__dirname, '../..');
const fixturesDir = path.join(__dirname, 'fixtures');

/**
 * Extract the `--entity` filter from a list of command-line arguments.
 *
 * Accepts both `--entity <name>` and `--entity=<name>`. A following token that
 * starts with `-` is treated as another option rather than as the entity name,
 * so `--entity --verbose` no longer filters on the literal string "--verbose".
 *
 * @param {string[]} argv - Arguments after the script path.
 * @returns {{ filter: string|null, missingValue: boolean }} The entity name
 *   (or null when none was supplied) and whether the flag was present without
 *   a usable value.
 */
function parseEntityFilter(argv) {
  const inlinePrefix = '--entity=';

  for (const [index, arg] of argv.entries()) {
    if (arg.startsWith(inlinePrefix)) {
      const value = arg.slice(inlinePrefix.length);
      return value === ''
        ? { filter: null, missingValue: true }
        : { filter: value, missingValue: false };
    }

    if (arg === '--entity') {
      const next = argv[index + 1];
      const usable = typeof next === 'string' && next !== '' && !next.startsWith('-');
      return usable
        ? { filter: next, missingValue: false }
        : { filter: null, missingValue: true };
    }
  }

  return { filter: null, missingValue: false };
}

const cliArgs = process.argv.slice(2);
const args = new Set(cliArgs);
const verbose = args.has('--verbose') || args.has('-v');
const { filter: entityFilter, missingValue: entityValueMissing } = parseEntityFilter(cliArgs);

if (entityValueMissing) {
  // Keep the historical fallback (run everything), but say so instead of staying silent.
  console.error('Warning: --entity was given without a value; running all fixtures.');
}

// ---------------------------------------------------------------------------
// Assertion engine
// ---------------------------------------------------------------------------

// Module-level tallies updated by the assertion functions and read by the report.
let totalChecks = 0;
let totalFailures = 0;
const failures = [];
|
||||
|
||||
/**
 * Read a generated artifact as UTF-8 text.
 *
 * Returns null instead of throwing when the path is not a string, does not
 * exist, or is a directory, so a bad fixture entry is reported by the caller
 * as a failed check rather than aborting the whole run.
 *
 * @param {string} relativePath - Artifact path relative to `rootDir`.
 * @returns {string|null} The file contents, or null when no readable file is there.
 * @throws {Error} Any other I/O error raised by `readFileSync` is rethrown unchanged.
 */
function readArtifact(relativePath) {
  // path.join throws a TypeError on non-string input (e.g. a spec with no "path").
  if (typeof relativePath !== 'string') return null;

  const filePath = path.join(rootDir, relativePath);
  if (!existsSync(filePath)) return null;

  try {
    return readFileSync(filePath, 'utf8');
  } catch (err) {
    // EISDIR: the path is a directory. ENOENT: the file vanished after the existence check.
    if (err.code === 'EISDIR' || err.code === 'ENOENT') return null;
    throw err;
  }
}
|
||||
|
||||
/**
 * Normalise one assertion list taken from a fixture spec.
 *
 * A missing key yields an empty list. A scalar (typically a single string
 * written without the surrounding array) becomes a one-element list instead of
 * being iterated character by character.
 *
 * @param {unknown} value - Raw value of a `must_*` key.
 * @returns {unknown[]} The entries to check.
 */
function toAssertionList(value) {
  if (value == null) return [];
  return Array.isArray(value) ? value : [value];
}

/**
 * Run every assertion declared for one artifact and update the module-level
 * tallies (`totalChecks`, `totalFailures`, `failures`).
 *
 * Checks run in this order: `must_contain`, `must_not_contain`,
 * `must_match_regex`, `must_not_match_regex`. A missing artifact counts as a
 * single failed `file-exists` check and skips the rest. A pattern that does
 * not compile is recorded with result `ERROR` and counted as a failure.
 *
 * @param {string} filePath - Artifact path, passed to `readArtifact`.
 * @param {object} fileSpec - Assertion spec holding the optional `must_*` lists.
 * @param {string} entityLabel - Label used in failure records and verbose output.
 * @returns {void}
 */
function runFileAssertions(filePath, fileSpec, entityLabel) {
  const content = readArtifact(filePath);

  const fail = (check, result, detail) => {
    totalFailures++;
    failures.push({ entity: entityLabel, file: filePath, check, result, detail });
  };

  if (content === null) {
    totalChecks++;
    fail('file-exists', 'FAIL', `File not found: ${filePath}`);
    return;
  }

  if (verbose) {
    console.log(` [${entityLabel}] Checking ${filePath}`);
  }

  // [spec key, whether a hit is the wanted outcome, failure message prefix]
  const substringRules = [
    ['must_contain', true, 'Missing'],
    ['must_not_contain', false, 'Forbidden pattern found'],
  ];
  for (const [check, wanted, prefix] of substringRules) {
    for (const needle of toAssertionList(fileSpec[check])) {
      totalChecks++;
      if (content.includes(needle) !== wanted) {
        fail(check, 'FAIL', `${prefix}: ${needle}`);
      }
    }
  }

  const regexRules = [
    ['must_match_regex', true, 'Regex not matched'],
    ['must_not_match_regex', false, 'Forbidden regex matched'],
  ];
  for (const [check, wanted, prefix] of regexRules) {
    for (const patternStr of toAssertionList(fileSpec[check])) {
      totalChecks++;
      let matched;
      try {
        matched = new RegExp(patternStr).test(content);
      } catch (e) {
        fail(check, 'ERROR', `Bad regex: ${patternStr} — ${e.message}`);
        continue;
      }
      if (matched !== wanted) {
        fail(check, 'FAIL', `${prefix}: ${patternStr}`);
      }
    }
  }
}
|
||||
|
||||
/**
 * Load a fixture JSON file that is expected to hold an object.
 *
 * An unreadable file, malformed JSON, or a non-object top-level value is
 * recorded as a failed `valid-json` check instead of aborting the run.
 *
 * @param {string} jsonPath - Path of the JSON file to read.
 * @param {string} entityLabel - Label used in the failure record.
 * @returns {object|null} The parsed object, or null when it could not be loaded.
 */
function loadFixtureJson(jsonPath, entityLabel) {
  const reportError = (detail) => {
    totalChecks++;
    totalFailures++;
    failures.push({ entity: entityLabel, file: jsonPath, check: 'valid-json', result: 'ERROR', detail });
  };

  let parsed;
  try {
    parsed = JSON.parse(readFileSync(jsonPath, 'utf8'));
  } catch (err) {
    reportError(`Cannot load JSON: ${err.message}`);
    return null;
  }

  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    reportError('Expected a JSON object at the top level');
    return null;
  }
  return parsed;
}

/**
 * Run the backend and frontend assertion specs of one fixture directory.
 *
 * A directory without `meta.json` is skipped silently. When `entityFilter` is
 * set, the fixture runs only if the filter equals `meta.kebab` exactly or
 * equals `meta.entity` case-insensitively. Each entry of a spec's `files`
 * object is passed to `runFileAssertions`.
 *
 * @param {string} fixtureDir - Path of the fixture directory.
 * @returns {void}
 */
function runFixture(fixtureDir) {
  const metaPath = path.join(fixtureDir, 'meta.json');
  if (!existsSync(metaPath)) return;

  const dirName = path.basename(fixtureDir);
  const meta = loadFixtureJson(metaPath, dirName);
  if (meta === null) return;

  const { entity, kebab } = meta;
  // Fall back so labels stay meaningful when meta.json omits "entity".
  const label = entity ?? kebab ?? dirName;

  if (entityFilter) {
    // String(...) keeps a missing or non-string "entity" from throwing here.
    const entityMatches = String(entity ?? '').toLowerCase() === entityFilter.toLowerCase();
    if (kebab !== entityFilter && !entityMatches) return;
  }

  if (verbose) {
    console.log(`\n[EVAL] ${label} — ${meta.description ?? ''}`);
  }

  for (const specName of ['backend.assertions.json', 'frontend.assertions.json']) {
    const specPath = path.join(fixtureDir, specName);
    if (!existsSync(specPath)) continue;

    const spec = loadFixtureJson(specPath, label);
    if (spec === null) continue;

    for (const [key, fileSpec] of Object.entries(spec.files ?? {})) {
      runFileAssertions(fileSpec.path, fileSpec, `${label}/${key}`);
    }
  }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Main
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// A missing fixtures directory is a setup error: report it plainly instead of
// letting readdirSync surface a raw ENOENT stack trace.
if (!existsSync(fixturesDir)) {
  console.error(`Fixtures directory not found: ${fixturesDir}`);
  process.exit(1);
}

// Every sub-directory of fixturesDir is a candidate fixture. Names are sorted
// because directory listing order is platform-dependent, and a stable order
// keeps the failure report reproducible between runs and machines.
const fixtureDirs = readdirSync(fixturesDir, { withFileTypes: true })
  .filter((entry) => entry.isDirectory())
  .map((entry) => entry.name)
  .sort()
  .map((name) => path.join(fixturesDir, name));

for (const dir of fixtureDirs) {
  runFixture(dir);
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Report
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
const reportRule = '══════════════════════════════════════════════';

// Summary header. "Fixtures" counts every fixture directory found, including
// any that were skipped by --entity or that have no meta.json.
console.log('');
console.log(reportRule);
console.log(' KIS-TOiR Eval Report');
console.log(reportRule);
console.log(` Fixtures: ${fixtureDirs.length}`);
console.log(` Checks: ${totalChecks}`);
console.log(` Passed: ${totalChecks - totalFailures}`);
console.log(` Failed: ${totalFailures}`);
console.log(reportRule);
console.log('');

if (failures.length > 0) {
  console.log('Failures:');
  for (const { result, entity, file, check, detail } of failures) {
    console.log(` [${result}] ${entity} — ${file}`);
    console.log(` ${check}: ${detail}`);
  }
  console.log('');
  console.log('To fix: update the prompt or re-generate the failing entity, then re-run evals.');
  console.log('To update a fixture (intentional change): edit tools/eval/fixtures/<entity>/*.assertions.json');
  console.log('');
  // Set the exit code and let the process end on its own, so buffered output
  // is flushed before exit.
  process.exitCode = 1;
} else if (totalChecks === 0) {
  // Nothing ran, so "All evals passed." would be a vacuous pass. When a filter
  // was requested, the likely cause is a mistyped entity name: fail the run.
  if (entityFilter) {
    console.error(`No checks were executed for --entity ${entityFilter}.`);
    process.exitCode = 1;
  } else {
    console.error('Warning: no checks were executed (no fixtures with assertions found).');
  }
  console.log('');
} else {
  console.log('All evals passed.');
  console.log('');
}
|
||||
Reference in New Issue
Block a user