#!/usr/bin/env node
/**
 * tools/eval/run-evals.mjs
 *
 * Rule 6 — Eval harness: fixture-based regression tests for generated artifacts.
 *
 * Philosophy:
 *   - Evals are the test suite for prompts. Never ship a prompt change without
 *     running evals first.
 *   - Use deterministic pattern/regex checks ("reference-free" grading) rather
 *     than golden snapshot comparison. Patterns are maintainable; snapshots are
 *     brittle.
 *   - Eval-driven development: write a failing eval FIRST, then update the prompt
 *     or re-generate to make it pass.
 *
 * Usage:
 *   node tools/eval/run-evals.mjs                    # run all fixtures
 *   node tools/eval/run-evals.mjs --entity equipment
 *   node tools/eval/run-evals.mjs --verbose
 *
 * Exit codes:
 *   0 — no failures were recorded
 *   1 — at least one check failed, or the fixtures directory is missing
 *   2 — invalid command-line usage
 */

import { existsSync, readFileSync, readdirSync } from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';

const __dirname = path.dirname(fileURLToPath(import.meta.url));
const rootDir = path.resolve(__dirname, '../..');
const fixturesDir = path.join(__dirname, 'fixtures');

const cliArgs = process.argv.slice(2);
const args = new Set(cliArgs);
const verbose = args.has('--verbose') || args.has('-v');

/**
 * Extracts the value following `--entity` from the argument list.
 *
 * @param {string[]} argv - Command-line arguments (without node/script path).
 * @returns {string|null} The entity filter, or null when `--entity` is absent.
 */
function parseEntityFilter(argv) {
  const idx = argv.indexOf('--entity');
  if (idx === -1) return null;

  const value = argv[idx + 1];
  // A missing value (or another flag in its place) would otherwise silently
  // run everything or silently match nothing, so reject it explicitly.
  if (value === undefined || value.startsWith('-')) {
    console.error('Error: --entity requires a value, e.g. --entity equipment');
    process.exit(2);
  }
  return value;
}

const entityFilter = parseEntityFilter(cliArgs);

// ---------------------------------------------------------------------------
// Assertion engine
// ---------------------------------------------------------------------------

let totalChecks = 0;
let totalFailures = 0;
let fixturesRun = 0;
const failures = [];

/**
 * Records one failed check in the global failure list.
 *
 * @param {string} entity - Label of the entity/file-spec being checked.
 * @param {string} file - Path of the file the check applies to.
 * @param {string} check - Name of the check kind (e.g. 'must_contain').
 * @param {string} detail - Human-readable explanation.
 * @param {string} [result='FAIL'] - 'FAIL' for a failed assertion, 'ERROR' for a broken spec.
 */
function recordFailure(entity, file, check, detail, result = 'FAIL') {
  totalFailures++;
  failures.push({ entity, file, check, result, detail });
}

/**
 * Reads an artifact relative to the repository root.
 *
 * @param {string} relativePath - Path relative to `rootDir`.
 * @returns {string|null} File content as UTF-8, or null when the path does not
 *   exist or is a directory. Other I/O errors are rethrown.
 */
function readArtifact(relativePath) {
  const filePath = path.join(rootDir, relativePath);
  try {
    return readFileSync(filePath, 'utf8');
  } catch (err) {
    // A directory at the expected path is treated the same as a missing file.
    if (err.code === 'ENOENT' || err.code === 'EISDIR') return null;
    throw err;
  }
}

/**
 * Parses a JSON file, recording a failure instead of throwing on bad input.
 *
 * @param {string} jsonPath - Absolute path of the JSON file.
 * @param {string} entityLabel - Label used when reporting a parse failure.
 * @returns {*} The parsed value, or undefined when reading/parsing failed
 *   (JSON itself can never produce undefined, so the sentinel is unambiguous).
 */
function readJson(jsonPath, entityLabel) {
  try {
    return JSON.parse(readFileSync(jsonPath, 'utf8'));
  } catch (err) {
    // Count it as a check so "Passed" (checks - failures) never goes negative.
    totalChecks++;
    recordFailure(
      entityLabel,
      path.relative(rootDir, jsonPath),
      'parse-json',
      `Cannot read/parse JSON: ${err.message}`,
      'ERROR',
    );
    return undefined;
  }
}

/**
 * Runs a list of regex assertions against file content.
 *
 * @param {string[]|undefined} patterns - Regex source strings from the spec.
 * @param {string} content - File content under test.
 * @param {{check: string, shouldMatch: boolean, failLabel: string}} mode -
 *   Check name, whether a match is required or forbidden, and failure prefix.
 * @param {string} filePath - Path reported in failures.
 * @param {string} entityLabel - Label reported in failures.
 */
function runRegexChecks(patterns, content, mode, filePath, entityLabel) {
  const { check, shouldMatch, failLabel } = mode;
  for (const patternStr of patterns ?? []) {
    totalChecks++;
    let re;
    try {
      re = new RegExp(patternStr);
    } catch (e) {
      recordFailure(entityLabel, filePath, check, `Bad regex: ${patternStr} — ${e.message}`, 'ERROR');
      continue;
    }
    if (re.test(content) !== shouldMatch) {
      recordFailure(entityLabel, filePath, check, `${failLabel}: ${patternStr}`);
    }
  }
}

/**
 * Applies every assertion of one file spec to the artifact at `filePath`.
 *
 * Supported spec keys: `must_contain`, `must_not_contain` (substring checks),
 * `must_match_regex`, `must_not_match_regex` (regex checks).
 *
 * @param {string} filePath - Artifact path relative to the repository root.
 * @param {object} fileSpec - Assertion spec for this file.
 * @param {string} entityLabel - Label reported in failures and verbose output.
 */
function runFileAssertions(filePath, fileSpec, entityLabel) {
  if (typeof filePath !== 'string' || filePath === '') {
    totalChecks++;
    recordFailure(
      entityLabel,
      String(filePath),
      'spec-path',
      'Assertion spec has no valid "path" string',
      'ERROR',
    );
    return;
  }

  const content = readArtifact(filePath);
  if (content === null) {
    totalChecks++;
    recordFailure(entityLabel, filePath, 'file-exists', `File not found: ${filePath}`);
    return;
  }

  if (verbose) {
    console.log(`  [${entityLabel}] Checking ${filePath}`);
  }

  for (const expected of fileSpec.must_contain ?? []) {
    totalChecks++;
    if (!content.includes(expected)) {
      recordFailure(entityLabel, filePath, 'must_contain', `Missing: ${expected}`);
    }
  }

  for (const forbidden of fileSpec.must_not_contain ?? []) {
    totalChecks++;
    if (content.includes(forbidden)) {
      recordFailure(entityLabel, filePath, 'must_not_contain', `Forbidden pattern found: ${forbidden}`);
    }
  }

  runRegexChecks(
    fileSpec.must_match_regex,
    content,
    { check: 'must_match_regex', shouldMatch: true, failLabel: 'Regex not matched' },
    filePath,
    entityLabel,
  );

  runRegexChecks(
    fileSpec.must_not_match_regex,
    content,
    { check: 'must_not_match_regex', shouldMatch: false, failLabel: 'Forbidden regex matched' },
    filePath,
    entityLabel,
  );
}

/**
 * Runs every file spec listed in one `*.assertions.json` file, if it exists.
 *
 * @param {string} specPath - Absolute path of the assertions file.
 * @param {string} entity - Entity name used to build failure labels.
 */
function runAssertionFile(specPath, entity) {
  if (!existsSync(specPath)) return;

  const spec = readJson(specPath, entity);
  if (spec === undefined) return;

  for (const [key, fileSpec] of Object.entries(spec?.files ?? {})) {
    runFileAssertions(fileSpec?.path, fileSpec ?? {}, `${entity}/${key}`);
  }
}

/**
 * Runs one fixture directory: reads `meta.json`, applies the entity filter,
 * then evaluates `backend.assertions.json` and `frontend.assertions.json`.
 *
 * @param {string} fixtureDir - Absolute path of the fixture directory.
 */
function runFixture(fixtureDir) {
  const metaPath = path.join(fixtureDir, 'meta.json');
  if (!existsSync(metaPath)) return;

  const dirName = path.basename(fixtureDir);
  const meta = readJson(metaPath, dirName);
  if (meta === undefined) return;

  // Fall back to the directory name so a meta.json without "entity" still
  // produces a usable label instead of throwing on toLowerCase().
  const entity = String(meta?.entity ?? dirName);
  const kebab = meta?.kebab;

  if (entityFilter && kebab !== entityFilter && entity.toLowerCase() !== entityFilter.toLowerCase()) {
    return;
  }

  fixturesRun++;

  if (verbose) {
    console.log(`\n[EVAL] ${entity} — ${meta?.description ?? ''}`);
  }

  runAssertionFile(path.join(fixtureDir, 'backend.assertions.json'), entity);
  runAssertionFile(path.join(fixtureDir, 'frontend.assertions.json'), entity);
}

// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------

if (!existsSync(fixturesDir)) {
  console.error(`Error: fixtures directory not found: ${fixturesDir}`);
  process.exit(1);
}

const fixtureDirs = readdirSync(fixturesDir, { withFileTypes: true })
  .filter((d) => d.isDirectory())
  .map((d) => d.name)
  .sort() // directory listing order is platform-dependent; keep reports stable
  .map((name) => path.join(fixturesDir, name));

for (const dir of fixtureDirs) {
  runFixture(dir);
}

// ---------------------------------------------------------------------------
// Report
// ---------------------------------------------------------------------------

console.log('');
console.log('══════════════════════════════════════════════');
console.log('  KIS-TOiR Eval Report');
console.log('══════════════════════════════════════════════');
console.log(`  Fixtures: ${fixturesRun}`);
console.log(`  Checks:   ${totalChecks}`);
console.log(`  Passed:   ${totalChecks - totalFailures}`);
console.log(`  Failed:   ${totalFailures}`);
console.log('══════════════════════════════════════════════');

if (failures.length > 0) {
  console.log('');
  console.log('Failures:');
  for (const f of failures) {
    console.log(`  [${f.result}] ${f.entity} — ${f.file}`);
    console.log(`    ${f.check}: ${f.detail}`);
  }
  console.log('');
  console.log('To fix: update the prompt or re-generate the failing entity, then re-run evals.');
  console.log('To update a fixture (intentional change): edit tools/eval/fixtures/<entity>/*.assertions.json');
  console.log('');
  process.exit(1);
}

console.log('');
if (totalChecks === 0) {
  // Nothing ran, so claiming "passed" would be misleading (e.g. a typo in --entity).
  console.log(
    entityFilter
      ? `Warning: no checks were run — no fixture matched --entity ${entityFilter}.`
      : 'Warning: no checks were run — no fixtures with assertions were found.',
  );
} else {
  console.log('All evals passed.');
}
console.log('');