KIS-TOiR/tools/eval/run-evals.mjs

#!/usr/bin/env node
/**
 * tools/eval/run-evals.mjs
 *
 * Rule 6 — Eval harness: fixture-based regression tests for generated artifacts.
 *
 * Philosophy:
 *   - Evals are the test suite for prompts. Never ship a prompt change without
 *     running evals first.
 *   - Use deterministic pattern/regex checks ("reference-free" grading) rather
 *     than golden snapshot comparison. Patterns are maintainable; snapshots are
 *     brittle.
 *   - Eval-driven development: write a failing eval FIRST, then update the prompt
 *     or re-generate to make it pass.
 *
 * Usage:
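 *   node tools/eval/run-evals.mjs                      # run all fixtures
 *   node tools/eval/run-evals.mjs --entity equipment   # only the fixture whose meta.json `kebab` or `entity` matches
 *   node tools/eval/run-evals.mjs --verbose            # or -v: log each fixture and file as it is checked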
 */

import { existsSync, readFileSync, readdirSync } from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';

const __dirname = path.dirname(fileURLToPath(import.meta.url));
const rootDir = path.resolve(__dirname, '../..');
const fixturesDir = path.join(__dirname, 'fixtures');

// CLI handling: boolean flags are looked up in a Set of the user-supplied arguments,
// while the value of --entity is read positionally from the raw argv array.
const args = new Set(process.argv.slice(2));
const verbose = ['--verbose', '-v'].some((flag) => args.has(flag));

// The argument that follows `--entity`; null when the flag is absent
// (undefined when the flag is the very last argument).
const entityFlagPosition = process.argv.indexOf('--entity');
const entityFilter = entityFlagPosition === -1 ? null : process.argv[entityFlagPosition + 1];

// ---------------------------------------------------------------------------
// Assertion engine
// ---------------------------------------------------------------------------

// Running tallies shared by the whole run; the report section prints them at the end.
// Number of individual checks evaluated so far (a missing artifact file counts as one check).
let totalChecks = 0;
// Number of checks that failed or errored.
let totalFailures = 0;
// One record per failed check: { entity, file, check, result, detail }.
const failures = [];

/**
 * Read a generated artifact as UTF-8 text.
 *
 * @param {string} relativePath - Path of the artifact, joined onto `rootDir`.
 * @returns {string | null} The file contents, or `null` when there is no readable
 *   file at that path (it does not exist, it vanished between the existence check
 *   and the read, or the path names a directory).
 * @throws {Error} Any other read error (for example EACCES) is rethrown unchanged.
 */
function readArtifact(relativePath) {
  const filePath = path.join(rootDir, relativePath);
  if (!existsSync(filePath)) return null;

  try {
    return readFileSync(filePath, 'utf8');
  } catch (err) {
    // A directory (EISDIR) or a file removed after the check above (ENOENT) means
    // "no artifact": return null so the caller can record it as a failed check
    // instead of the whole run aborting with an uncaught exception.
    if (err?.code === 'EISDIR' || err?.code === 'ENOENT') return null;
    throw err;
  }
}

/**
 * Evaluate every assertion declared for one artifact file and update the shared tallies.
 *
 * Checks run in a fixed order: `must_contain`, `must_not_contain`, `must_match_regex`,
 * `must_not_match_regex`. Each list entry counts as one check; a missing artifact
 * counts as a single failed `file-exists` check and skips the rest.
 *
 * @param {string} filePath - Artifact path handed to `readArtifact`.
 * @param {object} fileSpec - Assertion lists for this file (each list is optional).
 * @param {string} entityLabel - Label stored on failure records and shown in verbose logs.
 * @returns {void}
 */
function runFileAssertions(filePath, fileSpec, entityLabel) {
  // Bumps the failure counter and appends one record to the shared failure list.
  const recordFailure = (check, result, detail) => {
    totalFailures++;
    failures.push({ entity: entityLabel, file: filePath, check, result, detail });
  };

  const content = readArtifact(filePath);
  if (content === null) {
    totalChecks++;
    recordFailure('file-exists', 'FAIL', `File not found: ${filePath}`);
    return;
  }

  if (verbose) console.log(`  [${entityLabel}] Checking ${filePath}`);

  // [spec key, "is presence a failure?", message prefix]
  const substringChecks = [
    ['must_contain', false, 'Missing'],
    ['must_not_contain', true, 'Forbidden pattern found'],
  ];
  for (const [check, failsWhenPresent, prefix] of substringChecks) {
    for (const needle of fileSpec[check] ?? []) {
      totalChecks++;
      if (content.includes(needle) === failsWhenPresent) {
        recordFailure(check, 'FAIL', `${prefix}: ${needle}`);
      }
    }
  }

  // [spec key, "is a match a failure?", message prefix]
  const regexChecks = [
    ['must_match_regex', false, 'Regex not matched'],
    ['must_not_match_regex', true, 'Forbidden regex matched'],
  ];
  for (const [check, failsWhenMatched, prefix] of regexChecks) {
    for (const source of fileSpec[check] ?? []) {
      totalChecks++;
      let matched;
      try {
        matched = new RegExp(source).test(content);
      } catch (err) {
        // An uncompilable pattern is counted as a failed check with result ERROR.
        recordFailure(check, 'ERROR', `Bad regex: ${source} — ${err.message}`);
        continue;
      }
      if (matched === failsWhenMatched) {
        recordFailure(check, 'FAIL', `${prefix}: ${source}`);
      }
    }
  }
}

/**
 * Run all assertion files of a single fixture directory.
 *
 * A directory without `meta.json` is skipped silently. When `--entity` was given, the
 * fixture runs only if the filter equals `meta.kebab` exactly or equals `meta.entity`
 * case-insensitively. Both `backend.assertions.json` and `frontend.assertions.json`
 * are optional; each entry under their `files` map is handed to `runFileAssertions`.
 *
 * @param {string} fixtureDir - Path of the fixture directory.
 * @returns {void}
 * @throws {SyntaxError} If `meta.json` or an assertions file is not valid JSON.
 */
function runFixture(fixtureDir) {
  const metaPath = path.join(fixtureDir, 'meta.json');
  if (!existsSync(metaPath)) return;

  const meta = JSON.parse(readFileSync(metaPath, 'utf8'));
  const { entity, kebab } = meta;
  // Display name for logs and failure records; the fallbacks keep a meta.json
  // without `entity` from being labelled "undefined".
  const label = entity ?? kebab ?? path.basename(fixtureDir);

  if (entityFilter) {
    const matchesKebab = kebab === entityFilter;
    // Guard the type so a meta.json lacking `entity` is simply "no match" rather
    // than a TypeError that aborts the whole run.
    const matchesEntity =
      typeof entity === 'string' && entity.toLowerCase() === entityFilter.toLowerCase();
    if (!matchesKebab && !matchesEntity) return;
  }

  if (verbose) {
    console.log(`\n[EVAL] ${label} — ${meta.description ?? ''}`);
  }

  // Backend is processed before frontend, matching the order of the failure report.
  for (const specName of ['backend.assertions.json', 'frontend.assertions.json']) {
    const specPath = path.join(fixtureDir, specName);
    if (!existsSync(specPath)) continue;

    const spec = JSON.parse(readFileSync(specPath, 'utf8'));
    for (const [key, fileSpec] of Object.entries(spec.files ?? {})) {
      const checkLabel = `${label}/${key}`;

      // An entry without a usable `path` cannot be checked; count it as a failed
      // check instead of letting path.join throw on a non-string argument.
      if (typeof fileSpec?.path !== 'string') {
        totalChecks++;
        totalFailures++;
        failures.push({
          entity: checkLabel,
          file: specPath,
          check: 'spec-path',
          result: 'ERROR',
          detail: `Assertion entry "${key}" has no string "path"`,
        });
        continue;
      }

      runFileAssertions(fileSpec.path, fileSpec, checkLabel);
    }
  }
}

// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------

// Discover fixtures: every sub-directory of fixturesDir is treated as a candidate
// fixture (plain files are ignored) and handed to runFixture in listing order.
const fixtureDirs = [];
for (const entry of readdirSync(fixturesDir, { withFileTypes: true })) {
  if (entry.isDirectory()) {
    fixtureDirs.push(path.join(fixturesDir, entry.name));
  }
}

fixtureDirs.forEach((fixtureDir) => {
  runFixture(fixtureDir);
});

// ---------------------------------------------------------------------------
// Report
// ---------------------------------------------------------------------------

// Summary banner. "Fixtures" is the number of fixture directories discovered, which
// can exceed the number actually run when --entity filters some of them out.
console.log('');
console.log('══════════════════════════════════════════════');
console.log('  KIS-TOiR Eval Report');
console.log('══════════════════════════════════════════════');
console.log(`  Fixtures: ${fixtureDirs.length}`);
console.log(`  Checks:   ${totalChecks}`);
console.log(`  Passed:   ${totalChecks - totalFailures}`);
console.log(`  Failed:   ${totalFailures}`);
console.log('══════════════════════════════════════════════');

// The exit status is set through process.exitCode rather than process.exit() so the
// process ends naturally once all console output has been written.
if (failures.length > 0) {
  console.log('');
  console.log('Failures:');
  for (const f of failures) {
    console.log(`  [${f.result}] ${f.entity} — ${f.file}`);
    console.log(`         ${f.check}: ${f.detail}`);
  }
  console.log('');
  console.log('To fix: update the prompt or re-generate the failing entity, then re-run evals.');
  console.log('To update a fixture (intentional change): edit tools/eval/fixtures/<entity>/*.assertions.json');
  console.log('');
  process.exitCode = 1;
} else if (entityFilter && totalChecks === 0) {
  // A filtered run that executed nothing (e.g. a misspelled --entity value) must not
  // be reported as a pass: zero checks prove nothing about the generated artifacts.
  console.log('');
  console.log(`No checks ran for --entity ${entityFilter}: no fixture matched, or the matching fixture defines no assertions.`);
  console.log('');
  process.exitCode = 1;
} else {
  console.log('');
  console.log('All evals passed.');
  console.log('');
}