This commit is contained in:
MaKarin
2026-04-03 20:54:37 +03:00
commit c89c23fd1d
50 changed files with 6716 additions and 0 deletions

106
tools/eval/README.md Normal file
View File

@@ -0,0 +1,106 @@
# Eval Harness — Rule 6
Fixture-based regression tests for generated artifacts.
## Why this exists
> "Evals are the test suite for your prompts. You would never ship code without tests;
> don't ship prompts without evals." — Anthropic Engineering
The validation gate (`tools/validate-generation.mjs`) checks **existence** and **structural compliance**.
The eval harness checks **semantic correctness**: are the right patterns present in the generated code?
Do the generated files actually follow the rules in `prompts/`?
Together they enforce:
- Gate: "file exists, field names present, auth seams wired"
- Evals: "DTO has class-validator decorators, FK uses ReferenceInput, date uses DateInput, guard is present"
## Usage
```bash
# Run all evals
npm run eval:generation
# Run evals for one entity
node tools/eval/run-evals.mjs --entity equipment
# Verbose output (show each file being checked)
node tools/eval/run-evals.mjs --verbose
```
## Fixture format
Each fixture lives in `tools/eval/fixtures/<entity>/`:
```
fixtures/
  equipment/
    meta.json                 ← what this fixture tests
    backend.assertions.json   ← patterns the NestJS files must satisfy
    frontend.assertions.json  ← patterns the React Admin files must satisfy
  repair-order/
    meta.json
    backend.assertions.json
    frontend.assertions.json
```
### `meta.json`
```json
{
"entity": "Equipment",
"kebab": "equipment",
"resource": "equipment",
"description": "...",
"tests": ["dto-decorator-coverage", "auth-guards", ...]
}
```
### `*.assertions.json`
Each assertions file is a JSON object with a top-level `files` map (label → file entry). Each file entry supports:
| Key | Type | Meaning |
|-----|------|---------|
| `path` | string | Relative path from repo root |
| `must_contain` | string[] | Each string must appear as a literal substring |
| `must_not_contain` | string[] | Each string must NOT appear |
| `must_match_regex` | string[] | Each pattern must match (compiled with `new RegExp(pattern)`, no flags — `.` does not cross newlines, use `[\s\S]` to span lines) |
| `must_not_match_regex` | string[] | Each pattern must NOT match |
| `comment` | string | Human-readable explanation of what is being tested |
## Eval-driven development workflow
This is the critical principle from Anthropic and Google:
1. **Write the failing eval first.** When you change a prompt or add a rule, add an
assertion that captures the new expectation *before* re-generating.
2. **Run evals**: `npm run eval:generation` → see failures.
3. **Re-generate** the affected entity (following the generation workflow in `AGENTS.md`).
4. **Run evals again**: all pass → the change is verified.
5. **Commit both** the updated fixture and the regenerated artifacts together.
A passing eval after a prompt change confirms the LLM followed the new rule.
A failing eval before a prompt change tells you exactly which prior contract was broken.
## Adding a new entity fixture
When adding a new entity to `domain/toir.api.dsl` and generating its backend + frontend:
1. Create `tools/eval/fixtures/<kebab>/meta.json`
2. Create `tools/eval/fixtures/<kebab>/backend.assertions.json` with at minimum:
- controller: `@Controller(...)`, `@UseGuards(`, `JwtAuthGuard`, HTTP methods
- create_dto: `from 'class-validator'`, required fields with `!:`, `@IsString(`, `@IsOptional(`
- update_dto: `from 'class-validator'`, fields with `?:`, `@IsOptional(`
3. Create `tools/eval/fixtures/<kebab>/frontend.assertions.json` with at minimum:
- create: `ReferenceInput` for FK fields, `NumberInput` for numeric, `DateInput` for date, `SelectInput` for enum
- show: `ReferenceField` for FK fields, `DateField` for date
4. Run `npm run eval:generation` to verify the fixture catches real issues.
## Integration with git hooks
The pre-commit hook (installed by `npm run install-hooks`) runs both:
1. `node tools/validate-generation.mjs --artifacts-only` — existence gate
2. `npm run eval:generation` — semantic eval gate
Both must pass before a commit is accepted.

View File

@@ -0,0 +1,79 @@
{
"entity": "Equipment",
"files": {
"controller": {
"path": "server/src/modules/equipment/equipment.controller.ts",
"must_contain": [
"@Controller('equipments')",
"@UseGuards(",
"JwtAuthGuard",
"@Get()",
"@Post()",
"@Get(':id')",
"@Patch(':id')",
"@Delete(':id')"
],
"must_not_contain": [
"@Put(':id')",
"@Post(':id')"
],
"must_match_regex": [
"@Delete\\(':id'\\)[\\s\\S]{0,80}@Roles\\('admin'\\)|@Roles\\('admin'\\)[\\s\\S]{0,80}@Delete\\(':id'\\)"
],
"comment": "Equipment controller must expose the CRUD verbs expected by the DSL-compatible React Admin contract."
},
"service": {
"path": "server/src/modules/equipment/equipment.service.ts",
"must_contain": [
"setListHeaders(response",
"_start",
"_end",
"_sort",
"_order"
],
"must_match_regex": [
"mode.*insensitive|insensitive.*mode",
"status.*in\\b|\\bin\\b.*status"
],
"comment": "Service must translate React Admin list params into Prisma filters and delegate header wiring through the shared helper."
},
"create_dto": {
"path": "server/src/modules/equipment/dto/create-equipment.dto.ts",
"must_contain": [
"from 'class-validator'",
"inventoryNumber!:",
"name!:",
"equipmentType!:",
"periodicityTO!:",
"status!:",
"@IsString(",
"@IsOptional(",
"@IsEnum("
],
"must_not_contain": [
"id?:",
"id!:"
],
"comment": "Required fields use '!' suffix; optional fields use '?' with @IsOptional(); enum fields use @IsEnum(); class-validator must be imported."
},
"update_dto": {
"path": "server/src/modules/equipment/dto/update-equipment.dto.ts",
"must_contain": [
"from 'class-validator'",
"inventoryNumber?:",
"name?:",
"equipmentType?:",
"status?:",
"@IsOptional(",
"@IsString(",
"@IsEnum("
],
"must_not_contain": [
"inventoryNumber!:",
"name!:",
"status!:"
],
"comment": "Update DTO: all fields are optional ('?' suffix with @IsOptional())."
}
}
}

View File

@@ -0,0 +1,57 @@
{
"entity": "Equipment",
"resource": "equipment",
"files": {
"list": {
"path": "client/src/resources/equipment/EquipmentList.tsx",
"must_contain": [
"List",
"FilterButton",
"TextField",
"inventoryNumber"
],
"must_match_regex": [
"SelectArrayInput",
"source=\"status\""
],
"comment": "Equipment list must expose filter UI directly and keep enum filters."
},
"create": {
"path": "client/src/resources/equipment/EquipmentCreate.tsx",
"must_contain": [
"Create",
"SimpleForm",
"SelectInput"
],
"must_match_regex": [
"NumberInput[\\s\\S]{0,300}source=\"totalEngineHours\"|source=\"totalEngineHours\"[\\s\\S]{0,300}NumberInput",
"DateInput[\\s\\S]{0,300}source=\"dateOfInspection\"|source=\"dateOfInspection\"[\\s\\S]{0,300}DateInput",
"SelectInput[\\s\\S]{0,300}source=\"status\"|source=\"status\"[\\s\\S]{0,300}SelectInput"
],
"comment": "Equipment create form must keep type-correct inputs for enum, date, and decimal/number fields."
},
"edit": {
"path": "client/src/resources/equipment/EquipmentEdit.tsx",
"must_contain": [
"Edit",
"SimpleForm",
"SelectInput"
],
"must_match_regex": [
"NumberInput[\\s\\S]{0,300}source=\"totalEngineHours\"|source=\"totalEngineHours\"[\\s\\S]{0,300}NumberInput",
"DateInput[\\s\\S]{0,300}source=\"dateOfInspection\"|source=\"dateOfInspection\"[\\s\\S]{0,300}DateInput"
],
"comment": "Equipment edit form must keep the same type-correctness guarantees as create."
},
"show": {
"path": "client/src/resources/equipment/EquipmentShow.tsx",
"must_contain": [
"Show",
"SimpleShowLayout",
"TextField",
"inventoryNumber"
],
"comment": "Show must display key fields including inventoryNumber."
}
}
}

View File

@@ -0,0 +1,15 @@
{
"entity": "Equipment",
"kebab": "equipment",
"resource": "equipment",
"description": "Standard entity: UUID primary key, multiple enum fields, decimal fields, date fields, no FK reference to other entities",
"tests": [
"dto-decorator-coverage",
"auth-guards-per-http-method",
"content-range-header-pattern",
"enum-filter-in-operator",
"q-filter-contains-pattern",
"react-admin-component-types",
"class-validator-import"
]
}

View File

@@ -0,0 +1,62 @@
{
"entity": "CategoryResource",
"files": {
"controller": {
"path": "server/src/modules/category-resource/category-resource.controller.ts",
"must_contain": [
"@Controller('category-resources')",
"@UseGuards(",
"JwtAuthGuard",
"@Get()",
"@Post()",
"@Get(':id')",
"@Patch(':id')",
"@Delete(':id')"
],
"must_not_contain": [
"@Put(':id')"
],
"must_match_regex": [
"@Delete\\(':id'\\)[\\s\\S]{0,120}@Roles\\('admin'\\)|@Roles\\('admin'\\)[\\s\\S]{0,120}@Delete\\(':id'\\)"
]
},
"service": {
"path": "server/src/modules/category-resource/category-resource.service.ts",
"must_contain": [
"setListHeaders",
"_start",
"_end",
"partId",
"employeeCode"
],
"must_match_regex": [
"part:\\s*\\{\\s*is:\\s*\\{\\s*name",
"employee:\\s*\\{\\s*is:\\s*\\{\\s*fullName"
]
},
"create_dto": {
"path": "server/src/modules/category-resource/dto/create-category-resource.dto.ts",
"must_contain": [
"from 'class-validator'",
"partId?:",
"employeeCode?:",
"@IsUUID(",
"@IsString(",
"@IsOptional("
],
"must_not_contain": [
"id?:",
"id!:"
]
},
"update_dto": {
"path": "server/src/modules/category-resource/dto/update-category-resource.dto.ts",
"must_contain": [
"from 'class-validator'",
"@IsOptional(",
"partId?:",
"employeeCode?:"
]
}
}
}

View File

@@ -0,0 +1,53 @@
{
"entity": "CategoryResource",
"resource": "category-resources",
"files": {
"list": {
"path": "client/src/resources/category-resource/CategoryResourceList.tsx",
"must_contain": [
"List",
"FilterButton",
"ReferenceField"
],
"must_match_regex": [
"ReferenceField[\\s\\S]{0,200}reference=\"parts\"|reference=\"parts\"[\\s\\S]{0,200}ReferenceField",
"ReferenceField[\\s\\S]{0,200}reference=\"employees\"|reference=\"employees\"[\\s\\S]{0,200}ReferenceField"
]
},
"create": {
"path": "client/src/resources/category-resource/CategoryResourceCreate.tsx",
"must_contain": [
"Create",
"SimpleForm"
],
"must_match_regex": [
"ReferenceInput[\\s\\S]{0,200}reference=\"parts\"|reference=\"parts\"[\\s\\S]{0,200}ReferenceInput",
"ReferenceInput[\\s\\S]{0,200}reference=\"employees\"|reference=\"employees\"[\\s\\S]{0,200}ReferenceInput",
"AutocompleteInput[\\s\\S]{0,200}filterToQuery|filterToQuery[\\s\\S]{0,200}AutocompleteInput"
]
},
"edit": {
"path": "client/src/resources/category-resource/CategoryResourceEdit.tsx",
"must_contain": [
"Edit",
"SimpleForm"
],
"must_match_regex": [
"ReferenceInput[\\s\\S]{0,200}reference=\"parts\"|reference=\"parts\"[\\s\\S]{0,200}ReferenceInput",
"ReferenceInput[\\s\\S]{0,200}reference=\"employees\"|reference=\"employees\"[\\s\\S]{0,200}ReferenceInput"
]
},
"show": {
"path": "client/src/resources/category-resource/CategoryResourceShow.tsx",
"must_contain": [
"Show",
"SimpleShowLayout",
"ReferenceField"
],
"must_match_regex": [
"ReferenceField[\\s\\S]{0,200}reference=\"parts\"|reference=\"parts\"[\\s\\S]{0,200}ReferenceField",
"ReferenceField[\\s\\S]{0,200}reference=\"employees\"|reference=\"employees\"[\\s\\S]{0,200}ReferenceField"
]
}
}
}

View File

@@ -0,0 +1,13 @@
{
"entity": "CategoryResource",
"kebab": "category-resource",
"resource": "category-resources",
"description": "Current FK-heavy entity: UUID PK with references to Part and Employee. Tests reference wiring, autocomplete filters, and protected CRUD routes.",
"tests": [
"dto-decorator-coverage",
"auth-guards",
"fk-reference-input",
"fk-reference-field",
"content-range-header"
]
}

184
tools/eval/run-evals.mjs Normal file
View File

@@ -0,0 +1,184 @@
#!/usr/bin/env node
/**
* tools/eval/run-evals.mjs
*
* Rule 6 — Eval harness: fixture-based regression tests for generated artifacts.
*
* Philosophy:
* - Evals are the test suite for prompts. Never ship a prompt change without
* running evals first.
* - Use deterministic pattern/regex checks ("reference-free" grading) rather
* than golden snapshot comparison. Patterns are maintainable; snapshots are
* brittle.
* - Eval-driven development: write a failing eval FIRST, then update the prompt
* or re-generate to make it pass.
*
* Usage:
* node tools/eval/run-evals.mjs # run all fixtures
* node tools/eval/run-evals.mjs --entity equipment
* node tools/eval/run-evals.mjs --verbose
*/
import { existsSync, readFileSync, readdirSync } from 'node:fs';
import path from 'node:path';
import { fileURLToPath } from 'node:url';
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Two levels above tools/eval; fixture `path` entries are resolved against it.
const rootDir = path.resolve(__dirname, '../..');
const fixturesDir = path.join(__dirname, 'fixtures');
const args = new Set(process.argv.slice(2));
const verbose = args.has('--verbose') || args.has('-v');

/**
 * Extracts the value of the `--entity` option from a list of CLI arguments.
 *
 * Accepts both `--entity <name>` and `--entity=<name>`; the first occurrence wins.
 *
 * @param {string[]} argv - CLI arguments (without the node binary and script path).
 * @returns {string|null} The requested entity name, or null when the option is absent.
 * @throws {Error} When `--entity` is given without a value (end of arguments,
 *   empty `--entity=`, or followed by another flag).
 */
function parseEntityFilter(argv) {
  const inlinePrefix = '--entity=';
  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];
    if (arg === '--entity') {
      const value = argv[i + 1];
      // A following flag (e.g. `--entity --verbose`) is not an entity name.
      if (value === undefined || value.startsWith('-')) {
        throw new Error('--entity requires a value, e.g. --entity equipment');
      }
      return value;
    }
    if (arg.startsWith(inlinePrefix)) {
      const value = arg.slice(inlinePrefix.length);
      if (value === '') {
        throw new Error('--entity requires a value, e.g. --entity=equipment');
      }
      return value;
    }
  }
  return null;
}

// A malformed --entity must not silently fall back to "run everything".
const entityFilter = (() => {
  try {
    return parseEntityFilter(process.argv.slice(2));
  } catch (err) {
    console.error(err.message);
    process.exit(2);
  }
})();
// ---------------------------------------------------------------------------
// Assertion engine
// ---------------------------------------------------------------------------
// Running totals and failure records, mutated by runFileAssertions and read by the report.
let totalChecks = 0;
let totalFailures = 0;
const failures = [];
/**
 * Loads a generated artifact as UTF-8 text.
 *
 * @param {string} relativePath - Path of the artifact relative to `rootDir`.
 * @returns {string|null} The file contents, or null when nothing exists at that path.
 */
function readArtifact(relativePath) {
  const absolutePath = path.join(rootDir, relativePath);
  return existsSync(absolutePath) ? readFileSync(absolutePath, 'utf8') : null;
}
/**
 * Runs every assertion declared for one generated file and records the outcome
 * in the module-level counters (`totalChecks`, `totalFailures`) and `failures` list.
 *
 * A missing file, or a fixture entry without a `path`, counts as one failed check
 * and skips the remaining assertions for that entry.
 *
 * @param {string} filePath - Artifact path relative to the repo root (the entry's `path`).
 * @param {object} fileSpec - Fixture entry; may contain `must_contain`,
 *   `must_not_contain`, `must_match_regex`, `must_not_match_regex` (string arrays).
 * @param {string} entityLabel - Label used in verbose output and failure records.
 * @returns {void}
 */
function runFileAssertions(filePath, fileSpec, entityLabel) {
  const recordFailure = (check, result, detail) => {
    totalFailures++;
    failures.push({ entity: entityLabel, file: filePath, check, result, detail });
  };

  // Report a fixture entry without a usable `path` instead of crashing in path.join.
  if (typeof filePath !== 'string' || filePath === '') {
    totalChecks++;
    recordFailure('file-exists', 'ERROR', 'Fixture entry has no "path" string');
    return;
  }

  const content = readArtifact(filePath);
  if (content === null) {
    totalChecks++;
    recordFailure('file-exists', 'FAIL', `File not found: ${filePath}`);
    return;
  }

  if (verbose) {
    console.log(` [${entityLabel}] Checking ${filePath}`);
  }

  for (const expected of fileSpec.must_contain ?? []) {
    totalChecks++;
    if (!content.includes(expected)) {
      recordFailure('must_contain', 'FAIL', `Missing: ${expected}`);
    }
  }

  for (const forbidden of fileSpec.must_not_contain ?? []) {
    totalChecks++;
    if (content.includes(forbidden)) {
      recordFailure('must_not_contain', 'FAIL', `Forbidden pattern found: ${forbidden}`);
    }
  }

  // Shared driver for both regex lists. Patterns are compiled without flags,
  // so `.` does not match newlines; fixtures use `[\s\S]` to span lines.
  const checkPatterns = (patterns, check, shouldMatch, failureLabel) => {
    for (const patternStr of patterns ?? []) {
      totalChecks++;
      let re;
      try {
        re = new RegExp(patternStr);
      } catch (e) {
        // An uncompilable pattern is a fixture error, reported separately from a mismatch.
        recordFailure(check, 'ERROR', `Bad regex: ${patternStr} — ${e.message}`);
        continue;
      }
      if (re.test(content) !== shouldMatch) {
        recordFailure(check, 'FAIL', `${failureLabel}: ${patternStr}`);
      }
    }
  };

  checkPatterns(fileSpec.must_match_regex, 'must_match_regex', true, 'Regex not matched');
  checkPatterns(fileSpec.must_not_match_regex, 'must_not_match_regex', false, 'Forbidden regex matched');
}
/**
 * Reads and parses a fixture JSON file, naming the file in any parse error.
 *
 * @param {string} filePath - Absolute path of the JSON file.
 * @returns {any} The parsed JSON value.
 * @throws {SyntaxError} When the file is not valid JSON (message prefixed with the path).
 */
function readFixtureJson(filePath) {
  try {
    return JSON.parse(readFileSync(filePath, 'utf8'));
  } catch (err) {
    // JSON.parse errors carry no file name; add it so a broken fixture is easy to find.
    err.message = `${filePath}: ${err.message}`;
    throw err;
  }
}

/**
 * Runs the backend and frontend assertion files of one fixture directory.
 *
 * Directories without `meta.json` are skipped. When `--entity` is active, the
 * fixture runs only if the filter equals `meta.kebab` exactly or equals the
 * entity name case-insensitively.
 *
 * @param {string} fixtureDir - Absolute path of the fixture directory.
 * @returns {void}
 */
function runFixture(fixtureDir) {
  const metaPath = path.join(fixtureDir, 'meta.json');
  if (!existsSync(metaPath)) return;

  const meta = readFixtureJson(metaPath);
  const { entity, kebab } = meta;
  // Fall back to kebab / directory name so a meta.json without `entity`
  // neither crashes the filter nor produces "undefined/..." labels.
  const label = typeof entity === 'string' && entity !== ''
    ? entity
    : (kebab ?? path.basename(fixtureDir));

  if (
    entityFilter &&
    kebab !== entityFilter &&
    String(label).toLowerCase() !== entityFilter.toLowerCase()
  ) {
    return;
  }

  if (verbose) {
    const description = meta.description ? ` — ${meta.description}` : '';
    console.log(`\n[EVAL] ${label}${description}`);
  }

  // Backend first, then frontend; either file may be absent.
  for (const specName of ['backend.assertions.json', 'frontend.assertions.json']) {
    const specPath = path.join(fixtureDir, specName);
    if (!existsSync(specPath)) continue;
    const spec = readFixtureJson(specPath);
    for (const [key, fileSpec] of Object.entries(spec.files ?? {})) {
      runFileAssertions(fileSpec.path, fileSpec, `${label}/${key}`);
    }
  }
}
// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------
// Every sub-directory of the fixtures folder is treated as one fixture;
// plain files next to them are ignored.
const fixtureDirs = [];
for (const entry of readdirSync(fixturesDir, { withFileTypes: true })) {
  if (entry.isDirectory()) {
    fixtureDirs.push(path.join(fixturesDir, entry.name));
  }
}
// Run the fixtures one by one, in directory-listing order.
fixtureDirs.forEach((dir) => {
  runFixture(dir);
});
// ---------------------------------------------------------------------------
// Report
// ---------------------------------------------------------------------------
// Summary banner: fixture directories discovered and check totals.
console.log('');
console.log('══════════════════════════════════════════════');
console.log(' KIS-TOiR Eval Report');
console.log('══════════════════════════════════════════════');
console.log(` Fixtures: ${fixtureDirs.length}`);
console.log(` Checks: ${totalChecks}`);
console.log(` Passed: ${totalChecks - totalFailures}`);
console.log(` Failed: ${totalFailures}`);
console.log('══════════════════════════════════════════════');
if (failures.length > 0) {
  console.log('');
  console.log('Failures:');
  for (const f of failures) {
    // Separator keeps the entity label and the file path readable.
    console.log(` [${f.result}] ${f.entity} — ${f.file}`);
    console.log(` ${f.check}: ${f.detail}`);
  }
  console.log('');
  console.log('To fix: update the prompt or re-generate the failing entity, then re-run evals.');
  console.log('To update a fixture (intentional change): edit tools/eval/fixtures/<entity>/*.assertions.json');
  console.log('');
  // Non-zero exit status signals failure to callers such as the pre-commit hook.
  process.exit(1);
}
// A mistyped --entity would otherwise run zero checks and report success.
if (entityFilter && totalChecks === 0) {
  console.log('');
  console.error(`No checks ran for --entity ${entityFilter}: no matching fixture (or it declares no assertions).`);
  console.log('');
  process.exit(1);
}
console.log('');
console.log('All evals passed.');
console.log('');