(llm-first): context budget, validation, and eval harness, orchestration general-prompt

2026-04-03 14:17:21 +03:00
parent 79c9589658
commit c42a88dff6
189 changed files with 15538 additions and 9109 deletions
--- a/tools/eval/README.md
+++ b/tools/eval/README.md
@@ -0,0 +1,106 @@
+# Eval Harness — Rule 6
+
+Fixture-based regression tests for generated artifacts.
+
+## Why this exists
+
+> "Evals are the test suite for your prompts. You would never ship code without tests;
+> don't ship prompts without evals." — Anthropic Engineering
+
+The validation gate (`tools/validate-generation.mjs`) checks **existence** and **structural compliance**.
+The eval harness checks **semantic correctness**: are the right patterns present in the generated code?
+Do the generated files actually follow the rules in `prompts/`?
+
+Together they enforce:
+- Gate: "file exists, field names present, auth seams wired"
+- Evals: "DTO has class-validator decorators, FK uses ReferenceInput, date uses DateInput, guard is present"
+
+## Usage
+
+```bash
+# Run all evals
+npm run eval:generation
+
+# Run evals for one entity
+node tools/eval/run-evals.mjs --entity equipment
+
+# Verbose output (show each file being checked)
+node tools/eval/run-evals.mjs --verbose
+```
+
+## Fixture format
+
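+Each fixture lives in its own directory under `tools/eval/fixtures/`; the runner identifies it by the `entity`/`kebab` values in its `meta.json`, not by the directory name: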
+
+```
+fixtures/
+  equipment/
+    meta.json                  ← what this fixture tests
+    backend.assertions.json    ← patterns the NestJS files must satisfy
+    frontend.assertions.json   ← patterns the React Admin files must satisfy
+  repair-order/
+    meta.json
+    backend.assertions.json
+    frontend.assertions.json
+```
+
+### `meta.json`
+
+```json
+{
+  "entity": "Equipment",
+  "kebab": "equipment",
+  "resource": "equipment",
+  "description": "...",
+  "tests": ["dto-decorator-coverage", "auth-guards", ...]
+}
+```
+
+### `*.assertions.json`
+
+Each file entry supports:
+
+| Key | Type | Meaning |
+|-----|------|---------|
+| `path` | string | Relative path from repo root |
+| `must_contain` | string[] | Each string must appear as a literal substring |
+| `must_not_contain` | string[] | Each string must NOT appear |
+| `must_match_regex` | string[] | Each pattern must match (JavaScript `RegExp`, compiled without flags; use `[\s\S]` to span lines) |
+| `must_not_match_regex` | string[] | Each pattern must NOT match |
+| `comment` | string | Human-readable explanation of what is being tested |
+
+## Eval-driven development workflow
+
+This is the critical principle from Anthropic and Google:
+
+1. **Write the failing eval first.** When you change a prompt or add a rule, add an
+   assertion that captures the new expectation *before* re-generating.
+2. **Run evals**: `npm run eval:generation` → see failures.
+3. **Re-generate** the affected entity (following the generation workflow in `AGENTS.md`).
+4. **Run evals again**: all pass → the change is verified.
+5. **Commit both** the updated fixture and the regenerated artifacts together.
+
+A passing eval after a prompt change confirms the LLM followed the new rule.
+A previously passing eval that fails after a prompt change tells you exactly which prior contract was broken.
+
+## Adding a new entity fixture
+
+When adding a new entity to `domain/toir.api.dsl` and generating its backend + frontend:
+
+1. Create `tools/eval/fixtures/<kebab>/meta.json`
+2. Create `tools/eval/fixtures/<kebab>/backend.assertions.json` with at minimum:
+   - controller: `@Controller(...)`, `@UseGuards(`, `JwtAuthGuard`, HTTP methods
+   - create_dto: `from 'class-validator'`, required fields with `!:`, `@IsString(`, `@IsOptional(`
+   - update_dto: `from 'class-validator'`, fields with `?:`, `@IsOptional(`
+3. Create `tools/eval/fixtures/<kebab>/frontend.assertions.json` with at minimum:
+   - create: `ReferenceInput` for FK fields, `NumberInput` for numeric, `DateInput` for date, `SelectInput` for enum
+   - show: `ReferenceField` for FK fields, `DateField` for date
+4. Run `npm run eval:generation` to verify the fixture catches real issues.
+
+## Integration with git hooks
+
+The pre-commit hook (installed by `npm run install-hooks`) runs both:
+1. `node tools/validate-generation.mjs --artifacts-only` — existence gate
+2. `npm run eval:generation` — semantic eval gate
+
+Both must pass before a commit is accepted.
--- a/tools/eval/fixtures/equipment/backend.assertions.json
+++ b/tools/eval/fixtures/equipment/backend.assertions.json
@@ -0,0 +1,79 @@
+{
+  "entity": "Equipment",
+  "files": {
+    "controller": {
+      "path": "server/src/modules/equipment/equipment.controller.ts",
+      "must_contain": [
+        "@Controller('equipments')",
+        "@UseGuards(",
+        "JwtAuthGuard",
+        "@Get()",
+        "@Post()",
+        "@Get(':id')",
+        "@Patch(':id')",
+        "@Delete(':id')"
+      ],
+      "must_not_contain": [
+        "@Put(':id')",
+        "@Post(':id')"
+      ],
+      "must_match_regex": [
+        "@Delete\\(':id'\\)[\\s\\S]{0,80}@Roles\\('admin'\\)|@Roles\\('admin'\\)[\\s\\S]{0,80}@Delete\\(':id'\\)"
+      ],
+      "comment": "Equipment controller must expose the CRUD verbs expected by the DSL-compatible React Admin contract."
+    },
+    "service": {
+      "path": "server/src/modules/equipment/equipment.service.ts",
+      "must_contain": [
+        "setListHeaders(response",
+        "_start",
+        "_end",
+        "_sort",
+        "_order"
+      ],
+      "must_match_regex": [
+        "mode.*insensitive|insensitive.*mode",
+        "status.*in\\b|\\bin\\b.*status"
+      ],
+      "comment": "Service must translate React Admin list params into Prisma filters and delegate header wiring through the shared helper."
+    },
+    "create_dto": {
+      "path": "server/src/modules/equipment/dto/create-equipment.dto.ts",
+      "must_contain": [
+        "from 'class-validator'",
+        "inventoryNumber!:",
+        "name!:",
+        "equipmentType!:",
+        "periodicityTO!:",
+        "status!:",
+        "@IsString(",
+        "@IsOptional(",
+        "@IsEnum("
+      ],
+      "must_not_contain": [
+        "id?:",
+        "id!:"
+      ],
+      "comment": "Required fields use '!' suffix; optional fields use '?' with @IsOptional(); enum fields use @IsEnum(); class-validator must be imported."
+    },
+    "update_dto": {
+      "path": "server/src/modules/equipment/dto/update-equipment.dto.ts",
+      "must_contain": [
+        "from 'class-validator'",
+        "inventoryNumber?:",
+        "name?:",
+        "equipmentType?:",
+        "status?:",
+        "@IsOptional(",
+        "@IsString(",
+        "@IsEnum("
+      ],
+      "must_not_contain": [
+        "inventoryNumber!:",
+        "name!:",
+        "status!:"
+      ],
+      "comment": "Update DTO: all fields are optional ('?' suffix with @IsOptional())."
+    }
+  }
+}
--- a/tools/eval/fixtures/equipment/frontend.assertions.json
+++ b/tools/eval/fixtures/equipment/frontend.assertions.json
@@ -0,0 +1,57 @@
+{
+  "entity": "Equipment",
+  "resource": "equipment",
+  "files": {
+    "list": {
+      "path": "client/src/resources/equipment/EquipmentList.tsx",
+      "must_contain": [
+        "List",
+        "FilterButton",
+        "TextField",
+        "inventoryNumber"
+      ],
+      "must_match_regex": [
+        "SelectArrayInput",
+        "source=\"status\""
+      ],
+      "comment": "Equipment list must expose filter UI directly and keep enum filters."
+    },
+    "create": {
+      "path": "client/src/resources/equipment/EquipmentCreate.tsx",
+      "must_contain": [
+        "Create",
+        "SimpleForm",
+        "SelectInput"
+      ],
+      "must_match_regex": [
+        "NumberInput[\\s\\S]{0,300}source=\"totalEngineHours\"|source=\"totalEngineHours\"[\\s\\S]{0,300}NumberInput",
+        "DateInput[\\s\\S]{0,300}source=\"dateOfInspection\"|source=\"dateOfInspection\"[\\s\\S]{0,300}DateInput",
+        "SelectInput[\\s\\S]{0,300}source=\"status\"|source=\"status\"[\\s\\S]{0,300}SelectInput"
+      ],
+      "comment": "Equipment create form must keep type-correct inputs for enum, date, and decimal/number fields."
+    },
+    "edit": {
+      "path": "client/src/resources/equipment/EquipmentEdit.tsx",
+      "must_contain": [
+        "Edit",
+        "SimpleForm",
+        "SelectInput"
+      ],
+      "must_match_regex": [
+        "NumberInput[\\s\\S]{0,300}source=\"totalEngineHours\"|source=\"totalEngineHours\"[\\s\\S]{0,300}NumberInput",
+        "DateInput[\\s\\S]{0,300}source=\"dateOfInspection\"|source=\"dateOfInspection\"[\\s\\S]{0,300}DateInput"
+      ],
+      "comment": "Equipment edit form must keep the same type-correctness guarantees as create."
+    },
+    "show": {
+      "path": "client/src/resources/equipment/EquipmentShow.tsx",
+      "must_contain": [
+        "Show",
+        "SimpleShowLayout",
+        "TextField",
+        "inventoryNumber"
+      ],
+      "comment": "Show must display key fields including inventoryNumber."
+    }
+  }
+}
--- a/tools/eval/fixtures/equipment/meta.json
+++ b/tools/eval/fixtures/equipment/meta.json
@@ -0,0 +1,15 @@
+{
+  "entity": "Equipment",
+  "kebab": "equipment",
+  "resource": "equipment",
+  "description": "Standard entity: UUID primary key, multiple enum fields, decimal fields, date fields, no FK reference to other entities",
+  "tests": [
+    "dto-decorator-coverage",
+    "auth-guards-per-http-method",
+    "content-range-header-pattern",
+    "enum-filter-in-operator",
+    "q-filter-contains-pattern",
+    "react-admin-component-types",
+    "class-validator-import"
+  ]
+}
--- a/tools/eval/fixtures/repair-order/backend.assertions.json
+++ b/tools/eval/fixtures/repair-order/backend.assertions.json
@@ -0,0 +1,62 @@
+{
+  "entity": "CategoryResource",
+  "files": {
+    "controller": {
+      "path": "server/src/modules/category-resource/category-resource.controller.ts",
+      "must_contain": [
+        "@Controller('category-resources')",
+        "@UseGuards(",
+        "JwtAuthGuard",
+        "@Get()",
+        "@Post()",
+        "@Get(':id')",
+        "@Patch(':id')",
+        "@Delete(':id')"
+      ],
+      "must_not_contain": [
+        "@Put(':id')"
+      ],
+      "must_match_regex": [
+        "@Delete\\(':id'\\)[\\s\\S]{0,120}@Roles\\('admin'\\)|@Roles\\('admin'\\)[\\s\\S]{0,120}@Delete\\(':id'\\)"
+      ]
+    },
+    "service": {
+      "path": "server/src/modules/category-resource/category-resource.service.ts",
+      "must_contain": [
+        "setListHeaders",
+        "_start",
+        "_end",
+        "partId",
+        "employeeCode"
+      ],
+      "must_match_regex": [
+        "part:\\s*\\{\\s*is:\\s*\\{\\s*name",
+        "employee:\\s*\\{\\s*is:\\s*\\{\\s*fullName"
+      ]
+    },
+    "create_dto": {
+      "path": "server/src/modules/category-resource/dto/create-category-resource.dto.ts",
+      "must_contain": [
+        "from 'class-validator'",
+        "partId?:",
+        "employeeCode?:",
+        "@IsUUID(",
+        "@IsString(",
+        "@IsOptional("
+      ],
+      "must_not_contain": [
+        "id?:",
+        "id!:"
+      ]
+    },
+    "update_dto": {
+      "path": "server/src/modules/category-resource/dto/update-category-resource.dto.ts",
+      "must_contain": [
+        "from 'class-validator'",
+        "@IsOptional(",
+        "partId?:",
+        "employeeCode?:"
+      ]
+    }
+  }
+}
--- a/tools/eval/fixtures/repair-order/frontend.assertions.json
+++ b/tools/eval/fixtures/repair-order/frontend.assertions.json
@@ -0,0 +1,53 @@
+{
+  "entity": "CategoryResource",
+  "resource": "category-resources",
+  "files": {
+    "list": {
+      "path": "client/src/resources/category-resource/CategoryResourceList.tsx",
+      "must_contain": [
+        "List",
+        "FilterButton",
+        "ReferenceField"
+      ],
+      "must_match_regex": [
+        "ReferenceField[\\s\\S]{0,200}reference=\"parts\"|reference=\"parts\"[\\s\\S]{0,200}ReferenceField",
+        "ReferenceField[\\s\\S]{0,200}reference=\"employees\"|reference=\"employees\"[\\s\\S]{0,200}ReferenceField"
+      ]
+    },
+    "create": {
+      "path": "client/src/resources/category-resource/CategoryResourceCreate.tsx",
+      "must_contain": [
+        "Create",
+        "SimpleForm"
+      ],
+      "must_match_regex": [
+        "ReferenceInput[\\s\\S]{0,200}reference=\"parts\"|reference=\"parts\"[\\s\\S]{0,200}ReferenceInput",
+        "ReferenceInput[\\s\\S]{0,200}reference=\"employees\"|reference=\"employees\"[\\s\\S]{0,200}ReferenceInput",
+        "AutocompleteInput[\\s\\S]{0,200}filterToQuery|filterToQuery[\\s\\S]{0,200}AutocompleteInput"
+      ]
+    },
+    "edit": {
+      "path": "client/src/resources/category-resource/CategoryResourceEdit.tsx",
+      "must_contain": [
+        "Edit",
+        "SimpleForm"
+      ],
+      "must_match_regex": [
+        "ReferenceInput[\\s\\S]{0,200}reference=\"parts\"|reference=\"parts\"[\\s\\S]{0,200}ReferenceInput",
+        "ReferenceInput[\\s\\S]{0,200}reference=\"employees\"|reference=\"employees\"[\\s\\S]{0,200}ReferenceInput"
+      ]
+    },
+    "show": {
+      "path": "client/src/resources/category-resource/CategoryResourceShow.tsx",
+      "must_contain": [
+        "Show",
+        "SimpleShowLayout",
+        "ReferenceField"
+      ],
+      "must_match_regex": [
+        "ReferenceField[\\s\\S]{0,200}reference=\"parts\"|reference=\"parts\"[\\s\\S]{0,200}ReferenceField",
+        "ReferenceField[\\s\\S]{0,200}reference=\"employees\"|reference=\"employees\"[\\s\\S]{0,200}ReferenceField"
+      ]
+    }
+  }
+}
--- a/tools/eval/fixtures/repair-order/meta.json
+++ b/tools/eval/fixtures/repair-order/meta.json
@@ -0,0 +1,13 @@
+{
+  "entity": "CategoryResource",
+  "kebab": "category-resource",
+  "resource": "category-resources",
+  "description": "Current FK-heavy entity: UUID PK with references to Part and Employee. Tests reference wiring, autocomplete filters, and protected CRUD routes.",
+  "tests": [
+    "dto-decorator-coverage",
+    "auth-guards",
+    "fk-reference-input",
+    "fk-reference-field",
+    "content-range-header"
+  ]
+}
--- a/tools/eval/run-evals.mjs
+++ b/tools/eval/run-evals.mjs
@@ -0,0 +1,184 @@
+#!/usr/bin/env node
+/**
+ * tools/eval/run-evals.mjs
+ *
+ * Rule 6 — Eval harness: fixture-based regression tests for generated artifacts.
+ *
+ * Philosophy:
+ *   - Evals are the test suite for prompts. Never ship a prompt change without
+ *     running evals first.
+ *   - Use deterministic pattern/regex checks ("reference-free" grading) rather
+ *     than golden snapshot comparison. Patterns are maintainable; snapshots are
+ *     brittle.
+ *   - Eval-driven development: write a failing eval FIRST, then update the prompt
+ *     or re-generate to make it pass.
+ *
+ * Usage:
+ *   node tools/eval/run-evals.mjs              # run all fixtures
+ *   node tools/eval/run-evals.mjs --entity equipment
+ *   node tools/eval/run-evals.mjs --verbose
+ */
+
+import { existsSync, readFileSync, readdirSync } from 'node:fs';
+import path from 'node:path';
+import { fileURLToPath } from 'node:url';
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+const rootDir = path.resolve(__dirname, '../..');
+const fixturesDir = path.join(__dirname, 'fixtures');
+
+const args = new Set(process.argv.slice(2));
+const verbose = args.has('--verbose') || args.has('-v');
+const entityFilter = (() => {
+  const idx = process.argv.indexOf('--entity');
+  return idx !== -1 ? process.argv[idx + 1] : null;
+})();
+
+// ---------------------------------------------------------------------------
+// Assertion engine
+// ---------------------------------------------------------------------------
+
+let totalChecks = 0;
+let totalFailures = 0;
+const failures = [];
+
+function readArtifact(relativePath) {
+  const filePath = path.join(rootDir, relativePath);
+  if (!existsSync(filePath)) return null;
+  return readFileSync(filePath, 'utf8');
+}
+
+function runFileAssertions(filePath, fileSpec, entityLabel) {
+  const content = readArtifact(filePath);
+
+  if (content === null) {
+    totalChecks++;
+    totalFailures++;
+    failures.push({ entity: entityLabel, file: filePath, check: 'file-exists', result: 'FAIL', detail: `File not found: ${filePath}` });
+    return;
+  }
+
+  if (verbose) {
+    console.log(`  [${entityLabel}] Checking ${filePath}`);
+  }
+
+  for (const expected of fileSpec.must_contain ?? []) {
+    totalChecks++;
+    if (!content.includes(expected)) {
+      totalFailures++;
+      failures.push({ entity: entityLabel, file: filePath, check: 'must_contain', result: 'FAIL', detail: `Missing: ${expected}` });
+    }
+  }
+
+  for (const forbidden of fileSpec.must_not_contain ?? []) {
+    totalChecks++;
+    if (content.includes(forbidden)) {
+      totalFailures++;
+      failures.push({ entity: entityLabel, file: filePath, check: 'must_not_contain', result: 'FAIL', detail: `Forbidden pattern found: ${forbidden}` });
+    }
+  }
+
+  for (const patternStr of fileSpec.must_match_regex ?? []) {
+    totalChecks++;
+    try {
+      const re = new RegExp(patternStr);
+      if (!re.test(content)) {
+        totalFailures++;
+        failures.push({ entity: entityLabel, file: filePath, check: 'must_match_regex', result: 'FAIL', detail: `Regex not matched: ${patternStr}` });
+      }
+    } catch (e) {
+      totalFailures++;
+      failures.push({ entity: entityLabel, file: filePath, check: 'must_match_regex', result: 'ERROR', detail: `Bad regex: ${patternStr} — ${e.message}` });
+    }
+  }
+
+  for (const patternStr of fileSpec.must_not_match_regex ?? []) {
+    totalChecks++;
+    try {
+      const re = new RegExp(patternStr);
+      if (re.test(content)) {
+        totalFailures++;
+        failures.push({ entity: entityLabel, file: filePath, check: 'must_not_match_regex', result: 'FAIL', detail: `Forbidden regex matched: ${patternStr}` });
+      }
+    } catch (e) {
+      totalFailures++;
+      failures.push({ entity: entityLabel, file: filePath, check: 'must_not_match_regex', result: 'ERROR', detail: `Bad regex: ${patternStr} — ${e.message}` });
+    }
+  }
+}
+
+function runFixture(fixtureDir) {
+  const metaPath = path.join(fixtureDir, 'meta.json');
+  if (!existsSync(metaPath)) return;
+
+  const meta = JSON.parse(readFileSync(metaPath, 'utf8'));
+  const { entity, kebab } = meta;
+
+  if (entityFilter && kebab !== entityFilter && entity.toLowerCase() !== entityFilter.toLowerCase()) {
+    return;
+  }
+
+  if (verbose) {
+    console.log(`\n[EVAL] ${entity} — ${meta.description ?? ''}`);
+  }
+
+  const backendPath = path.join(fixtureDir, 'backend.assertions.json');
+  if (existsSync(backendPath)) {
+    const spec = JSON.parse(readFileSync(backendPath, 'utf8'));
+    for (const [key, fileSpec] of Object.entries(spec.files ?? {})) {
+      runFileAssertions(fileSpec.path, fileSpec, `${entity}/${key}`);
+    }
+  }
+
+  const frontendPath = path.join(fixtureDir, 'frontend.assertions.json');
+  if (existsSync(frontendPath)) {
+    const spec = JSON.parse(readFileSync(frontendPath, 'utf8'));
+    for (const [key, fileSpec] of Object.entries(spec.files ?? {})) {
+      runFileAssertions(fileSpec.path, fileSpec, `${entity}/${key}`);
+    }
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Main
+// ---------------------------------------------------------------------------
+
+const fixtureDirs = readdirSync(fixturesDir, { withFileTypes: true })
+  .filter((d) => d.isDirectory())
+  .map((d) => path.join(fixturesDir, d.name));
+
+for (const dir of fixtureDirs) {
+  runFixture(dir);
+}
+
+// ---------------------------------------------------------------------------
+// Report
+// ---------------------------------------------------------------------------
+
+console.log('');
+console.log('══════════════════════════════════════════════');
+console.log('  KIS-TOiR Eval Report');
+console.log('══════════════════════════════════════════════');
+console.log(`  Fixtures: ${fixtureDirs.length}`);
+console.log(`  Checks:   ${totalChecks}`);
+console.log(`  Passed:   ${totalChecks - totalFailures}`);
+console.log(`  Failed:   ${totalFailures}`);
+console.log('══════════════════════════════════════════════');
+
+if (failures.length > 0) {
+  console.log('');
+  console.log('Failures:');
+  for (const f of failures) {
+    console.log(`  [${f.result}] ${f.entity} — ${f.file}`);
+    console.log(`         ${f.check}: ${f.detail}`);
+  }
+  console.log('');
+  console.log('To fix: update the prompt or re-generate the failing entity, then re-run evals.');
+  console.log('To update a fixture (intentional change): edit tools/eval/fixtures/<entity>/*.assertions.json');
+  console.log('');
+  process.exit(1);
+}
+
+console.log('');
+console.log('All evals passed.');
+console.log('');