Files
autogame-17_capability-evolver/test/bench.test.js

350 lines
15 KiB
JavaScript
Raw Normal View History

'use strict';
const { describe, it, beforeEach, afterEach } = require('node:test');
const assert = require('node:assert/strict');
const fs = require('fs');
const path = require('path');
const os = require('os');
// ---------------------------------------------------------------------------
// evolver-bench: quantitative benchmark for evolution effectiveness
// Measures: gene selection accuracy, failure distillation quality,
// signal extraction recall, anti-pattern avoidance
// ---------------------------------------------------------------------------
const { selectGene, selectGeneAndCapsule } = require('../src/gep/selector');
const { extractSignals } = require('../src/gep/signals');
const {
collectFailureDistillationData,
analyzeFailurePatterns,
synthesizeRepairGeneFromFailures,
autoDistillFromFailures,
validateSynthesizedGene,
shouldDistillFromFailures,
REPAIR_DISTILLED_ID_PREFIX,
FAILURE_DISTILLER_MIN_CAPSULES,
} = require('../src/gep/skillDistiller');
// Path of the per-test sandbox directory; assigned by setupTempEnv() and
// removed again by teardownTempEnv().
let tmpDir;
// Snapshot of the environment variables taken by setupTempEnv(); it is
// replayed by teardownTempEnv() to put process.env back the way it was.
let savedEnv = {};
/**
 * Create a fresh temporary sandbox and point the directory-related
 * environment variables at it.
 *
 * Side effects:
 *  - assigns the module-level `tmpDir` (sandbox root) and `savedEnv`
 *    (previous values of every tracked variable, `undefined` when unset);
 *  - overwrites GEP_ASSETS_DIR, EVOLUTION_DIR, MEMORY_DIR and
 *    MEMORY_GRAPH_PATH in process.env;
 *  - creates the assets, evolution and memory directories on disk.
 *
 * SKILL_DISTILLER and FAILURE_DISTILLER are only snapshotted here, never
 * modified, so that teardownTempEnv() can undo changes made by a test.
 */
function setupTempEnv() {
  const sandboxRoot = fs.mkdtempSync(path.join(os.tmpdir(), 'evolver-bench-'));
  tmpDir = sandboxRoot;

  // Record the current value of every variable a test may touch.
  const trackedNames = [
    'GEP_ASSETS_DIR',
    'EVOLUTION_DIR',
    'MEMORY_DIR',
    'MEMORY_GRAPH_PATH',
    'SKILL_DISTILLER',
    'FAILURE_DISTILLER',
  ];
  const snapshot = {};
  for (const name of trackedNames) {
    snapshot[name] = process.env[name];
  }
  savedEnv = snapshot;

  // Redirect the directory variables into the sandbox.
  const evolutionDir = path.join(sandboxRoot, 'evolution');
  process.env.GEP_ASSETS_DIR = path.join(sandboxRoot, 'assets');
  process.env.EVOLUTION_DIR = evolutionDir;
  process.env.MEMORY_DIR = path.join(sandboxRoot, 'memory');
  process.env.MEMORY_GRAPH_PATH = path.join(evolutionDir, 'memory_graph.jsonl');

  // Materialize the three directories (the memory graph file itself is not created).
  const directories = [
    process.env.GEP_ASSETS_DIR,
    process.env.EVOLUTION_DIR,
    process.env.MEMORY_DIR,
  ];
  for (const directory of directories) {
    fs.mkdirSync(directory, { recursive: true });
  }
}
/**
 * Undo setupTempEnv(): restore every environment variable recorded in the
 * module-level `savedEnv`, then delete the sandbox directory `tmpDir`.
 *
 * Variables whose recorded value is `undefined` are removed from
 * process.env; all others are set back to the recorded value.
 */
function teardownTempEnv() {
  for (const [name, previous] of Object.entries(savedEnv)) {
    if (previous === undefined) {
      delete process.env[name];
    } else {
      process.env[name] = previous;
    }
  }

  try {
    fs.rmSync(tmpDir, { recursive: true, force: true });
  } catch (cleanupError) {
    // Deliberately ignored: removing the sandbox is best-effort and a
    // leftover temp directory must not fail the test that just ran.
  }
}
/**
 * Serialize `data` as 2-space-indented JSON and write it to `filePath`,
 * creating any missing parent directories first.
 *
 * @param {string} filePath - Destination file; overwritten if it exists.
 * @param {*} data - Value handed to JSON.stringify.
 */
function writeJson(filePath, data) {
  const parentDir = path.dirname(filePath);
  fs.mkdirSync(parentDir, { recursive: true });

  const serialized = JSON.stringify(data, null, 2);
  fs.writeFileSync(filePath, serialized, { encoding: 'utf8' });
}
/**
 * Build a failed-capsule fixture object.
 *
 * Falsy optional arguments fall back to defaults: `trigger` to ['error'],
 * `failureReason` to 'constraint violation', and both `learningSignals` and
 * `violations` to an empty array. The outcome (failed, score 0.2) and blast
 * radius (3 files, 50 lines) are fixed; `created_at` is the current time in
 * ISO-8601 form.
 *
 * @param {string} id - Capsule id.
 * @param {string} gene - Id of the gene the capsule is attributed to.
 * @param {string[]} [trigger] - Trigger signals.
 * @param {string} [failureReason] - Failure description.
 * @param {string[]} [learningSignals] - Learning signals.
 * @param {string[]} [violations] - Constraint violations.
 * @returns {object} The capsule record.
 */
function makeFailedCapsule(id, gene, trigger, failureReason, learningSignals, violations) {
  const effectiveTrigger = trigger ? trigger : ['error'];
  const reason = failureReason ? failureReason : 'constraint violation';
  const signals = learningSignals ? learningSignals : [];
  const violated = violations ? violations : [];
  const timestamp = new Date().toISOString();

  // Key order is kept stable because the object is serialized to disk by callers.
  return {
    type: 'Capsule',
    id,
    gene,
    trigger: effectiveTrigger,
    outcome: { status: 'failed', score: 0.2 },
    failure_reason: reason,
    learning_signals: signals,
    constraint_violations: violated,
    blast_radius: { files: 3, lines: 50 },
    created_at: timestamp,
  };
}
// ---------------------------------------------------------------------------
// Bench 1: Gene Selection Accuracy
// ---------------------------------------------------------------------------
describe('bench: gene selection accuracy', () => {
  // Small builder so the fixture table below reads as id / category / signals / strategy.
  const makeGene = (id, category, signalsMatch, strategy) => ({
    type: 'Gene',
    id,
    category,
    signals_match: signalsMatch,
    strategy,
  });

  // Candidate pool shared by both tests in this suite.
  const genePool = [
    makeGene('gene_repair', 'repair', ['error', 'exception', 'failed', 'unstable'], ['fix']),
    makeGene('gene_optimize', 'optimize', ['protocol', 'gep', 'prompt', 'audit'], ['optimize']),
    makeGene('gene_innovate', 'innovate', ['user_feature_request', 'capability_gap', 'stable_success_plateau'], ['build']),
    makeGene('gene_perf', 'optimize', ['perf_bottleneck', 'latency', 'throughput', 'slow'], ['speed up']),
  ];

  // Each scenario pairs an input signal set with the gene id expected to win.
  const scenarios = [
    { signals: ['error', 'exception'], expected: 'gene_repair', label: 'error signals -> repair' },
    { signals: ['protocol', 'audit'], expected: 'gene_optimize', label: 'protocol signals -> optimize' },
    { signals: ['user_feature_request', 'capability_gap'], expected: 'gene_innovate', label: 'feature request -> innovate' },
    { signals: ['perf_bottleneck', 'latency'], expected: 'gene_perf', label: 'perf signals -> perf optimize' },
    { signals: ['failed', 'unstable'], expected: 'gene_repair', label: 'failure signals -> repair' },
    { signals: ['gep', 'prompt'], expected: 'gene_optimize', label: 'prompt signals -> optimize' },
  ];

  it('achieves >= 80% selection accuracy on standard signal scenarios', () => {
    let hits = 0;
    for (const scenario of scenarios) {
      const outcome = selectGene(genePool, scenario.signals, { effectivePopulationSize: 100 });
      if (outcome.selected && outcome.selected.id === scenario.expected) {
        hits += 1;
      }
    }

    const accuracy = hits / scenarios.length;
    assert.ok(accuracy >= 0.8, 'Gene selection accuracy ' + (accuracy * 100).toFixed(1) + '% < 80% threshold');
  });

  it('never selects a banned gene', () => {
    const banned = new Set(['gene_repair']);

    // Repeat the selection 20 times; a run that selects nothing is accepted.
    for (let attempt = 0; attempt < 20; attempt += 1) {
      const outcome = selectGene(genePool, ['error', 'exception'], {
        bannedGeneIds: banned,
        effectivePopulationSize: 100,
      });
      if (!outcome.selected) {
        continue;
      }
      assert.ok(!banned.has(outcome.selected.id), 'Selected banned gene: ' + outcome.selected.id);
    }
  });
});
// ---------------------------------------------------------------------------
// Bench 2: Signal Extraction Recall
// ---------------------------------------------------------------------------
describe('bench: signal extraction recall', () => {
  it('extracts error signals from log-like input', () => {
    const input = {
      recentSessionTranscript: '',
      todayLog: '[error] Module X failed to load\n[error] Database connection timeout\nException in thread main',
      memorySnippet: '',
      userSnippet: '',
    };

    const signals = extractSignals(input);

    assert.ok(signals.includes('log_error'), 'Should detect log_error from [error] lines');
  });

  it('extracts feature request signals', () => {
    const input = {
      recentSessionTranscript: '',
      todayLog: '',
      memorySnippet: '',
      userSnippet: 'I want a dark mode toggle in the settings panel',
    };

    const signals = extractSignals(input);

    // Either signal family counts as recognizing the request.
    const acceptedMarkers = ['user_feature_request', 'user_improvement_suggestion'];
    const hasFeatureSignal = signals.some((signal) =>
      acceptedMarkers.some((marker) => signal.includes(marker))
    );
    assert.ok(hasFeatureSignal, 'Should detect feature request from user input');
  });

  it('detects stagnation signals', () => {
    // Six successful events that changed nothing (zero files, zero lines).
    const recentEvents = Array.from({ length: 6 }, () => ({
      type: 'EvolutionEvent',
      outcome: { status: 'success', score: 0.85 },
      blast_radius: { files: 0, lines: 0 },
      signals: [],
    }));

    const signals = extractSignals({
      recentSessionTranscript: '',
      todayLog: '',
      memorySnippet: '',
      userSnippet: '',
      recentEvents,
    });

    const stagnationMarkers = ['empty_cycle', 'stagnation', 'steady_state'];
    const hasStagnation = signals.some((signal) =>
      stagnationMarkers.some((marker) => signal.includes(marker))
    );

    // Lenient on purpose: an empty signal list is also accepted.
    assert.ok(hasStagnation || signals.length === 0, 'Stagnation detection assessed (may not trigger with minimal events)');
  });
});
// ---------------------------------------------------------------------------
// Bench 3: Failure Distillation Quality
// ---------------------------------------------------------------------------
describe('bench: failure distillation quality', () => {
  beforeEach(setupTempEnv);
  afterEach(teardownTempEnv);

  // Paths are resolved at call time: GEP_ASSETS_DIR only points at the
  // sandbox once the beforeEach hook has run.
  const assetPath = (fileName) => path.join(process.env.GEP_ASSETS_DIR, fileName);

  // Persist the given capsules as the failed-capsule store.
  const storeFailures = (capsules) => {
    writeJson(assetPath('failed_capsules.json'), { version: 1, failed_capsules: capsules });
  };

  // Persist an empty gene store.
  const storeEmptyGenes = () => {
    writeJson(assetPath('genes.json'), { version: 1, genes: [] });
  };

  // Build `count` capsules with ids f0..f(count-1) using the supplied factory.
  const repeatCapsules = (count, build) =>
    Array.from({ length: count }, (_, index) => build('f' + index));

  it('collects and groups failed capsules correctly', () => {
    storeFailures([
      makeFailedCapsule('f1', 'gene_repair', ['error', 'crash'], 'blast_radius_exceeded', ['problem:reliability'], ['blast_radius_exceeded']),
      makeFailedCapsule('f2', 'gene_repair', ['error', 'timeout'], 'blast_radius_exceeded', ['problem:reliability'], ['blast_radius_exceeded']),
      makeFailedCapsule('f3', 'gene_optimize', ['protocol'], 'validation_failed', ['problem:protocol'], ['validation_cmd_failed']),
    ]);

    const collected = collectFailureDistillationData();

    assert.equal(collected.failedCapsules.length, 3);
    assert.ok(Object.keys(collected.grouped).length >= 2);
  });

  it('identifies high-frequency failure patterns', () => {
    storeFailures(repeatCapsules(5, (capsuleId) => makeFailedCapsule(
      capsuleId, 'gene_repair', ['error', 'memory_leak'],
      'blast_radius_exceeded: too many files changed',
      ['problem:reliability', 'risk:validation'],
      ['blast_radius_exceeded', 'max_files_exceeded']
    )));

    const collected = collectFailureDistillationData();
    const patterns = analyzeFailurePatterns(collected);

    assert.ok(patterns.high_frequency_failures.length >= 1, 'Should detect high-frequency failure pattern');
    assert.ok(patterns.high_frequency_failures[0].count >= 2);
  });

  it('synthesizes a repair gene from failure patterns', () => {
    storeFailures(repeatCapsules(6, (capsuleId) => makeFailedCapsule(
      capsuleId, 'gene_repair', ['error', 'crash'],
      'blast_radius_exceeded',
      ['problem:reliability'],
      ['blast_radius_exceeded']
    )));

    const collected = collectFailureDistillationData();
    const patterns = analyzeFailurePatterns(collected);
    const gene = synthesizeRepairGeneFromFailures(collected, patterns, []);

    assert.ok(gene, 'Should produce a repair gene');
    assert.equal(gene.type, 'Gene');
    assert.equal(gene.category, 'repair');
    assert.ok(gene.id.startsWith(REPAIR_DISTILLED_ID_PREFIX) || gene.id.startsWith('gene_distilled_'), 'Gene id should have repair prefix');
    assert.ok(gene.strategy.length >= 4, 'Strategy should have guard steps');

    const mentionsGuard = gene.strategy.some((step) => step.includes('GUARD') || step.includes('guard'));
    assert.ok(mentionsGuard, 'Should include guard steps');
  });

  it('autoDistillFromFailures produces a gene when threshold met', () => {
    storeFailures(repeatCapsules(6, (capsuleId) => makeFailedCapsule(
      capsuleId, 'gene_repair', ['error', 'crash'],
      'blast_radius_exceeded',
      ['problem:reliability'],
      ['blast_radius_exceeded']
    )));
    storeEmptyGenes();

    const result = autoDistillFromFailures();

    assert.ok(result.ok, 'autoDistillFromFailures should succeed: ' + (result.reason || ''));
    assert.ok(result.gene);
    assert.equal(result.source, 'failure_distillation');

    // The distilled gene must also have been written back to genes.json.
    const persisted = JSON.parse(fs.readFileSync(assetPath('genes.json'), 'utf8'));
    const wasPersisted = persisted.genes.some((stored) => stored.id === result.gene.id);
    assert.ok(wasPersisted, 'Gene should be persisted');
  });

  it('returns insufficient_failures when below threshold', () => {
    storeFailures([
      makeFailedCapsule('f1', 'gene_repair', ['error'], 'blast_radius_exceeded', [], []),
    ]);

    const result = autoDistillFromFailures();

    assert.equal(result.ok, false);
    assert.equal(result.reason, 'insufficient_failures');
  });

  it('idempotent skip on repeated calls with same data', () => {
    storeFailures(repeatCapsules(6, (capsuleId) =>
      makeFailedCapsule(capsuleId, 'gene_repair', ['error'], 'blast_radius', ['problem:reliability'], ['blast_radius'])
    ));
    storeEmptyGenes();

    const firstRun = autoDistillFromFailures();
    assert.ok(firstRun.ok);

    // Same input, second pass: expected to be rejected as a repeat.
    const secondRun = autoDistillFromFailures();
    assert.equal(secondRun.ok, false);
    assert.equal(secondRun.reason, 'idempotent_skip');
  });
});
// ---------------------------------------------------------------------------
// Bench 4: Anti-pattern Avoidance
// ---------------------------------------------------------------------------
describe('bench: anti-pattern avoidance', () => {
  it('genes with anti-patterns score lower than clean genes', () => {
    // Both candidates match the same signal; they differ only in recorded history.
    const hardAntiPattern = () => ({ mode: 'hard', learning_signals: ['problem:reliability'] });
    const cleanSuccess = () => ({ outcome: 'success', mode: 'none' });

    const penalizedGene = {
      type: 'Gene',
      id: 'gene_risky',
      category: 'repair',
      signals_match: ['error'],
      anti_patterns: [hardAntiPattern(), hardAntiPattern()],
      strategy: ['fix'],
    };
    const cleanGene = {
      type: 'Gene',
      id: 'gene_safe',
      category: 'repair',
      signals_match: ['error'],
      learning_history: [cleanSuccess(), cleanSuccess()],
      strategy: ['fix safely'],
    };

    // The penalized gene is listed first, so list order alone would not pick 'gene_safe'.
    const candidates = [penalizedGene, cleanGene];
    const outcome = selectGene(candidates, ['error'], { effectivePopulationSize: 100 });

    assert.ok(outcome.selected);
    assert.equal(outcome.selected.id, 'gene_safe', 'Should prefer gene without anti-patterns');
  });
});
// ---------------------------------------------------------------------------
// Bench 5: shouldDistillFromFailures gate
// ---------------------------------------------------------------------------
describe('bench: shouldDistillFromFailures gate', () => {
  beforeEach(setupTempEnv);
  afterEach(teardownTempEnv);

  // Resolved per call because GEP_ASSETS_DIR is set by the beforeEach hook.
  const failedCapsulesFile = () => path.join(process.env.GEP_ASSETS_DIR, 'failed_capsules.json');

  it('returns false when FAILURE_DISTILLER=false', () => {
    process.env.FAILURE_DISTILLER = 'false';
    const verdict = shouldDistillFromFailures();
    assert.equal(verdict, false);
    delete process.env.FAILURE_DISTILLER;
  });

  it('returns false when not enough failures', () => {
    const singleFailure = [makeFailedCapsule('f1', 'g', ['e'], 'reason', [], [])];
    writeJson(failedCapsulesFile(), {
      version: 1, failed_capsules: singleFailure,
    });

    assert.equal(shouldDistillFromFailures(), false);
  });

  it('returns true when enough failures and no recent distillation', () => {
    // One capsule more than the configured minimum.
    const capsules = [];
    let index = 0;
    while (index < FAILURE_DISTILLER_MIN_CAPSULES + 1) {
      capsules.push(makeFailedCapsule('f' + index, 'gene_repair', ['error'], 'reason', [], []));
      index += 1;
    }
    writeJson(failedCapsulesFile(), { version: 1, failed_capsules: capsules });

    assert.equal(shouldDistillFromFailures(), true);
  });
});