350 lines
15 KiB
JavaScript
350 lines
15 KiB
JavaScript
|
|
'use strict';
|
||
|
|
|
||
|
|
const { describe, it, beforeEach, afterEach } = require('node:test');
|
||
|
|
const assert = require('node:assert/strict');
|
||
|
|
const fs = require('fs');
|
||
|
|
const path = require('path');
|
||
|
|
const os = require('os');
|
||
|
|
|
||
|
|
// ---------------------------------------------------------------------------
// evolver-bench: quantitative benchmark for evolution effectiveness.
// Measures: gene selection accuracy, signal extraction recall,
// failure distillation quality, anti-pattern avoidance, and the
// shouldDistillFromFailures gate.
// ---------------------------------------------------------------------------
|
||
|
|
|
||
|
|
const { selectGene, selectGeneAndCapsule } = require('../src/gep/selector');
|
||
|
|
const { extractSignals } = require('../src/gep/signals');
|
||
|
|
const {
|
||
|
|
collectFailureDistillationData,
|
||
|
|
analyzeFailurePatterns,
|
||
|
|
synthesizeRepairGeneFromFailures,
|
||
|
|
autoDistillFromFailures,
|
||
|
|
validateSynthesizedGene,
|
||
|
|
shouldDistillFromFailures,
|
||
|
|
REPAIR_DISTILLED_ID_PREFIX,
|
||
|
|
FAILURE_DISTILLER_MIN_CAPSULES,
|
||
|
|
} = require('../src/gep/skillDistiller');
|
||
|
|
|
||
|
|
// Scratch directory for the currently running test. Assigned by
// setupTempEnv() and removed again by teardownTempEnv().
let tmpDir;
// Snapshot of the environment variables taken by setupTempEnv() so that
// teardownTempEnv() can put them back (or delete them if they were unset).
let savedEnv = {};
|
||
|
|
|
||
|
|
/**
 * Prepares an isolated on-disk sandbox for one test.
 *
 * Creates a unique directory under the OS temp dir (stored in `tmpDir`),
 * snapshots the environment variables listed below into `savedEnv`, points
 * the four path variables at sub-paths of the sandbox and creates the three
 * directories. Intended to be paired with teardownTempEnv().
 */
function setupTempEnv() {
  tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), 'evolver-bench-'));

  // Snapshot first: teardownTempEnv() restores exactly these keys.
  savedEnv = {
    GEP_ASSETS_DIR: process.env.GEP_ASSETS_DIR,
    EVOLUTION_DIR: process.env.EVOLUTION_DIR,
    MEMORY_DIR: process.env.MEMORY_DIR,
    MEMORY_GRAPH_PATH: process.env.MEMORY_GRAPH_PATH,
    SKILL_DISTILLER: process.env.SKILL_DISTILLER,
    FAILURE_DISTILLER: process.env.FAILURE_DISTILLER,
  };

  process.env.GEP_ASSETS_DIR = path.join(tmpDir, 'assets');
  process.env.EVOLUTION_DIR = path.join(tmpDir, 'evolution');
  process.env.MEMORY_DIR = path.join(tmpDir, 'memory');
  process.env.MEMORY_GRAPH_PATH = path.join(tmpDir, 'evolution', 'memory_graph.jsonl');

  // The gate tests in this file expect shouldDistillFromFailures() to return
  // true unless FAILURE_DISTILLER is explicitly 'false', so an ambient value
  // from the developer's shell must not leak in. The snapshot above still
  // lets teardownTempEnv() restore whatever was set before.
  delete process.env.FAILURE_DISTILLER;

  fs.mkdirSync(process.env.GEP_ASSETS_DIR, { recursive: true });
  fs.mkdirSync(process.env.EVOLUTION_DIR, { recursive: true });
  fs.mkdirSync(process.env.MEMORY_DIR, { recursive: true });
}
|
||
|
|
|
||
|
|
/**
 * Undoes setupTempEnv(): restores every environment variable recorded in
 * `savedEnv` (deleting those that were originally unset) and removes the
 * sandbox directory referenced by `tmpDir`.
 *
 * Directory removal is best-effort: a failure is reported as a warning and
 * never fails the test that just ran.
 */
function teardownTempEnv() {
  for (const [key, value] of Object.entries(savedEnv)) {
    if (value !== undefined) {
      process.env[key] = value;
    } else {
      // Assigning undefined would store the string "undefined"; delete instead.
      delete process.env[key];
    }
  }

  // Nothing to remove if setup never got as far as creating the directory.
  if (!tmpDir) return;

  try {
    fs.rmSync(tmpDir, { recursive: true, force: true });
  } catch (err) {
    // `force: true` already ignores a missing path, so anything caught here
    // is unexpected. Surface it without failing the test.
    console.warn('evolver-bench: could not remove temp dir ' + tmpDir + ': ' + (err && err.message));
  }
}
|
||
|
|
|
||
|
|
/**
 * Serializes `data` as pretty-printed (2-space) JSON and writes it to
 * `filePath` as UTF-8, creating any missing parent directories first.
 *
 * @param {string} filePath - Destination file path.
 * @param {*} data - JSON-serializable value to write.
 */
function writeJson(filePath, data) {
  fs.mkdirSync(path.dirname(filePath), { recursive: true });
  fs.writeFileSync(filePath, JSON.stringify(data, null, 2), 'utf8');
}
|
||
|
|
|
||
|
|
/**
 * Builds a failed-capsule fixture object.
 *
 * Outcome (`failed`, score 0.2) and blast radius (3 files / 50 lines) are
 * fixed; `created_at` is the current time as an ISO-8601 string.
 *
 * @param {string} id - Capsule id.
 * @param {string} gene - Id of the gene the capsule is attributed to.
 * @param {string[]} [trigger] - Trigger signals; falls back to ['error'].
 * @param {string} [failureReason] - Falls back to 'constraint violation'.
 * @param {string[]} [learningSignals] - Falls back to [].
 * @param {string[]} [violations] - Falls back to [].
 * @returns {object} A fresh capsule object (no structure shared between calls
 *   other than the arrays passed in by the caller).
 */
function makeFailedCapsule(id, gene, trigger, failureReason, learningSignals, violations) {
  return {
    type: 'Capsule',
    id,
    gene,
    // `||` (not `??`) is intentional: any falsy argument selects the default.
    trigger: trigger || ['error'],
    outcome: { status: 'failed', score: 0.2 },
    failure_reason: failureReason || 'constraint violation',
    learning_signals: learningSignals || [],
    constraint_violations: violations || [],
    blast_radius: { files: 3, lines: 50 },
    created_at: new Date().toISOString(),
  };
}
|
||
|
|
|
||
|
|
// ---------------------------------------------------------------------------
|
||
|
|
// Bench 1: Gene Selection Accuracy
|
||
|
|
// ---------------------------------------------------------------------------
|
||
|
|
describe('bench: gene selection accuracy', function () {
  // Candidate pool: four genes with disjoint `signals_match` vocabularies so
  // that every scenario below has exactly one intended winner.
  const BENCH_GENES = [
    { type: 'Gene', id: 'gene_repair', category: 'repair', signals_match: ['error', 'exception', 'failed', 'unstable'], strategy: ['fix'] },
    { type: 'Gene', id: 'gene_optimize', category: 'optimize', signals_match: ['protocol', 'gep', 'prompt', 'audit'], strategy: ['optimize'] },
    { type: 'Gene', id: 'gene_innovate', category: 'innovate', signals_match: ['user_feature_request', 'capability_gap', 'stable_success_plateau'], strategy: ['build'] },
    { type: 'Gene', id: 'gene_perf', category: 'optimize', signals_match: ['perf_bottleneck', 'latency', 'throughput', 'slow'], strategy: ['speed up'] },
  ];

  // Each scenario: input signals, the gene id expected to win, and a label
  // used in the failure message.
  const TEST_CASES = [
    { signals: ['error', 'exception'], expected: 'gene_repair', label: 'error signals -> repair' },
    { signals: ['protocol', 'audit'], expected: 'gene_optimize', label: 'protocol signals -> optimize' },
    { signals: ['user_feature_request', 'capability_gap'], expected: 'gene_innovate', label: 'feature request -> innovate' },
    { signals: ['perf_bottleneck', 'latency'], expected: 'gene_perf', label: 'perf signals -> perf optimize' },
    { signals: ['failed', 'unstable'], expected: 'gene_repair', label: 'failure signals -> repair' },
    { signals: ['gep', 'prompt'], expected: 'gene_optimize', label: 'prompt signals -> optimize' },
  ];

  it('achieves >= 80% selection accuracy on standard signal scenarios', function () {
    // Collect the scenarios that missed so a failing run says which ones.
    const misses = [];

    for (const tc of TEST_CASES) {
      // NOTE(review): effectivePopulationSize is passed through as-is;
      // presumably a large value keeps selection stable — confirm in selector.
      const result = selectGene(BENCH_GENES, tc.signals, { effectivePopulationSize: 100 });
      const selectedId = result.selected ? result.selected.id : null;
      if (selectedId !== tc.expected) {
        misses.push(`${tc.label} (got ${selectedId})`);
      }
    }

    const total = TEST_CASES.length;
    const accuracy = (total - misses.length) / total;
    const detail = misses.length > 0 ? `; missed: ${misses.join('; ')}` : '';
    assert.ok(
      accuracy >= 0.8,
      `Gene selection accuracy ${(accuracy * 100).toFixed(1)}% < 80% threshold${detail}`
    );
  });

  it('never selects a banned gene', function () {
    const banned = new Set(['gene_repair']);

    // Repeated because the outcome must hold on every call, not just once.
    for (let attempt = 0; attempt < 20; attempt++) {
      const result = selectGene(BENCH_GENES, ['error', 'exception'], {
        bannedGeneIds: banned,
        effectivePopulationSize: 100,
      });
      // Selecting nothing at all is acceptable; selecting a banned gene is not.
      if (result.selected) {
        assert.ok(!banned.has(result.selected.id), 'Selected banned gene: ' + result.selected.id);
      }
    }
  });
});
|
||
|
|
|
||
|
|
// ---------------------------------------------------------------------------
|
||
|
|
// Bench 2: Signal Extraction Recall
|
||
|
|
// ---------------------------------------------------------------------------
|
||
|
|
describe('bench: signal extraction recall', function () {
  it('extracts error signals from log-like input', function () {
    const signals = extractSignals({
      recentSessionTranscript: '',
      todayLog: '[error] Module X failed to load\n[error] Database connection timeout\nException in thread main',
      memorySnippet: '',
      userSnippet: '',
    });
    assert.ok(signals.includes('log_error'), 'Should detect log_error from [error] lines');
  });

  it('extracts feature request signals', function () {
    const signals = extractSignals({
      recentSessionTranscript: '',
      todayLog: '',
      memorySnippet: '',
      userSnippet: 'I want a dark mode toggle in the settings panel',
    });
    // Substring match: either signal name may appear inside a longer signal string.
    const hasFeatureSignal = signals.some(
      (s) => s.includes('user_feature_request') || s.includes('user_improvement_suggestion')
    );
    assert.ok(hasFeatureSignal, 'Should detect feature request from user input');
  });

  it('detects stagnation signals', function () {
    // Six identical successful events that changed nothing (zero blast radius).
    const recentEvents = Array.from({ length: 6 }, () => ({
      type: 'EvolutionEvent',
      outcome: { status: 'success', score: 0.85 },
      blast_radius: { files: 0, lines: 0 },
      signals: [],
    }));

    const signals = extractSignals({
      recentSessionTranscript: '',
      todayLog: '',
      memorySnippet: '',
      userSnippet: '',
      recentEvents,
    });

    const hasStagnation = signals.some(
      (s) => s.includes('empty_cycle') || s.includes('stagnation') || s.includes('steady_state')
    );
    // Deliberately lenient: an empty result is also accepted, so this only
    // fails when signals are produced and none of them is stagnation-related.
    assert.ok(hasStagnation || signals.length === 0, 'Stagnation detection assessed (may not trigger with minimal events)');
  });
});
|
||
|
|
|
||
|
|
// ---------------------------------------------------------------------------
|
||
|
|
// Bench 3: Failure Distillation Quality
|
||
|
|
// ---------------------------------------------------------------------------
|
||
|
|
describe('bench: failure distillation quality', function () {
  beforeEach(setupTempEnv);
  afterEach(teardownTempEnv);

  // Writes the failed-capsule store into the sandbox assets directory.
  function storeFailures(failures) {
    writeJson(path.join(process.env.GEP_ASSETS_DIR, 'failed_capsules.json'), { version: 1, failed_capsules: failures });
  }

  // Writes an empty gene store into the sandbox assets directory.
  function storeEmptyGenes() {
    writeJson(path.join(process.env.GEP_ASSETS_DIR, 'genes.json'), { version: 1, genes: [] });
  }

  // Builds `count` failed capsules for 'gene_repair' with ids f0..f{count-1};
  // each capsule gets its own copy of the array arguments.
  function repeatedFailures(count, trigger, failureReason, learningSignals, violations) {
    return Array.from({ length: count }, (_, i) =>
      makeFailedCapsule('f' + i, 'gene_repair', [...trigger], failureReason, [...learningSignals], [...violations])
    );
  }

  it('collects and groups failed capsules correctly', function () {
    storeFailures([
      makeFailedCapsule('f1', 'gene_repair', ['error', 'crash'], 'blast_radius_exceeded', ['problem:reliability'], ['blast_radius_exceeded']),
      makeFailedCapsule('f2', 'gene_repair', ['error', 'timeout'], 'blast_radius_exceeded', ['problem:reliability'], ['blast_radius_exceeded']),
      makeFailedCapsule('f3', 'gene_optimize', ['protocol'], 'validation_failed', ['problem:protocol'], ['validation_cmd_failed']),
    ]);

    const data = collectFailureDistillationData();
    assert.equal(data.failedCapsules.length, 3);
    assert.ok(Object.keys(data.grouped).length >= 2);
  });

  it('identifies high-frequency failure patterns', function () {
    storeFailures(repeatedFailures(
      5,
      ['error', 'memory_leak'],
      'blast_radius_exceeded: too many files changed',
      ['problem:reliability', 'risk:validation'],
      ['blast_radius_exceeded', 'max_files_exceeded']
    ));

    const data = collectFailureDistillationData();
    const analysis = analyzeFailurePatterns(data);
    assert.ok(analysis.high_frequency_failures.length >= 1, 'Should detect high-frequency failure pattern');
    assert.ok(analysis.high_frequency_failures[0].count >= 2);
  });

  it('synthesizes a repair gene from failure patterns', function () {
    storeFailures(repeatedFailures(
      6,
      ['error', 'crash'],
      'blast_radius_exceeded',
      ['problem:reliability'],
      ['blast_radius_exceeded']
    ));

    const data = collectFailureDistillationData();
    const analysis = analyzeFailurePatterns(data);
    const gene = synthesizeRepairGeneFromFailures(data, analysis, []);

    assert.ok(gene, 'Should produce a repair gene');
    assert.equal(gene.type, 'Gene');
    assert.equal(gene.category, 'repair');
    assert.ok(gene.id.startsWith(REPAIR_DISTILLED_ID_PREFIX) || gene.id.startsWith('gene_distilled_'), 'Gene id should have repair prefix');
    assert.ok(gene.strategy.length >= 4, 'Strategy should have guard steps');
    assert.ok(
      gene.strategy.some((step) => step.includes('GUARD') || step.includes('guard')),
      'Should include guard steps'
    );
  });

  it('autoDistillFromFailures produces a gene when threshold met', function () {
    storeFailures(repeatedFailures(
      6,
      ['error', 'crash'],
      'blast_radius_exceeded',
      ['problem:reliability'],
      ['blast_radius_exceeded']
    ));
    storeEmptyGenes();

    const result = autoDistillFromFailures();
    assert.ok(result.ok, 'autoDistillFromFailures should succeed: ' + (result.reason || ''));
    assert.ok(result.gene);
    assert.equal(result.source, 'failure_distillation');

    // The distilled gene must also have been written back to genes.json.
    const stored = JSON.parse(fs.readFileSync(path.join(process.env.GEP_ASSETS_DIR, 'genes.json'), 'utf8'));
    assert.ok(stored.genes.some((g) => g.id === result.gene.id), 'Gene should be persisted');
  });

  it('returns insufficient_failures when below threshold', function () {
    storeFailures([
      makeFailedCapsule('f1', 'gene_repair', ['error'], 'blast_radius_exceeded', [], []),
    ]);

    const result = autoDistillFromFailures();
    assert.equal(result.ok, false);
    assert.equal(result.reason, 'insufficient_failures');
  });

  it('idempotent skip on repeated calls with same data', function () {
    storeFailures(repeatedFailures(6, ['error'], 'blast_radius', ['problem:reliability'], ['blast_radius']));
    storeEmptyGenes();

    const first = autoDistillFromFailures();
    assert.ok(first.ok);

    // Same inputs again: the second run must decline rather than duplicate.
    const second = autoDistillFromFailures();
    assert.equal(second.ok, false);
    assert.equal(second.reason, 'idempotent_skip');
  });
});
|
||
|
|
|
||
|
|
// ---------------------------------------------------------------------------
|
||
|
|
// Bench 4: Anti-pattern Avoidance
|
||
|
|
// ---------------------------------------------------------------------------
|
||
|
|
describe('bench: anti-pattern avoidance', function () {
  it('genes with anti-patterns score lower than clean genes', function () {
    // Both genes match the same signal; they differ only in recorded history.
    const riskyGene = {
      type: 'Gene',
      id: 'gene_risky',
      category: 'repair',
      signals_match: ['error'],
      anti_patterns: [
        { mode: 'hard', learning_signals: ['problem:reliability'] },
        { mode: 'hard', learning_signals: ['problem:reliability'] },
      ],
      strategy: ['fix'],
    };
    const safeGene = {
      type: 'Gene',
      id: 'gene_safe',
      category: 'repair',
      signals_match: ['error'],
      learning_history: [
        { outcome: 'success', mode: 'none' },
        { outcome: 'success', mode: 'none' },
      ],
      strategy: ['fix safely'],
    };

    // The risky gene is listed first so list order alone cannot make the
    // safe gene win.
    const result = selectGene([riskyGene, safeGene], ['error'], { effectivePopulationSize: 100 });
    assert.ok(result.selected);
    assert.equal(result.selected.id, 'gene_safe', 'Should prefer gene without anti-patterns');
  });
});
|
||
|
|
|
||
|
|
// ---------------------------------------------------------------------------
|
||
|
|
// Bench 5: shouldDistillFromFailures gate
|
||
|
|
// ---------------------------------------------------------------------------
|
||
|
|
describe('bench: shouldDistillFromFailures gate', function () {
  beforeEach(setupTempEnv);
  afterEach(teardownTempEnv);

  it('returns false when FAILURE_DISTILLER=false', function () {
    process.env.FAILURE_DISTILLER = 'false';
    try {
      assert.equal(shouldDistillFromFailures(), false);
    } finally {
      // Undo the override even when the assertion above throws.
      delete process.env.FAILURE_DISTILLER;
    }
  });

  it('returns false when not enough failures', function () {
    writeJson(path.join(process.env.GEP_ASSETS_DIR, 'failed_capsules.json'), {
      version: 1,
      failed_capsules: [makeFailedCapsule('f1', 'g', ['e'], 'reason', [], [])],
    });
    assert.equal(shouldDistillFromFailures(), false);
  });

  it('returns true when enough failures and no recent distillation', function () {
    // One more than the exported minimum so the threshold is clearly exceeded.
    const failures = Array.from({ length: FAILURE_DISTILLER_MIN_CAPSULES + 1 }, (_, i) =>
      makeFailedCapsule('f' + i, 'gene_repair', ['error'], 'reason', [], [])
    );
    writeJson(path.join(process.env.GEP_ASSETS_DIR, 'failed_capsules.json'), { version: 1, failed_capsules: failures });
    assert.equal(shouldDistillFromFailures(), true);
  });
});
|