/**
 * Topic Classifier for OpenClaw Sessions
 *
 * Analyzes session transcript content to:
 * - Match against existing topics
 * - Detect when existing topics don't fit well
 * - Suggest new topic names based on content patterns
 * - Maintain a discovered-topics.json file for learned topics
 *
 * @module topic-classifier
 */
|
|
|
|
const fs = require("fs");
|
|
const path = require("path");
|
|
const { CONFIG: APP_CONFIG } = require("../src/config");
|
|
|
|
// Tunable defaults for the classifier (also exported for testing/tuning).
const CONFIG = {
  // Key terms must reach at least this TF-IDF score to be reported.
  minTermScore: 0.1,
  // A best match below this confidence triggers new-topic suggestions.
  matchThreshold: 0.3,
  // A term must occur at least this many times to be reported as a key term.
  minTermFrequency: 2,
  // JSON state file that stores discovered topics.
  discoveredTopicsPath: path.join(APP_CONFIG.paths.state, "discovered-topics.json"),
  // Upper bound on the number of suggestions returned per classification.
  maxSuggestions: 3,
};
|
|
|
|
// Words ignored by the tokenizer: common English function words, chat filler,
// and a handful of terms too generic to signal a topic in tech sessions.
// Each word is listed once; a Set would collapse repeats anyway.
const STOP_WORDS = new Set([
  // Articles and conjunctions
  "a", "an", "the", "and", "or", "but",
  // Prepositions
  "in", "on", "at", "to", "for", "of", "with", "by", "from", "up",
  "about", "into", "through", "during", "before", "after", "above", "below", "between", "under",
  // Adverbs and question words
  "again", "further", "then", "once", "here", "there", "when", "where", "why", "how",
  // Quantifiers
  "all", "each", "few", "more", "most", "other", "some", "such",
  // Negation and degree
  "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very",
  // Contraction fragments and modals
  "s", "t", "can", "will", "just", "don", "should", "now",
  // Pronouns
  "i", "me", "my", "myself", "we", "our", "ours", "you", "your", "yours",
  "he", "him", "his", "she", "her", "hers", "it", "its", "they", "them", "their", "theirs",
  // Interrogatives and demonstratives
  "what", "which", "who", "whom", "this", "that", "these", "those",
  // Auxiliary verbs
  "am", "is", "are", "was", "were", "be", "been", "being",
  "have", "has", "had", "having", "do", "does", "did", "doing",
  "would", "could", "ought",
  // Common verbs
  "let", "like", "need", "want", "got", "get", "make", "made",
  "see", "look", "think", "know", "take", "come", "go", "say", "said", "tell", "told", "ask",
  "use", "used", "find", "give", "gave",
  // Conversational filler
  "yes", "ok", "okay", "yeah", "sure", "right", "well", "also",
  "really", "actually", "basically", "probably", "maybe",
  // Tech-common words that are too generic
  "file", "code", "run", "check", "help", "please", "thanks", "hello", "hi", "hey",
  "good", "great", "nice", "cool", "awesome", "perfect",
]);
|
|
|
|
// Known topic patterns for seeding - maps keywords to topic names
|
|
const TOPIC_PATTERNS = {
|
|
// Development
|
|
git: "version-control",
|
|
github: "version-control",
|
|
commit: "version-control",
|
|
branch: "version-control",
|
|
merge: "version-control",
|
|
pull: "version-control",
|
|
push: "version-control",
|
|
|
|
debug: "debugging",
|
|
error: "debugging",
|
|
bug: "debugging",
|
|
fix: "debugging",
|
|
stack: "debugging",
|
|
trace: "debugging",
|
|
exception: "debugging",
|
|
|
|
test: "testing",
|
|
unittest: "testing",
|
|
jest: "testing",
|
|
pytest: "testing",
|
|
coverage: "testing",
|
|
|
|
deploy: "deployment",
|
|
production: "deployment",
|
|
staging: "deployment",
|
|
ci: "deployment",
|
|
cd: "deployment",
|
|
pipeline: "deployment",
|
|
|
|
api: "api-integration",
|
|
endpoint: "api-integration",
|
|
rest: "api-integration",
|
|
graphql: "api-integration",
|
|
webhook: "api-integration",
|
|
|
|
database: "database",
|
|
sql: "database",
|
|
postgres: "database",
|
|
mysql: "database",
|
|
mongodb: "database",
|
|
query: "database",
|
|
|
|
docker: "containers",
|
|
kubernetes: "containers",
|
|
k8s: "containers",
|
|
container: "containers",
|
|
pod: "containers",
|
|
|
|
aws: "cloud-infra",
|
|
gcp: "cloud-infra",
|
|
azure: "cloud-infra",
|
|
terraform: "cloud-infra",
|
|
cloudformation: "cloud-infra",
|
|
|
|
// Communication
|
|
slack: "slack-integration",
|
|
channel: "slack-integration",
|
|
message: "messaging",
|
|
email: "email",
|
|
notification: "notifications",
|
|
|
|
// Automation
|
|
cron: "scheduling",
|
|
schedule: "scheduling",
|
|
timer: "scheduling",
|
|
job: "scheduling",
|
|
|
|
script: "automation",
|
|
automate: "automation",
|
|
workflow: "automation",
|
|
|
|
// Research
|
|
research: "research",
|
|
search: "research",
|
|
wikipedia: "research",
|
|
lookup: "research",
|
|
|
|
// Finance
|
|
finance: "finance",
|
|
investment: "finance",
|
|
stock: "finance",
|
|
portfolio: "finance",
|
|
budget: "finance",
|
|
|
|
// System
|
|
config: "configuration",
|
|
settings: "configuration",
|
|
setup: "configuration",
|
|
install: "setup",
|
|
|
|
// Writing
|
|
document: "documentation",
|
|
readme: "documentation",
|
|
docs: "documentation",
|
|
write: "writing",
|
|
draft: "writing",
|
|
|
|
// AI/ML
|
|
model: "ai-ml",
|
|
claude: "ai-ml",
|
|
openai: "ai-ml",
|
|
gpt: "ai-ml",
|
|
llm: "ai-ml",
|
|
prompt: "prompt-engineering",
|
|
|
|
// UI
|
|
dashboard: "dashboard",
|
|
ui: "ui-development",
|
|
frontend: "ui-development",
|
|
css: "ui-development",
|
|
html: "ui-development",
|
|
react: "ui-development",
|
|
};
|
|
|
|
/**
 * Tokenize text into lowercase word tokens.
 *
 * Fenced code blocks, inline code spans and URLs are removed before
 * splitting. Hyphens inside a word are kept ("end-to-end"), but hyphens at
 * the edges of a token are trimmed so that markdown rules ("---") and CLI
 * flags ("--verbose") do not produce junk tokens.
 *
 * @param {string} text - Raw text to tokenize
 * @returns {string[]} Lowercase tokens, 3-29 characters long, excluding stop
 *   words and purely numeric tokens. Empty for non-string or empty input.
 */
function tokenize(text) {
  if (!text || typeof text !== "string") return [];

  const cleaned = text
    .toLowerCase()
    // Remove fenced code blocks
    .replace(/```[\s\S]*?```/g, " ")
    // Remove inline code
    .replace(/`[^`]+`/g, " ")
    // Remove URLs
    .replace(/https?:\/\/\S+/g, " ")
    // Remove special characters but keep hyphens
    .replace(/[^a-z0-9\s-]/g, " ");

  const tokens = [];
  for (const raw of cleaned.split(/\s+/)) {
    // Trim edge hyphens first so the length and stop-word checks see the word itself.
    const token = raw.replace(/^-+|-+$/g, "");
    const hasValidLength = token.length > 2 && token.length < 30;
    if (hasValidLength && !STOP_WORDS.has(token) && !/^\d+$/.test(token)) {
      tokens.push(token);
    }
  }
  return tokens;
}
|
|
|
|
/**
 * Compute the normalized term frequency of every token in a document.
 *
 * @param {string[]} tokens - Tokens of one document
 * @returns {Map<string, number>} Each distinct token mapped to its share of
 *   the document (occurrences / total tokens), in first-seen order
 */
function calculateTF(tokens) {
  // Guard against division by zero for an empty document.
  const documentLength = tokens.length || 1;

  const occurrences = tokens.reduce(
    (seen, token) => seen.set(token, (seen.get(token) || 0) + 1),
    new Map(),
  );

  // Normalize raw counts by document length.
  return new Map(Array.from(occurrences, ([term, count]) => [term, count / documentLength]));
}
|
|
|
|
/**
 * Approximate inverse document frequency for a single document.
 *
 * No corpus is available, so rarity inside the document is used as a proxy:
 * the lower a term's frequency, the higher its score. Terms that are keys of
 * TOPIC_PATTERNS get their score doubled. Negative scores are clamped to 0.
 *
 * @param {Map<string, number>} tf - Term frequency map
 * @param {number} vocabSize - Size of vocabulary
 * @returns {Map<string, number>} IDF scores (never negative)
 */
function calculateIDF(tf, vocabSize) {
  const idf = new Map();

  tf.forEach((freq, term) => {
    // Own-property check: a plain `TOPIC_PATTERNS[term]` lookup would also
    // resolve inherited members, so the token "constructor" got the boost.
    const isKnownKeyword = Object.prototype.hasOwnProperty.call(TOPIC_PATTERNS, term);
    const patternBoost = isKnownKeyword ? 2.0 : 1.0;
    // Simple IDF approximation: rarer terms get higher scores
    const score = Math.log(vocabSize / (1 + freq * vocabSize)) * patternBoost;
    idf.set(term, Math.max(0, score));
  });

  return idf;
}
|
|
|
|
/**
 * Extract key terms from text using TF-IDF scoring.
 *
 * A term is reported only when it occurs at least CONFIG.minTermFrequency
 * times and its TF-IDF score reaches CONFIG.minTermScore.
 *
 * @param {string} text - Text to analyze
 * @returns {Array<{term: string, score: number, count: number}>} Terms
 *   ranked by score, highest first; `count` is the raw occurrence count
 */
function extractKeyTerms(text) {
  const tokens = tokenize(text);
  if (tokens.length === 0) return [];

  // Count occurrences once instead of re-scanning all tokens per unique term.
  const rawCounts = new Map();
  for (const token of tokens) {
    rawCounts.set(token, (rawCounts.get(token) || 0) + 1);
  }

  const tf = calculateTF(tokens);
  const idf = calculateIDF(tf, tf.size);

  const ranked = [];
  tf.forEach((tfScore, term) => {
    const count = rawCounts.get(term) || 0;
    const score = tfScore * (idf.get(term) || 0);

    // Only include terms that meet both minimum thresholds
    if (count >= CONFIG.minTermFrequency && score >= CONFIG.minTermScore) {
      ranked.push({ term, score, count });
    }
  });

  // Sort by score descending
  return ranked.sort((a, b) => b.score - a.score);
}
|
|
|
|
/**
 * Score text against a list of existing topics.
 *
 * Each topic collects 0.3 per text token that overlaps (substring match in
 * either direction) with one of the topic's own tokens, plus 0.5 per text
 * token whose TOPIC_PATTERNS entry names the topic. Topic names are compared
 * case-insensitively, so "Version-Control" receives pattern credit too.
 * The sum is divided by log2(tokenCount + 2) and capped at 1.
 *
 * @param {string} text - Text to match
 * @param {string[]} existingTopics - List of existing topic names; non-array
 *   input yields no matches and non-string entries are skipped
 * @returns {Array<{topic: string, confidence: number}>} Topics with a
 *   positive score, highest confidence first
 */
function matchTopics(text, existingTopics) {
  if (!Array.isArray(existingTopics) || existingTopics.length === 0) return [];

  const tokens = tokenize(text);

  // Count pattern hits per mapped topic once, rather than per existing topic.
  const patternHits = new Map();
  for (const token of tokens) {
    // Own-property check keeps inherited members (e.g. "constructor") out.
    if (!Object.prototype.hasOwnProperty.call(TOPIC_PATTERNS, token)) continue;
    const mappedTopic = TOPIC_PATTERNS[token];
    patternHits.set(mappedTopic, (patternHits.get(mappedTopic) || 0) + 1);
  }

  // Log scale avoids penalizing long texts too much.
  const lengthNorm = Math.log2(tokens.length + 2);
  const matches = new Map();

  for (const topic of existingTopics) {
    if (typeof topic !== "string") continue;

    let score = 0;

    // Direct token match (substring in either direction)
    for (const topicToken of tokenize(topic)) {
      const overlap = tokens.filter((t) => t.includes(topicToken) || topicToken.includes(t)).length;
      score += overlap * 0.3;
    }

    // Pattern-based matching; mapped names are lowercase, so normalize the topic
    score += (patternHits.get(topic.toLowerCase()) || 0) * 0.5;

    if (score > 0) {
      matches.set(topic, Math.min(1, score / lengthNorm));
    }
  }

  return Array.from(matches, ([topic, confidence]) => ({ topic, confidence })).sort(
    (a, b) => b.confidence - a.confidence,
  );
}
|
|
|
|
/**
 * Generate new topic suggestions from ranked key terms.
 *
 * Three strategies run in order, and at most CONFIG.maxSuggestions names are
 * returned:
 *  1. Map the top 15 terms through TOPIC_PATTERNS.
 *  2. Build compound names (e.g. "error-handling") when both halves of a
 *     known pair appear among the top 5 terms.
 *  3. Fall back to the top 5 terms themselves when they score above 0.15 and
 *     are longer than 4 characters.
 *
 * @param {Array<{term: string, score: number}>} keyTerms - Key terms from text
 * @param {string[]} existingTopics - Topics to avoid suggesting (compared
 *   case-insensitively)
 * @returns {string[]} Suggested new topic names
 */
function generateSuggestions(keyTerms, existingTopics) {
  const terms = Array.isArray(keyTerms) ? keyTerms : [];
  const known = Array.isArray(existingTopics) ? existingTopics : [];
  const existingSet = new Set(
    known.filter((t) => typeof t === "string").map((t) => t.toLowerCase()),
  );
  const suggestions = new Set();

  // Strategy 1: Use known patterns for top terms
  for (const { term } of terms.slice(0, 15)) {
    // Own-property check: a plain lookup resolves inherited members, which
    // turned the key term "constructor" into a function-valued suggestion.
    if (!Object.prototype.hasOwnProperty.call(TOPIC_PATTERNS, term)) continue;
    const mapped = TOPIC_PATTERNS[term];
    if (mapped && !existingSet.has(mapped)) {
      suggestions.add(mapped);
    }
  }

  // Strategy 2: Create compound topics from top co-occurring terms
  if (terms.length >= 2 && suggestions.size < CONFIG.maxSuggestions) {
    const topTerms = terms.slice(0, 5).map((t) => t.term);

    const pairs = [
      ["api", "integration"],
      ["code", "review"],
      ["data", "analysis"],
      ["error", "handling"],
      ["file", "management"],
      ["memory", "optimization"],
      ["performance", "tuning"],
      ["security", "audit"],
      ["system", "design"],
      ["user", "interface"],
    ];

    for (const [a, b] of pairs) {
      const bothPresent = topTerms.some((t) => t.includes(a)) && topTerms.some((t) => t.includes(b));
      const compound = `${a}-${b}`;
      if (bothPresent && !existingSet.has(compound)) {
        suggestions.add(compound);
      }
    }
  }

  // Strategy 3: Use top-scoring terms as-is if they are descriptive enough
  if (suggestions.size < CONFIG.maxSuggestions) {
    for (const { term, score } of terms.slice(0, 5)) {
      if (score > 0.15 && term.length > 4 && !existingSet.has(term)) {
        suggestions.add(term);
      }
    }
  }

  return Array.from(suggestions).slice(0, CONFIG.maxSuggestions);
}
|
|
|
|
/**
 * Load discovered topics from the state file at CONFIG.discoveredTopicsPath.
 *
 * A missing file silently yields an empty state. An unreadable file, invalid
 * JSON, or JSON without an object-valued `topics` field is logged and also
 * yields an empty state, so callers can always index `data.topics`.
 *
 * @returns {{version: number, topics: Object, lastUpdated: (string|null)}}
 *   Discovered topics data
 */
function loadDiscoveredTopics() {
  const isRecord = (value) => value !== null && typeof value === "object" && !Array.isArray(value);

  try {
    const parsed = JSON.parse(fs.readFileSync(CONFIG.discoveredTopicsPath, "utf8"));
    if (isRecord(parsed) && isRecord(parsed.topics)) {
      return parsed;
    }
    console.error("Failed to load discovered topics:", "unexpected file structure");
  } catch (e) {
    // A missing state file is the normal first-run case, not an error.
    if (e.code !== "ENOENT") {
      console.error("Failed to load discovered topics:", e.message);
    }
  }

  return {
    version: 1,
    topics: {},
    lastUpdated: null,
  };
}
|
|
|
|
/**
 * Save discovered topics to the state file at CONFIG.discoveredTopicsPath.
 *
 * Sets `data.lastUpdated` on the passed object to the current ISO timestamp
 * before writing. The parent directory is created if missing, so the first
 * save on a fresh install succeeds. Failures are logged, never thrown.
 *
 * @param {Object} data - Topics data to save (its `lastUpdated` is mutated)
 */
function saveDiscoveredTopics(data) {
  try {
    data.lastUpdated = new Date().toISOString();
    // writeFileSync does not create missing directories.
    fs.mkdirSync(path.dirname(CONFIG.discoveredTopicsPath), { recursive: true });
    fs.writeFileSync(CONFIG.discoveredTopicsPath, JSON.stringify(data, null, 2));
  } catch (e) {
    console.error("Failed to save discovered topics:", e.message);
  }
}
|
|
|
|
/**
 * Record new topic suggestions in the discovered-topics state file.
 *
 * For every suggestion the occurrence counter is incremented, `lastSeen` is
 * refreshed, and the session key is remembered (only the 10 most recent
 * sessions are kept). The state is loaded once and saved once.
 *
 * @param {string[]} suggestions - New topic suggestions; non-array or empty
 *   input is a no-op, non-string entries are skipped
 * @param {string} sessionKey - Source session identifier
 */
function updateDiscoveredTopics(suggestions, sessionKey) {
  if (!Array.isArray(suggestions) || suggestions.length === 0) return;

  const data = loadDiscoveredTopics();
  // One timestamp per update keeps firstSeen/lastSeen consistent.
  const now = new Date().toISOString();

  for (const topic of suggestions) {
    // "__proto__" as a key would replace the prototype instead of adding an entry.
    if (typeof topic !== "string" || topic === "__proto__") continue;

    // Own-property check: `data.topics["constructor"]` otherwise resolves to
    // the inherited Object constructor and the update corrupts/throws.
    const existing = Object.prototype.hasOwnProperty.call(data.topics, topic)
      ? data.topics[topic]
      : null;

    const entry =
      existing !== null && typeof existing === "object"
        ? existing
        : { firstSeen: now, occurrences: 0, sessions: [] };
    data.topics[topic] = entry;

    entry.occurrences = (Number(entry.occurrences) || 0) + 1;
    entry.lastSeen = now;

    if (!Array.isArray(entry.sessions)) {
      entry.sessions = [];
    }
    if (!entry.sessions.includes(sessionKey)) {
      entry.sessions.push(sessionKey);
      // Keep only last 10 sessions
      if (entry.sessions.length > 10) {
        entry.sessions.splice(0, entry.sessions.length - 10);
      }
    }
  }

  saveDiscoveredTopics(data);
}
|
|
|
|
/**
 * Flatten one transcript entry (string, `{text}`, or `{message: {content}}`)
 * into plain text; anything else yields an empty string.
 */
function transcriptEntryToText(entry) {
  if (typeof entry === "string") return entry;
  if (!entry || typeof entry !== "object") return "";
  if (entry.text) return entry.text;

  const content = entry.message?.content;
  if (typeof content === "string") return content;
  if (Array.isArray(content)) {
    return content
      .filter((part) => part?.type === "text")
      .map((part) => part.text || "")
      .join(" ");
  }
  return "";
}

/**
 * Main classification function.
 * Analyzes transcript content to match existing topics and suggest new ones.
 *
 * Suggestions are generated only when no existing topic matches with at
 * least CONFIG.matchThreshold confidence. They are persisted when
 * `options.persist` is not `false` and `options.sessionKey` is set.
 * Transcripts shorter than 20 characters produce an empty result.
 *
 * @param {string|Array} transcript - Session transcript (string or array of messages)
 * @param {string[]} existingTopics - List of existing topic names
 * @param {Object} options - Optional configuration
 * @param {string} options.sessionKey - Session identifier for tracking
 * @param {boolean} options.persist - Whether to persist discovered topics (default: true)
 * @returns {{matched: Array<{topic: string, confidence: number}>, suggested: string[], keyTerms: Array, confidence: number}}
 *   Up to 5 matches, the suggestions, up to 10 key terms, and the best
 *   match confidence (0 when nothing matched)
 */
function classifyAndSuggestTopics(transcript, existingTopics = [], options = {}) {
  // Default parameters only cover `undefined`; tolerate `null` as well.
  const topics = Array.isArray(existingTopics) ? existingTopics : [];
  const opts = options ?? {};

  // Normalize transcript to text
  let text = "";
  if (Array.isArray(transcript)) {
    text = transcript.map(transcriptEntryToText).join("\n");
  } else if (typeof transcript === "string") {
    text = transcript;
  }

  if (!text || text.length < 20) {
    // Same shape as the regular result so callers can always read `confidence`.
    return { matched: [], suggested: [], keyTerms: [], confidence: 0 };
  }

  const keyTerms = extractKeyTerms(text);
  const matched = matchTopics(text, topics);

  // Suggest new topics only when nothing matches well enough
  const bestMatch = matched[0];
  const needsSuggestions = !bestMatch || bestMatch.confidence < CONFIG.matchThreshold;

  let suggested = [];
  if (needsSuggestions) {
    suggested = generateSuggestions(keyTerms, topics);

    // Persist discovered topics if enabled
    if (opts.persist !== false && suggested.length > 0 && opts.sessionKey) {
      updateDiscoveredTopics(suggested, opts.sessionKey);
    }
  }

  return {
    matched: matched.slice(0, 5),
    suggested,
    keyTerms: keyTerms.slice(0, 10),
    confidence: bestMatch?.confidence || 0,
  };
}
|
|
|
|
/**
 * List every discovered topic, most frequently seen first.
 *
 * @returns {Array<{name: string, occurrences: number, sessions: number, firstSeen: string, lastSeen: string}>}
 *   One summary per stored topic; `sessions` is the number of remembered
 *   session keys (0 when none are stored)
 */
function getDiscoveredTopics() {
  const { topics } = loadDiscoveredTopics();

  const summaries = [];
  for (const name of Object.keys(topics)) {
    const { occurrences, sessions, firstSeen, lastSeen } = topics[name];
    summaries.push({
      name,
      occurrences,
      sessions: sessions?.length || 0,
      firstSeen,
      lastSeen,
    });
  }

  // Highest occurrence count first
  summaries.sort((left, right) => right.occurrences - left.occurrences);
  return summaries;
}
|
|
|
|
/**
 * Promote a discovered topic to the official topic list.
 *
 * Removes the topic from the discovered-topics state file and returns its
 * stored data (plus `name`) for external handling. Only topics actually
 * stored in the state count: names that merely collide with inherited
 * object members ("toString", "constructor", ...) are treated as not found
 * and leave the state file untouched.
 *
 * @param {string} topicName - Topic to promote
 * @returns {Object|null} Topic data or null if not found
 */
function promoteDiscoveredTopic(topicName) {
  const data = loadDiscoveredTopics();

  const isStored =
    Object.prototype.hasOwnProperty.call(data.topics, topicName) && Boolean(data.topics[topicName]);
  if (!isStored) {
    return null;
  }

  const topicData = { ...data.topics[topicName], name: topicName };
  delete data.topics[topicName];
  saveDiscoveredTopics(data);
  return topicData;
}
|
|
|
|
// Export public API
|
|
module.exports = {
|
|
classifyAndSuggestTopics,
|
|
getDiscoveredTopics,
|
|
promoteDiscoveredTopic,
|
|
extractKeyTerms,
|
|
matchTopics,
|
|
// Export config for testing/tuning
|
|
CONFIG,
|
|
TOPIC_PATTERNS,
|
|
};
|