/** * Topic Classifier for OpenClaw Sessions * * Analyzes session transcript content to: * - Match against existing topics * - Detect when existing topics don't fit well * - Suggest new topic names based on content patterns * - Maintain a discovered-topics.json file for learned topics * * @module topic-classifier */ const fs = require("fs"); const path = require("path"); const { CONFIG: APP_CONFIG } = require("../src/config"); // Default config const CONFIG = { // Minimum TF-IDF score to consider a term significant minTermScore: 0.1, // Minimum topic match confidence to consider a match "good" matchThreshold: 0.3, // Minimum occurrences for a term to be considered minTermFrequency: 2, // Path to discovered topics state file discoveredTopicsPath: path.join(APP_CONFIG.paths.state, "discovered-topics.json"), // Maximum suggested topics per classification maxSuggestions: 3, }; // Stop words to filter out (common English words) const STOP_WORDS = new Set([ "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for", "of", "with", "by", "from", "up", "about", "into", "through", "during", "before", "after", "above", "below", "between", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now", "i", "me", "my", "myself", "we", "our", "ours", "you", "your", "yours", "he", "him", "his", "she", "her", "hers", "it", "its", "they", "them", "their", "theirs", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "would", "could", "ought", "let", "like", "need", "want", "got", "get", "make", "made", "see", "look", "think", "know", "take", "come", "go", "say", "said", "tell", "told", "ask", "use", "used", "find", "give", "gave", "yes", "no", "ok", "okay", "yeah", "sure", "right", "well", "also", "just", "really", "actually", "basically", "probably", "maybe", // Tech-common words that are too generic "file", "code", "run", "check", "help", "please", "thanks", "hello", "hi", "hey", "good", "great", "nice", "cool", "awesome", "perfect", ]); // Known topic patterns for seeding - maps keywords to topic names const TOPIC_PATTERNS = { // Development git: "version-control", github: "version-control", commit: "version-control", branch: "version-control", merge: "version-control", pull: "version-control", push: "version-control", debug: "debugging", error: "debugging", bug: "debugging", fix: "debugging", stack: "debugging", trace: "debugging", exception: "debugging", test: "testing", unittest: "testing", jest: "testing", pytest: "testing", coverage: "testing", deploy: "deployment", production: "deployment", staging: "deployment", ci: "deployment", cd: "deployment", pipeline: "deployment", api: "api-integration", endpoint: "api-integration", rest: "api-integration", graphql: "api-integration", webhook: "api-integration", database: "database", sql: "database", postgres: "database", mysql: "database", mongodb: "database", query: "database", docker: "containers", kubernetes: "containers", k8s: "containers", container: "containers", pod: "containers", aws: "cloud-infra", gcp: "cloud-infra", azure: "cloud-infra", terraform: "cloud-infra", cloudformation: "cloud-infra", // Communication slack: "slack-integration", channel: "slack-integration", message: "messaging", email: "email", notification: "notifications", // Automation cron: "scheduling", schedule: "scheduling", timer: "scheduling", job: "scheduling", script: "automation", automate: "automation", workflow: "automation", // Research research: "research", search: "research", wikipedia: "research", lookup: "research", // Finance finance: "finance", investment: "finance", stock: "finance", portfolio: "finance", budget: "finance", // System config: "configuration", settings: "configuration", setup: "configuration", install: "setup", // Writing document: "documentation", readme: "documentation", docs: "documentation", write: "writing", draft: "writing", // AI/ML model: "ai-ml", claude: "ai-ml", openai: "ai-ml", gpt: "ai-ml", llm: "ai-ml", prompt: "prompt-engineering", // UI dashboard: "dashboard", ui: "ui-development", frontend: "ui-development", css: "ui-development", html: "ui-development", react: "ui-development", }; /** * Tokenize text into words * @param {string} text - Raw text to tokenize * @returns {string[]} Array of lowercase tokens */ function tokenize(text) { if (!text || typeof text !== "string") return []; return ( text .toLowerCase() // Remove code blocks .replace(/```[\s\S]*?```/g, " ") // Remove inline code .replace(/`[^`]+`/g, " ") // Remove URLs .replace(/https?:\/\/\S+/g, " ") // Remove special characters but keep hyphens in words .replace(/[^a-z0-9\s-]/g, " ") // Split on whitespace .split(/\s+/) // Filter valid tokens .filter( (token) => token.length > 2 && token.length < 30 && !STOP_WORDS.has(token) && !/^\d+$/.test(token), ) ); } /** * Calculate term frequency for a document * @param {string[]} tokens - Array of tokens * @returns {Map} Term frequency map */ function calculateTF(tokens) { const tf = new Map(); const total = tokens.length || 1; tokens.forEach((token) => { tf.set(token, (tf.get(token) || 0) + 1); }); // Normalize by document length tf.forEach((count, term) => { tf.set(term, count / total); }); return tf; } /** * Calculate inverse document frequency using corpus statistics * For a single document, we use term rarity as a proxy * @param {Map} tf - Term frequency map * @param {number} vocabSize - Size of vocabulary * @returns {Map} IDF scores */ function calculateIDF(tf, vocabSize) { const idf = new Map(); tf.forEach((freq, term) => { // Boost terms that appear in known patterns const patternBoost = TOPIC_PATTERNS[term] ? 2.0 : 1.0; // Simple IDF approximation: rarer terms get higher scores const score = Math.log(vocabSize / (1 + freq * vocabSize)) * patternBoost; idf.set(term, Math.max(0, score)); }); return idf; } /** * Extract key terms using TF-IDF * @param {string} text - Text to analyze * @returns {Array<{term: string, score: number}>} Ranked terms */ function extractKeyTerms(text) { const tokens = tokenize(text); if (tokens.length === 0) return []; const tf = calculateTF(tokens); const idf = calculateIDF(tf, tf.size); const tfidf = []; tf.forEach((tfScore, term) => { const idfScore = idf.get(term) || 0; const score = tfScore * idfScore; // Only include terms that meet minimum thresholds const rawCount = tokens.filter((t) => t === term).length; if (rawCount >= CONFIG.minTermFrequency && score >= CONFIG.minTermScore) { tfidf.push({ term, score, count: rawCount }); } }); // Sort by score descending return tfidf.sort((a, b) => b.score - a.score); } /** * Match text against existing topics * @param {string} text - Text to match * @param {string[]} existingTopics - List of existing topic names * @returns {Array<{topic: string, confidence: number}>} Matched topics with confidence */ function matchTopics(text, existingTopics) { const tokens = tokenize(text); const matches = new Map(); // Score each existing topic existingTopics.forEach((topic) => { let score = 0; const topicTokens = tokenize(topic); // Direct token match topicTokens.forEach((tt) => { const count = tokens.filter((t) => t === tt || t.includes(tt) || tt.includes(t)).length; score += count * 0.3; }); // Pattern-based matching tokens.forEach((token) => { const mappedTopic = TOPIC_PATTERNS[token]; if (mappedTopic === topic) { score += 0.5; } }); if (score > 0) { // Normalize by text length (log scale to avoid penalizing long texts too much) const normalizedScore = score / Math.log2(tokens.length + 2); matches.set(topic, Math.min(1, normalizedScore)); } }); // Convert to sorted array return Array.from(matches.entries()) .map(([topic, confidence]) => ({ topic, confidence })) .sort((a, b) => b.confidence - a.confidence); } /** * Generate topic suggestions based on content * @param {Array<{term: string, score: number}>} keyTerms - Key terms from text * @param {string[]} existingTopics - Topics to avoid suggesting * @returns {string[]} Suggested new topic names */ function generateSuggestions(keyTerms, existingTopics) { const existingSet = new Set(existingTopics.map((t) => t.toLowerCase())); const suggestions = new Set(); // Strategy 1: Use known patterns for top terms keyTerms.slice(0, 15).forEach(({ term }) => { const mapped = TOPIC_PATTERNS[term]; if (mapped && !existingSet.has(mapped)) { suggestions.add(mapped); } }); // Strategy 2: Create compound topics from top co-occurring terms if (keyTerms.length >= 2 && suggestions.size < CONFIG.maxSuggestions) { const topTerms = keyTerms.slice(0, 5).map((t) => t.term); // Look for related pairs const pairs = [ ["api", "integration"], ["code", "review"], ["data", "analysis"], ["error", "handling"], ["file", "management"], ["memory", "optimization"], ["performance", "tuning"], ["security", "audit"], ["system", "design"], ["user", "interface"], ]; pairs.forEach(([a, b]) => { if (topTerms.some((t) => t.includes(a)) && topTerms.some((t) => t.includes(b))) { const compound = `${a}-${b}`; if (!existingSet.has(compound)) { suggestions.add(compound); } } }); } // Strategy 3: Use top-scoring term as-is if it's descriptive enough if (suggestions.size < CONFIG.maxSuggestions) { keyTerms.slice(0, 5).forEach(({ term, score }) => { // Only use single terms that are sufficiently meaningful if (score > 0.15 && term.length > 4 && !existingSet.has(term)) { suggestions.add(term); } }); } return Array.from(suggestions).slice(0, CONFIG.maxSuggestions); } /** * Load discovered topics from state file * @returns {Object} Discovered topics data */ function loadDiscoveredTopics() { try { if (fs.existsSync(CONFIG.discoveredTopicsPath)) { return JSON.parse(fs.readFileSync(CONFIG.discoveredTopicsPath, "utf8")); } } catch (e) { console.error("Failed to load discovered topics:", e.message); } return { version: 1, topics: {}, lastUpdated: null, }; } /** * Save discovered topics to state file * @param {Object} data - Topics data to save */ function saveDiscoveredTopics(data) { try { data.lastUpdated = new Date().toISOString(); fs.writeFileSync(CONFIG.discoveredTopicsPath, JSON.stringify(data, null, 2)); } catch (e) { console.error("Failed to save discovered topics:", e.message); } } /** * Update discovered topics with new suggestions * @param {string[]} suggestions - New topic suggestions * @param {string} sessionKey - Source session identifier */ function updateDiscoveredTopics(suggestions, sessionKey) { const data = loadDiscoveredTopics(); suggestions.forEach((topic) => { if (!data.topics[topic]) { data.topics[topic] = { firstSeen: new Date().toISOString(), occurrences: 0, sessions: [], }; } data.topics[topic].occurrences++; data.topics[topic].lastSeen = new Date().toISOString(); if (!data.topics[topic].sessions.includes(sessionKey)) { data.topics[topic].sessions.push(sessionKey); // Keep only last 10 sessions if (data.topics[topic].sessions.length > 10) { data.topics[topic].sessions.shift(); } } }); saveDiscoveredTopics(data); } /** * Main classification function * Analyzes transcript content to match existing topics and suggest new ones * * @param {string|Array} transcript - Session transcript (string or array of messages) * @param {string[]} existingTopics - List of existing topic names * @param {Object} options - Optional configuration * @param {string} options.sessionKey - Session identifier for tracking * @param {boolean} options.persist - Whether to persist discovered topics (default: true) * @returns {{matched: Array<{topic: string, confidence: number}>, suggested: string[], keyTerms: Array}} */ function classifyAndSuggestTopics(transcript, existingTopics = [], options = {}) { // Normalize transcript to text let text = ""; if (Array.isArray(transcript)) { text = transcript .map((entry) => { if (typeof entry === "string") return entry; if (entry.text) return entry.text; if (entry.message?.content) { const content = entry.message.content; if (typeof content === "string") return content; if (Array.isArray(content)) { return content .filter((c) => c.type === "text") .map((c) => c.text || "") .join(" "); } } return ""; }) .join("\n"); } else if (typeof transcript === "string") { text = transcript; } if (!text || text.length < 20) { return { matched: [], suggested: [], keyTerms: [] }; } // Extract key terms const keyTerms = extractKeyTerms(text); // Match against existing topics const matched = matchTopics(text, existingTopics); // Determine if we need suggestions const bestMatch = matched[0]; const needsSuggestions = !bestMatch || bestMatch.confidence < CONFIG.matchThreshold; let suggested = []; if (needsSuggestions) { suggested = generateSuggestions(keyTerms, existingTopics); // Persist discovered topics if enabled if (options.persist !== false && suggested.length > 0 && options.sessionKey) { updateDiscoveredTopics(suggested, options.sessionKey); } } return { matched: matched.slice(0, 5), suggested, keyTerms: keyTerms.slice(0, 10), confidence: bestMatch?.confidence || 0, }; } /** * Get all discovered topics sorted by occurrence * @returns {Array<{name: string, occurrences: number, sessions: number}>} */ function getDiscoveredTopics() { const data = loadDiscoveredTopics(); return Object.entries(data.topics) .map(([name, info]) => ({ name, occurrences: info.occurrences, sessions: info.sessions?.length || 0, firstSeen: info.firstSeen, lastSeen: info.lastSeen, })) .sort((a, b) => b.occurrences - a.occurrences); } /** * Promote a discovered topic to the official topic list * Returns the topic data for external handling * @param {string} topicName - Topic to promote * @returns {Object|null} Topic data or null if not found */ function promoteDiscoveredTopic(topicName) { const data = loadDiscoveredTopics(); if (data.topics[topicName]) { const topicData = { ...data.topics[topicName], name: topicName }; delete data.topics[topicName]; saveDiscoveredTopics(data); return topicData; } return null; } // Export public API module.exports = { classifyAndSuggestTopics, getDiscoveredTopics, promoteDiscoveredTopic, extractKeyTerms, matchTopics, // Export config for testing/tuning CONFIG, TOPIC_PATTERNS, };