Initial commit with translated description
This commit is contained in:
700
scripts/topic-classifier.js
Normal file
700
scripts/topic-classifier.js
Normal file
@@ -0,0 +1,700 @@
|
||||
/**
|
||||
* Topic Classifier for OpenClaw Sessions
|
||||
*
|
||||
* Analyzes session transcript content to:
|
||||
* - Match against existing topics
|
||||
* - Detect when existing topics don't fit well
|
||||
* - Suggest new topic names based on content patterns
|
||||
* - Maintain a discovered-topics.json file for learned topics
|
||||
*
|
||||
* @module topic-classifier
|
||||
*/
|
||||
|
||||
const fs = require("fs");
|
||||
const path = require("path");
|
||||
const { CONFIG: APP_CONFIG } = require("../src/config");
|
||||
|
||||
// Default config
|
||||
// Tunable classification thresholds and state-file locations.
const CONFIG = {
  // Minimum TF-IDF score for a term to be kept by extractKeyTerms
  minTermScore: 0.1,
  // Best-match confidence below which classifyAndSuggestTopics generates new-topic suggestions
  matchThreshold: 0.3,
  // Minimum raw occurrences for a term to be considered at all
  minTermFrequency: 2,
  // Path to discovered topics state file (persisted learned topics)
  discoveredTopicsPath: path.join(APP_CONFIG.paths.state, "discovered-topics.json"),
  // Maximum suggested topics per classification
  maxSuggestions: 3,
};
|
||||
|
||||
// Stop words to filter out (common English words)
|
||||
// Stop words filtered out during tokenization: common English function words,
// conversational filler, and tech-generic words too common to be discriminative.
// Duplicate entries from the original list ("no", "just") have been removed;
// Set membership was unaffected by them, but the literal is now canonical.
const STOP_WORDS = new Set([
  // Articles, conjunctions, prepositions
  "a", "an", "the", "and", "or", "but", "in", "on", "at", "to", "for", "of",
  "with", "by", "from", "up", "about", "into", "through", "during", "before",
  "after", "above", "below", "between", "under",
  // Adverbs, determiners, misc function words
  "again", "further", "then", "once", "here", "there", "when", "where", "why",
  "how", "all", "each", "few", "more", "most", "other", "some", "such", "no",
  "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t",
  "can", "will", "just", "don", "should", "now",
  // Pronouns and demonstratives
  "i", "me", "my", "myself", "we", "our", "ours", "you", "your", "yours",
  "he", "him", "his", "she", "her", "hers", "it", "its", "they", "them",
  "their", "theirs", "what", "which", "who", "whom", "this", "that", "these",
  "those",
  // Verb forms (be/have/do) and common conversational verbs
  "am", "is", "are", "was", "were", "be", "been", "being", "have", "has",
  "had", "having", "do", "does", "did", "doing", "would", "could", "ought",
  "let", "like", "need", "want", "got", "get", "make", "made", "see", "look",
  "think", "know", "take", "come", "go", "say", "said", "tell", "told", "ask",
  "use", "used", "find", "give", "gave",
  // Conversational filler
  "yes", "ok", "okay", "yeah", "sure", "right", "well", "also", "really",
  "actually", "basically", "probably", "maybe",
  // Tech-common words that are too generic
  "file", "code", "run", "check", "help", "please", "thanks", "hello", "hi",
  "hey", "good", "great", "nice", "cool", "awesome", "perfect",
]);
|
||||
|
||||
// Known topic patterns for seeding - maps keywords to topic names
|
||||
// Known topic patterns for seeding - maps keywords to topic names.
// Built on a null prototype so that bare `TOPIC_PATTERNS[term]` lookups
// elsewhere in this module cannot accidentally resolve inherited
// Object.prototype members (a transcript containing "constructor" or
// "toString" must not be treated as a known pattern).
const TOPIC_PATTERNS = Object.assign(Object.create(null), {
  // Development
  git: "version-control",
  github: "version-control",
  commit: "version-control",
  branch: "version-control",
  merge: "version-control",
  pull: "version-control",
  push: "version-control",

  debug: "debugging",
  error: "debugging",
  bug: "debugging",
  fix: "debugging",
  stack: "debugging",
  trace: "debugging",
  exception: "debugging",

  test: "testing",
  unittest: "testing",
  jest: "testing",
  pytest: "testing",
  coverage: "testing",

  deploy: "deployment",
  production: "deployment",
  staging: "deployment",
  ci: "deployment",
  cd: "deployment",
  pipeline: "deployment",

  api: "api-integration",
  endpoint: "api-integration",
  rest: "api-integration",
  graphql: "api-integration",
  webhook: "api-integration",

  database: "database",
  sql: "database",
  postgres: "database",
  mysql: "database",
  mongodb: "database",
  query: "database",

  docker: "containers",
  kubernetes: "containers",
  k8s: "containers",
  container: "containers",
  pod: "containers",

  aws: "cloud-infra",
  gcp: "cloud-infra",
  azure: "cloud-infra",
  terraform: "cloud-infra",
  cloudformation: "cloud-infra",

  // Communication
  slack: "slack-integration",
  channel: "slack-integration",
  message: "messaging",
  email: "email",
  notification: "notifications",

  // Automation
  cron: "scheduling",
  schedule: "scheduling",
  timer: "scheduling",
  job: "scheduling",

  script: "automation",
  automate: "automation",
  workflow: "automation",

  // Research
  research: "research",
  search: "research",
  wikipedia: "research",
  lookup: "research",

  // Finance
  finance: "finance",
  investment: "finance",
  stock: "finance",
  portfolio: "finance",
  budget: "finance",

  // System
  config: "configuration",
  settings: "configuration",
  setup: "configuration",
  install: "setup",

  // Writing
  document: "documentation",
  readme: "documentation",
  docs: "documentation",
  write: "writing",
  draft: "writing",

  // AI/ML
  model: "ai-ml",
  claude: "ai-ml",
  openai: "ai-ml",
  gpt: "ai-ml",
  llm: "ai-ml",
  prompt: "prompt-engineering",

  // UI
  dashboard: "dashboard",
  ui: "ui-development",
  frontend: "ui-development",
  css: "ui-development",
  html: "ui-development",
  react: "ui-development",
});
|
||||
|
||||
/**
 * Split raw text into lowercase word tokens suitable for scoring.
 *
 * Strips fenced code blocks, inline code, and URLs first, then keeps only
 * alphanumeric/hyphen words of 3-29 characters that are neither stop words
 * nor purely numeric.
 *
 * NOTE(review): the `length > 2` filter means two-letter TOPIC_PATTERNS
 * keys such as "ci" and "cd" can never be produced here — confirm whether
 * that is intentional.
 *
 * @param {string} text - Raw text to tokenize
 * @returns {string[]} Array of lowercase tokens
 */
function tokenize(text) {
  if (!text || typeof text !== "string") return [];

  const cleaned = text
    .toLowerCase()
    .replace(/```[\s\S]*?```/g, " ") // fenced code blocks
    .replace(/`[^`]+`/g, " ") // inline code
    .replace(/https?:\/\/\S+/g, " ") // URLs
    .replace(/[^a-z0-9\s-]/g, " "); // punctuation (hyphens inside words kept)

  const isUsable = (token) =>
    token.length > 2 && token.length < 30 && !STOP_WORDS.has(token) && !/^\d+$/.test(token);

  return cleaned.split(/\s+/).filter(isUsable);
}
|
||||
|
||||
/**
 * Build a length-normalized term-frequency map for one document.
 *
 * Each token's raw count is divided by the total token count, so the values
 * of a non-empty input sum to 1. An empty token list yields an empty map.
 *
 * @param {string[]} tokens - Array of tokens
 * @returns {Map<string, number>} Term frequency map
 */
function calculateTF(tokens) {
  const frequencies = new Map();

  for (const token of tokens) {
    frequencies.set(token, (frequencies.get(token) ?? 0) + 1);
  }

  // `|| 1` guards the division for empty input.
  const divisor = tokens.length || 1;
  for (const [term, count] of frequencies) {
    frequencies.set(term, count / divisor);
  }

  return frequencies;
}
|
||||
|
||||
/**
 * Approximate inverse document frequency for a single document.
 *
 * There is no multi-document corpus here, so term rarity within the
 * document serves as a proxy: rarer terms score higher, and terms that are
 * known TOPIC_PATTERNS keywords get a 2x boost.
 *
 * @param {Map<string, number>} tf - Normalized term frequency map
 * @param {number} vocabSize - Size of vocabulary
 * @returns {Map<string, number>} IDF scores (clamped to >= 0)
 */
function calculateIDF(tf, vocabSize) {
  const idf = new Map();

  tf.forEach((freq, term) => {
    // Own-property check so inherited Object.prototype keys (e.g. a
    // transcript containing "constructor" or "valueOf") are not mistaken
    // for known topic patterns and wrongly boosted.
    const patternBoost = Object.hasOwn(TOPIC_PATTERNS, term) ? 2.0 : 1.0;
    // Simple IDF approximation: rarer terms get higher scores
    const score = Math.log(vocabSize / (1 + freq * vocabSize)) * patternBoost;
    idf.set(term, Math.max(0, score));
  });

  return idf;
}
|
||||
|
||||
/**
 * Extract the most significant terms from a text using TF-IDF scoring.
 *
 * Terms must occur at least CONFIG.minTermFrequency times and score at
 * least CONFIG.minTermScore to be included.
 *
 * @param {string} text - Text to analyze
 * @returns {Array<{term: string, score: number, count: number}>} Terms ranked by score, descending
 */
function extractKeyTerms(text) {
  const tokens = tokenize(text);
  if (tokens.length === 0) return [];

  // Raw occurrence counts, computed once up front. (The previous
  // implementation re-scanned the entire token list for every distinct
  // term, which was accidentally O(n^2).)
  const rawCounts = new Map();
  tokens.forEach((token) => {
    rawCounts.set(token, (rawCounts.get(token) || 0) + 1);
  });

  const tf = calculateTF(tokens);
  const idf = calculateIDF(tf, tf.size);

  const tfidf = [];
  tf.forEach((tfScore, term) => {
    const score = tfScore * (idf.get(term) || 0);
    const count = rawCounts.get(term) || 0;

    // Only include terms that meet both minimum thresholds
    if (count >= CONFIG.minTermFrequency && score >= CONFIG.minTermScore) {
      tfidf.push({ term, score, count });
    }
  });

  // Sort by score descending
  return tfidf.sort((a, b) => b.score - a.score);
}
|
||||
|
||||
/**
 * Score a text against a list of existing topic names.
 *
 * Each topic is scored two ways: token overlap with the topic name itself
 * (0.3 per hit, substring matches in either direction count), and hits on
 * TOPIC_PATTERNS keywords that map to that topic (0.5 each). Raw scores are
 * normalized by log2 of the text length and capped at 1.
 *
 * @param {string} text - Text to match
 * @param {string[]} existingTopics - List of existing topic names
 * @returns {Array<{topic: string, confidence: number}>} Matches sorted by confidence, descending
 */
function matchTopics(text, existingTopics) {
  const tokens = tokenize(text);
  const matches = new Map();

  for (const topic of existingTopics) {
    let rawScore = 0;

    // Direct overlap between the text's tokens and the topic name's tokens.
    for (const topicToken of tokenize(topic)) {
      const hits = tokens.filter(
        (t) => t === topicToken || t.includes(topicToken) || topicToken.includes(t),
      ).length;
      rawScore += hits * 0.3;
    }

    // Seed-keyword hits that map to this topic.
    for (const token of tokens) {
      if (TOPIC_PATTERNS[token] === topic) {
        rawScore += 0.5;
      }
    }

    if (rawScore > 0) {
      // Normalize by text length (log scale avoids over-penalizing long texts).
      matches.set(topic, Math.min(1, rawScore / Math.log2(tokens.length + 2)));
    }
  }

  return [...matches]
    .map(([topic, confidence]) => ({ topic, confidence }))
    .sort((a, b) => b.confidence - a.confidence);
}
|
||||
|
||||
/**
 * Generate new-topic suggestions from a text's key terms.
 *
 * Three strategies, in priority order:
 *  1. Map top terms through the TOPIC_PATTERNS seed keywords.
 *  2. Form known compound topics (e.g. "error-handling") when both halves
 *     co-occur among the top terms.
 *  3. Fall back to sufficiently meaningful single terms used verbatim.
 * Topics already in use are never re-suggested; at most
 * CONFIG.maxSuggestions names are returned.
 *
 * @param {Array<{term: string, score: number}>} keyTerms - Key terms from text
 * @param {string[]} existingTopics - Topics to avoid suggesting
 * @returns {string[]} Suggested new topic names
 */
function generateSuggestions(keyTerms, existingTopics) {
  const existingSet = new Set(existingTopics.map((t) => t.toLowerCase()));
  const suggestions = new Set();

  // Strategy 1: Use known patterns for top terms. Own-property check so
  // inherited Object.prototype keys (e.g. a transcript containing
  // "constructor") cannot leak non-pattern values into the suggestion set.
  keyTerms.slice(0, 15).forEach(({ term }) => {
    if (Object.hasOwn(TOPIC_PATTERNS, term)) {
      const mapped = TOPIC_PATTERNS[term];
      if (!existingSet.has(mapped)) {
        suggestions.add(mapped);
      }
    }
  });

  // Strategy 2: Create compound topics from top co-occurring terms
  if (keyTerms.length >= 2 && suggestions.size < CONFIG.maxSuggestions) {
    const topTerms = keyTerms.slice(0, 5).map((t) => t.term);

    // Known compound-topic halves to look for among the top terms.
    const pairs = [
      ["api", "integration"],
      ["code", "review"],
      ["data", "analysis"],
      ["error", "handling"],
      ["file", "management"],
      ["memory", "optimization"],
      ["performance", "tuning"],
      ["security", "audit"],
      ["system", "design"],
      ["user", "interface"],
    ];

    pairs.forEach(([a, b]) => {
      if (topTerms.some((t) => t.includes(a)) && topTerms.some((t) => t.includes(b))) {
        const compound = `${a}-${b}`;
        if (!existingSet.has(compound)) {
          suggestions.add(compound);
        }
      }
    });
  }

  // Strategy 3: Use top-scoring term as-is if it's descriptive enough
  if (suggestions.size < CONFIG.maxSuggestions) {
    keyTerms.slice(0, 5).forEach(({ term, score }) => {
      // Only use single terms that are sufficiently meaningful
      if (score > 0.15 && term.length > 4 && !existingSet.has(term)) {
        suggestions.add(term);
      }
    });
  }

  return Array.from(suggestions).slice(0, CONFIG.maxSuggestions);
}
|
||||
|
||||
/**
 * Read the discovered-topics state file.
 *
 * Returns a fresh empty structure when the file is missing or cannot be
 * read/parsed; read failures are logged to stderr rather than thrown.
 *
 * @returns {{version: number, topics: Object, lastUpdated: ?string}} Discovered topics data
 */
function loadDiscoveredTopics() {
  // Fresh object per call so callers can mutate the result freely.
  const emptyState = { version: 1, topics: {}, lastUpdated: null };

  try {
    if (!fs.existsSync(CONFIG.discoveredTopicsPath)) {
      return emptyState;
    }
    const raw = fs.readFileSync(CONFIG.discoveredTopicsPath, "utf8");
    return JSON.parse(raw);
  } catch (e) {
    console.error("Failed to load discovered topics:", e.message);
    return emptyState;
  }
}
|
||||
|
||||
/**
 * Persist discovered-topics data to the state file as pretty-printed JSON.
 *
 * Stamps `lastUpdated` with the current time on a shallow copy, so the
 * caller's object is no longer mutated as a side effect (the previous
 * implementation wrote `data.lastUpdated` in place). Write failures are
 * logged to stderr, not thrown.
 *
 * @param {Object} data - Topics data to save
 */
function saveDiscoveredTopics(data) {
  try {
    // Shallow copy: avoid mutating the parameter owned by the caller.
    const payload = { ...data, lastUpdated: new Date().toISOString() };
    fs.writeFileSync(CONFIG.discoveredTopicsPath, JSON.stringify(payload, null, 2));
  } catch (e) {
    console.error("Failed to save discovered topics:", e.message);
  }
}
|
||||
|
||||
/**
 * Record new topic suggestions in the discovered-topics state file.
 *
 * Each suggested topic gets its occurrence count bumped, its lastSeen
 * timestamp refreshed, and the source session appended; only the 10 most
 * recent sessions are retained per topic.
 *
 * @param {string[]} suggestions - New topic suggestions
 * @param {string} sessionKey - Source session identifier
 */
function updateDiscoveredTopics(suggestions, sessionKey) {
  const data = loadDiscoveredTopics();
  const now = new Date().toISOString();

  for (const topic of suggestions) {
    if (!data.topics[topic]) {
      data.topics[topic] = { firstSeen: now, occurrences: 0, sessions: [] };
    }
    const entry = data.topics[topic];

    entry.occurrences += 1;
    entry.lastSeen = now;

    if (!entry.sessions.includes(sessionKey)) {
      entry.sessions.push(sessionKey);
      // Keep only the 10 most recent sessions
      if (entry.sessions.length > 10) {
        entry.sessions.shift();
      }
    }
  }

  saveDiscoveredTopics(data);
}
|
||||
|
||||
/**
 * Normalize a transcript into a single plain-text string.
 *
 * Accepts a raw string, or an array whose entries may be strings, objects
 * with a `text` field, or objects with `message.content` that is either a
 * string or an array of content blocks (only `type: "text"` blocks are
 * kept). Unrecognized entries contribute "".
 *
 * @param {string|Array} transcript - Session transcript
 * @returns {string} Concatenated text ("" when nothing usable)
 */
function transcriptToText(transcript) {
  if (typeof transcript === "string") return transcript;
  if (!Array.isArray(transcript)) return "";

  return transcript
    .map((entry) => {
      if (typeof entry === "string") return entry;
      if (entry.text) return entry.text;
      const content = entry.message?.content;
      if (typeof content === "string") return content;
      if (Array.isArray(content)) {
        // Structured content blocks: keep only the text parts.
        return content
          .filter((c) => c.type === "text")
          .map((c) => c.text || "")
          .join(" ");
      }
      return "";
    })
    .join("\n");
}

/**
 * Main classification function.
 * Analyzes transcript content to match existing topics and suggest new ones.
 *
 * Suggestions are generated only when no existing topic matches with
 * confidence >= CONFIG.matchThreshold; they are persisted to the
 * discovered-topics state file when `options.persist` is not false and a
 * session key is supplied.
 *
 * @param {string|Array} transcript - Session transcript (string or array of messages)
 * @param {string[]} existingTopics - List of existing topic names
 * @param {Object} options - Optional configuration
 * @param {string} options.sessionKey - Session identifier for tracking
 * @param {boolean} options.persist - Whether to persist discovered topics (default: true)
 * @returns {{matched: Array<{topic: string, confidence: number}>, suggested: string[], keyTerms: Array, confidence: number}}
 *   Note: the short-circuit result for near-empty text omits `confidence`, as before.
 */
function classifyAndSuggestTopics(transcript, existingTopics = [], options = {}) {
  const text = transcriptToText(transcript);

  // Too little content to classify meaningfully.
  if (!text || text.length < 20) {
    return { matched: [], suggested: [], keyTerms: [] };
  }

  // Extract key terms and match against existing topics.
  const keyTerms = extractKeyTerms(text);
  const matched = matchTopics(text, existingTopics);

  // Suggest new topics only when nothing matches well enough.
  const bestMatch = matched[0];
  const needsSuggestions = !bestMatch || bestMatch.confidence < CONFIG.matchThreshold;

  let suggested = [];
  if (needsSuggestions) {
    suggested = generateSuggestions(keyTerms, existingTopics);

    // Persist discovered topics if enabled and attributable to a session.
    if (options.persist !== false && suggested.length > 0 && options.sessionKey) {
      updateDiscoveredTopics(suggested, options.sessionKey);
    }
  }

  return {
    matched: matched.slice(0, 5),
    suggested,
    keyTerms: keyTerms.slice(0, 10),
    confidence: bestMatch?.confidence || 0,
  };
}
|
||||
|
||||
/**
 * List all discovered topics, most frequently seen first.
 *
 * @returns {Array<{name: string, occurrences: number, sessions: number, firstSeen: string, lastSeen: string}>}
 */
function getDiscoveredTopics() {
  const { topics } = loadDiscoveredTopics();

  const summaries = Object.entries(topics).map(([name, info]) => ({
    name,
    occurrences: info.occurrences,
    sessions: info.sessions?.length || 0,
    firstSeen: info.firstSeen,
    lastSeen: info.lastSeen,
  }));

  summaries.sort((a, b) => b.occurrences - a.occurrences);
  return summaries;
}
|
||||
|
||||
/**
 * Remove a topic from the discovered list so it can be promoted to the
 * official topic set; the removed record is returned for external handling.
 *
 * @param {string} topicName - Topic to promote
 * @returns {Object|null} Topic data (with a `name` field) or null if not found
 */
function promoteDiscoveredTopic(topicName) {
  const data = loadDiscoveredTopics();
  const entry = data.topics[topicName];

  if (!entry) return null;

  delete data.topics[topicName];
  saveDiscoveredTopics(data);

  return { ...entry, name: topicName };
}
|
||||
|
||||
// Export public API
module.exports = {
  // Primary entry point: classify a transcript and suggest topics
  classifyAndSuggestTopics,
  // Discovered-topics state queries / management
  getDiscoveredTopics,
  promoteDiscoveredTopic,
  // Lower-level analysis helpers
  extractKeyTerms,
  matchTopics,
  // Export config for testing/tuning
  CONFIG,
  TOPIC_PATTERNS,
};
|
||||
Reference in New Issue
Block a user