// jontsai_command-center/scripts/topic-classifier.js

/**
 * Topic Classifier for OpenClaw Sessions
 *
 * Analyzes session transcript content to:
 * - Match against existing topics
 * - Detect when existing topics don't fit well
 * - Suggest new topic names based on content patterns
 * - Maintain a discovered-topics.json file for learned topics
 *
 * @module topic-classifier
 */

const fs = require("fs");
const path = require("path");
const { CONFIG: APP_CONFIG } = require("../src/config");

// Default tuning knobs for the classifier.
// The object is exported from this module, so callers (e.g. tests) can read or
// adjust these values at runtime.
const CONFIG = {
  // Lower bound on a term's TF-IDF score; extractKeyTerms drops terms below it
  minTermScore: 0.1,
  // Best-match confidence below this value makes classifyAndSuggestTopics
  // generate new topic suggestions
  matchThreshold: 0.3,
  // Minimum raw occurrence count for a term to be kept by extractKeyTerms
  minTermFrequency: 2,
  // JSON state file read and written by load/saveDiscoveredTopics
  discoveredTopicsPath: path.join(APP_CONFIG.paths.state, "discovered-topics.json"),
  // Upper bound on the number of suggestions returned by generateSuggestions
  maxSuggestions: 3,
};

// Words that carry no topical signal and are discarded during tokenization.
// Entries are lowercase because tokenize() lowercases text before the lookup.
// prettier-ignore
const STOP_WORDS = new Set([
  // Articles, conjunctions, prepositions, adverbs and contraction fragments
  "a", "an", "the", "and", "or", "but", "in", "on", "at", "to",
  "for", "of", "with", "by", "from", "up", "about", "into", "through", "during",
  "before", "after", "above", "below", "between", "under", "again", "further", "then", "once",
  "here", "there", "when", "where", "why", "how", "all", "each", "few", "more",
  "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same",
  "so", "than", "too", "very", "s", "t", "can", "will", "just", "don",
  "should", "now",

  // Pronouns and determiners
  "i", "me", "my", "myself", "we", "our", "ours", "you", "your", "yours",
  "he", "him", "his", "she", "her", "hers", "it", "its", "they", "them",
  "their", "theirs", "what", "which", "who", "whom", "this", "that", "these", "those",

  // Auxiliary and modal verbs
  "am", "is", "are", "was", "were", "be", "been", "being", "have", "has",
  "had", "having", "do", "does", "did", "doing", "would", "could", "ought",

  // Common conversational verbs
  "let", "like", "need", "want", "got", "get", "make", "made", "see", "look",
  "think", "know", "take", "come", "go", "say", "said", "tell", "told", "ask",
  "use", "used", "find", "give", "gave",

  // Acknowledgements and filler
  "yes", "ok", "okay", "yeah", "sure", "right", "well", "also", "really", "actually",
  "basically", "probably", "maybe",

  // Tech-common words that are too generic
  "file", "code", "run", "check", "help", "please", "thanks", "hello", "hi", "hey",
  "good", "great", "nice", "cool", "awesome", "perfect",
]);

// Known topic patterns for seeding - maps keywords to topic names.
// Authored as topic -> keywords for readability, then inverted into the
// keyword -> topic lookup that the rest of the module indexes by token.
// NOTE: tokenize() discards tokens of two characters or fewer, so the
// two-letter keywords below ("ci", "cd", "ui") are never matched by token lookup.
const TOPIC_PATTERNS = Object.fromEntries(
  Object.entries({
    // Development
    "version-control": ["git", "github", "commit", "branch", "merge", "pull", "push"],
    debugging: ["debug", "error", "bug", "fix", "stack", "trace", "exception"],
    testing: ["test", "unittest", "jest", "pytest", "coverage"],
    deployment: ["deploy", "production", "staging", "ci", "cd", "pipeline"],
    "api-integration": ["api", "endpoint", "rest", "graphql", "webhook"],
    database: ["database", "sql", "postgres", "mysql", "mongodb", "query"],
    containers: ["docker", "kubernetes", "k8s", "container", "pod"],
    "cloud-infra": ["aws", "gcp", "azure", "terraform", "cloudformation"],

    // Communication
    "slack-integration": ["slack", "channel"],
    messaging: ["message"],
    email: ["email"],
    notifications: ["notification"],

    // Automation
    scheduling: ["cron", "schedule", "timer", "job"],
    automation: ["script", "automate", "workflow"],

    // Research
    research: ["research", "search", "wikipedia", "lookup"],

    // Finance
    finance: ["finance", "investment", "stock", "portfolio", "budget"],

    // System
    configuration: ["config", "settings", "setup"],
    setup: ["install"],

    // Writing
    documentation: ["document", "readme", "docs"],
    writing: ["write", "draft"],

    // AI/ML
    "ai-ml": ["model", "claude", "openai", "gpt", "llm"],
    "prompt-engineering": ["prompt"],

    // UI
    dashboard: ["dashboard"],
    "ui-development": ["ui", "frontend", "css", "html", "react"],
  }).flatMap(([topic, keywords]) => keywords.map((keyword) => [keyword, topic])),
);

/**
 * Tokenize text into words.
 *
 * The text is lowercased, fenced code blocks, inline code spans and http(s)
 * URLs are removed, and every character other than a-z, 0-9, whitespace and
 * "-" is turned into a separator. Hyphens are kept only inside a word
 * ("end-to-end"); leading and trailing hyphens are stripped so that markdown
 * rules ("---"), CLI flags ("--verbose") and list bullets do not become
 * spurious tokens.
 *
 * A token is kept when it is 3 to 29 characters long, is not in STOP_WORDS
 * and is not made up of digits only.
 *
 * @param {string} text - Raw text to tokenize
 * @returns {string[]} Array of lowercase tokens (empty for non-string or empty input)
 */
function tokenize(text) {
  if (!text || typeof text !== "string") return [];

  return (
    text
      .toLowerCase()
      // Remove code blocks
      .replace(/```[\s\S]*?```/g, " ")
      // Remove inline code
      .replace(/`[^`]+`/g, " ")
      // Remove URLs
      .replace(/https?:\/\/\S+/g, " ")
      // Remove special characters but keep hyphens in words
      .replace(/[^a-z0-9\s-]/g, " ")
      // Split on whitespace
      .split(/\s+/)
      // A hyphen is only meaningful between word characters; drop edge hyphens
      .map((token) => token.replace(/^-+|-+$/g, ""))
      // Filter valid tokens
      .filter(
        (token) =>
          token.length > 2 && token.length < 30 && !STOP_WORDS.has(token) && !/^\d+$/.test(token),
      )
  );
}

/**
 * Compute the relative frequency of every distinct token.
 *
 * Each term maps to (occurrences of the term) / (total number of tokens).
 * Terms appear in the result in order of first occurrence. An empty token
 * list yields an empty map.
 *
 * @param {string[]} tokens - Array of tokens
 * @returns {Map<string, number>} Term frequency map
 */
function calculateTF(tokens) {
  const occurrences = new Map();
  for (const token of tokens) {
    occurrences.set(token, (occurrences.get(token) ?? 0) + 1);
  }

  // Guard the divisor so an empty document never divides by zero
  const documentLength = tokens.length || 1;

  return new Map(Array.from(occurrences, ([term, count]) => [term, count / documentLength]));
}

/**
 * Approximate inverse document frequency for a single document.
 *
 * There is no corpus to compare against, so a term's own relative frequency
 * is used as a rarity proxy: score = ln(vocabSize / (1 + freq * vocabSize)),
 * which shrinks as the term becomes more frequent. Terms that are keywords of
 * TOPIC_PATTERNS get their score doubled. Negative scores are clamped to 0.
 *
 * @param {Map<string, number>} tf - Term frequency map
 * @param {number} vocabSize - Size of vocabulary
 * @returns {Map<string, number>} IDF scores (never negative)
 */
function calculateIDF(tf, vocabSize) {
  const idf = new Map();

  tf.forEach((freq, term) => {
    // Own-property check: a plain `TOPIC_PATTERNS[term]` lookup would also find
    // inherited members, so the token "constructor" would be boosted by mistake.
    const isPatternKeyword =
      Object.prototype.hasOwnProperty.call(TOPIC_PATTERNS, term) && Boolean(TOPIC_PATTERNS[term]);
    // Boost terms that appear in known patterns
    const patternBoost = isPatternKeyword ? 2.0 : 1.0;
    // Simple IDF approximation: rarer terms get higher scores
    const score = Math.log(vocabSize / (1 + freq * vocabSize)) * patternBoost;
    idf.set(term, Math.max(0, score));
  });

  return idf;
}

/**
 * Extract key terms using TF-IDF.
 *
 * The text is tokenized, each distinct term is scored as
 * (term frequency) * (approximate IDF), and a term is kept when it occurs at
 * least CONFIG.minTermFrequency times and scores at least CONFIG.minTermScore.
 *
 * @param {string} text - Text to analyze
 * @returns {Array<{term: string, score: number, count: number}>} Terms ranked by
 *   descending score; `count` is the raw number of occurrences in the text
 */
function extractKeyTerms(text) {
  const tokens = tokenize(text);
  if (tokens.length === 0) return [];

  const tf = calculateTF(tokens);
  const idf = calculateIDF(tf, tf.size);

  // Tally raw occurrences in one pass instead of re-scanning the whole token
  // list for every distinct term.
  const rawCounts = new Map();
  for (const token of tokens) {
    rawCounts.set(token, (rawCounts.get(token) || 0) + 1);
  }

  const tfidf = [];
  tf.forEach((tfScore, term) => {
    const idfScore = idf.get(term) || 0;
    const score = tfScore * idfScore;

    // Only include terms that meet minimum thresholds
    const rawCount = rawCounts.get(term) || 0;
    if (rawCount >= CONFIG.minTermFrequency && score >= CONFIG.minTermScore) {
      tfidf.push({ term, score, count: rawCount });
    }
  });

  // Sort by score descending
  return tfidf.sort((a, b) => b.score - a.score);
}

/**
 * Score how well a text matches each of the given topics.
 *
 * A topic earns 0.3 for every text token that contains, or is contained in,
 * one of the topic name's own tokens, plus 0.5 for every text token that
 * TOPIC_PATTERNS maps to exactly that topic name. The total is divided by
 * log2(tokenCount + 2) and capped at 1. Topics that score nothing are omitted.
 *
 * @param {string} text - Text to match
 * @param {string[]} existingTopics - List of existing topic names
 * @returns {Array<{topic: string, confidence: number}>} Matched topics, highest confidence first
 */
function matchTopics(text, existingTopics) {
  const textTokens = tokenize(text);
  // Log-scaled length divisor, shared by every topic, so long texts are not
  // penalized too heavily
  const lengthDivisor = Math.log2(textTokens.length + 2);
  const confidenceByTopic = new Map();

  existingTopics.forEach((topic) => {
    // Fuzzy overlap between the topic name's tokens and the text's tokens
    const nameOverlapScore = tokenize(topic).reduce((runningScore, nameToken) => {
      const overlapping = textTokens.filter(
        (textToken) => textToken.includes(nameToken) || nameToken.includes(textToken),
      );
      return runningScore + overlapping.length * 0.3;
    }, 0);

    // Keyword hits from the seeded pattern table
    let total = nameOverlapScore;
    for (const textToken of textTokens) {
      if (TOPIC_PATTERNS[textToken] === topic) {
        total += 0.5;
      }
    }

    if (total <= 0) return;
    confidenceByTopic.set(topic, Math.min(1, total / lengthDivisor));
  });

  const ranked = [];
  for (const [topic, confidence] of confidenceByTopic) {
    ranked.push({ topic, confidence });
  }
  ranked.sort((left, right) => right.confidence - left.confidence);
  return ranked;
}

/**
 * Generate topic suggestions based on content.
 *
 * Three strategies run in order, each adding to a de-duplicated list:
 *   1. Map the top 15 key terms through TOPIC_PATTERNS.
 *   2. If fewer than CONFIG.maxSuggestions were found, emit a compound name
 *      ("api-integration", ...) when both halves of a known pair appear as
 *      substrings of the top 5 terms.
 *   3. If still short, use top-5 terms verbatim when they score above 0.15
 *      and are longer than 4 characters.
 * Names already present in `existingTopics` (compared case-insensitively) are
 * never suggested. At most CONFIG.maxSuggestions names are returned.
 *
 * @param {Array<{term: string, score: number}>} keyTerms - Key terms from text
 * @param {string[]} existingTopics - Topics to avoid suggesting
 * @returns {string[]} Suggested new topic names
 */
function generateSuggestions(keyTerms, existingTopics) {
  const existingSet = new Set(existingTopics.map((t) => t.toLowerCase()));
  const suggestions = new Set();

  // Strategy 1: Use known patterns for top terms
  keyTerms.slice(0, 15).forEach(({ term }) => {
    // Only own keys are real patterns. A bare `TOPIC_PATTERNS[term]` lookup
    // returns the inherited Object constructor for the term "constructor",
    // which would then be emitted as a (non-string) suggestion.
    const mapped = Object.prototype.hasOwnProperty.call(TOPIC_PATTERNS, term)
      ? TOPIC_PATTERNS[term]
      : undefined;
    if (typeof mapped === "string" && mapped && !existingSet.has(mapped)) {
      suggestions.add(mapped);
    }
  });

  // Strategy 2: Create compound topics from top co-occurring terms
  if (keyTerms.length >= 2 && suggestions.size < CONFIG.maxSuggestions) {
    const topTerms = keyTerms.slice(0, 5).map((t) => t.term);

    // Look for related pairs
    const pairs = [
      ["api", "integration"],
      ["code", "review"],
      ["data", "analysis"],
      ["error", "handling"],
      ["file", "management"],
      ["memory", "optimization"],
      ["performance", "tuning"],
      ["security", "audit"],
      ["system", "design"],
      ["user", "interface"],
    ];

    pairs.forEach(([a, b]) => {
      if (topTerms.some((t) => t.includes(a)) && topTerms.some((t) => t.includes(b))) {
        const compound = `${a}-${b}`;
        if (!existingSet.has(compound)) {
          suggestions.add(compound);
        }
      }
    });
  }

  // Strategy 3: Use top-scoring term as-is if it's descriptive enough
  if (suggestions.size < CONFIG.maxSuggestions) {
    keyTerms.slice(0, 5).forEach(({ term, score }) => {
      // Only use single terms that are sufficiently meaningful
      if (score > 0.15 && term.length > 4 && !existingSet.has(term)) {
        suggestions.add(term);
      }
    });
  }

  return Array.from(suggestions).slice(0, CONFIG.maxSuggestions);
}

/**
 * Load discovered topics from the state file at CONFIG.discoveredTopicsPath.
 *
 * Never throws: a missing file, unreadable file, invalid JSON or JSON that is
 * not an object all produce a fresh empty state (read/parse problems are
 * logged with console.error). If the file holds an object whose `topics`
 * member is missing or not a plain object, `topics` is reset to `{}` so
 * callers can always index `data.topics` safely.
 *
 * @returns {{version: number, topics: Object, lastUpdated: (string|null)}} Discovered topics data
 */
function loadDiscoveredTopics() {
  const isRecord = (value) =>
    value !== null && typeof value === "object" && !Array.isArray(value);

  try {
    if (fs.existsSync(CONFIG.discoveredTopicsPath)) {
      const parsed = JSON.parse(fs.readFileSync(CONFIG.discoveredTopicsPath, "utf8"));

      if (isRecord(parsed)) {
        // Valid JSON is not necessarily valid state (e.g. a hand-edited or
        // truncated file); repair the one member every caller dereferences.
        if (!isRecord(parsed.topics)) {
          parsed.topics = {};
        }
        return parsed;
      }

      console.error("Failed to load discovered topics:", "unexpected JSON shape");
    }
  } catch (e) {
    console.error("Failed to load discovered topics:", e.message);
  }

  return {
    version: 1,
    topics: {},
    lastUpdated: null,
  };
}

/**
 * Persist discovered topics to the state file at CONFIG.discoveredTopicsPath.
 *
 * Stamps `data.lastUpdated` with the current time (mutating the passed
 * object) and writes the data as 2-space-indented JSON. Failures are logged
 * with console.error and not rethrown.
 *
 * @param {Object} data - Topics data to save
 */
function saveDiscoveredTopics(data) {
  try {
    const stamp = new Date().toISOString();
    data.lastUpdated = stamp;

    const serialized = JSON.stringify(data, null, 2);
    fs.writeFileSync(CONFIG.discoveredTopicsPath, serialized);
  } catch (err) {
    console.error("Failed to save discovered topics:", err.message);
  }
}

/**
 * Record new topic suggestions in the discovered-topics state file.
 *
 * For every suggested name the stored entry is created if needed, its
 * occurrence counter is incremented, `lastSeen` is refreshed and the session
 * key is appended to the entry's session list (no duplicates, only the 10
 * most recent are kept). The state is loaded once and saved once.
 *
 * @param {string[]} suggestions - New topic suggestions
 * @param {string} sessionKey - Source session identifier
 */
function updateDiscoveredTopics(suggestions, sessionKey) {
  const data = loadDiscoveredTopics();
  // One timestamp per update keeps firstSeen/lastSeen consistent across entries
  const now = new Date().toISOString();

  suggestions.forEach((topic) => {
    // Topic names become object keys, so anything but a non-empty string is skipped
    if (typeof topic !== "string" || topic === "") return;

    // Own-property check: `data.topics[topic]` alone would resolve names such
    // as "constructor" to inherited Object.prototype members, which would then
    // be mutated (global pollution) and crash on the missing `sessions` array.
    const isStored =
      Object.prototype.hasOwnProperty.call(data.topics, topic) && Boolean(data.topics[topic]);
    if (!isStored) {
      data.topics[topic] = {
        firstSeen: now,
        occurrences: 0,
        sessions: [],
      };
    }

    const entry = data.topics[topic];

    // Tolerate entries from a hand-edited or older state file
    entry.occurrences = (Number(entry.occurrences) || 0) + 1;
    entry.lastSeen = now;
    if (!Array.isArray(entry.sessions)) {
      entry.sessions = [];
    }

    if (!entry.sessions.includes(sessionKey)) {
      entry.sessions.push(sessionKey);
      // Keep only last 10 sessions
      if (entry.sessions.length > 10) {
        entry.sessions.shift();
      }
    }
  });

  saveDiscoveredTopics(data);
}

/**
 * Pull the plain text out of one transcript entry.
 *
 * Accepts a raw string, an object with a `text` member, or an object with
 * `message.content` that is either a string or an array of parts (only parts
 * with `type === "text"` contribute). Anything else, including null or
 * undefined entries and parts, yields an empty string.
 *
 * @param {*} entry - One element of a transcript array
 * @returns {string} Text of the entry, or "" when none can be extracted
 */
function transcriptEntryToText(entry) {
  if (typeof entry === "string") return entry;
  if (entry == null) return "";
  if (entry.text) return entry.text;

  const content = entry.message?.content;
  if (typeof content === "string") return content;
  if (Array.isArray(content)) {
    return content
      .filter((part) => part?.type === "text")
      .map((part) => part.text || "")
      .join(" ");
  }
  return "";
}

/**
 * Main classification function
 * Analyzes transcript content to match existing topics and suggest new ones
 *
 * Suggestions are only generated when no existing topic matches with at least
 * CONFIG.matchThreshold confidence. When suggestions are produced, a
 * `sessionKey` is given and `persist` is not `false`, they are also recorded
 * in the discovered-topics state file.
 *
 * @param {string|Array} transcript - Session transcript (string or array of messages)
 * @param {string[]} existingTopics - List of existing topic names
 * @param {Object} options - Optional configuration
 * @param {string} options.sessionKey - Session identifier for tracking
 * @param {boolean} options.persist - Whether to persist discovered topics (default: true)
 * @returns {{matched: Array<{topic: string, confidence: number}>, suggested: string[], keyTerms: Array, confidence: number}}
 *   `matched` holds at most 5 entries, `keyTerms` at most 10; `confidence` is the
 *   best match's confidence, or 0 when nothing matched. Transcripts with fewer
 *   than 20 characters of text return an all-empty result.
 */
function classifyAndSuggestTopics(transcript, existingTopics = [], options = {}) {
  // Default parameters only cover `undefined`; treat an explicit null the same way
  const topics = existingTopics ?? [];
  const opts = options ?? {};

  // Normalize transcript to text
  let text = "";
  if (Array.isArray(transcript)) {
    text = transcript.map((entry) => transcriptEntryToText(entry)).join("\n");
  } else if (typeof transcript === "string") {
    text = transcript;
  }

  if (!text || text.length < 20) {
    // Same shape as the full result so callers can read `confidence` unconditionally
    return { matched: [], suggested: [], keyTerms: [], confidence: 0 };
  }

  // Extract key terms
  const keyTerms = extractKeyTerms(text);

  // Match against existing topics
  const matched = matchTopics(text, topics);

  // Determine if we need suggestions
  const bestMatch = matched[0];
  const needsSuggestions = !bestMatch || bestMatch.confidence < CONFIG.matchThreshold;

  let suggested = [];
  if (needsSuggestions) {
    suggested = generateSuggestions(keyTerms, topics);

    // Persist discovered topics if enabled
    if (opts.persist !== false && suggested.length > 0 && opts.sessionKey) {
      updateDiscoveredTopics(suggested, opts.sessionKey);
    }
  }

  return {
    matched: matched.slice(0, 5),
    suggested,
    keyTerms: keyTerms.slice(0, 10),
    confidence: bestMatch?.confidence || 0,
  };
}

/**
 * List every discovered topic from the state file, most frequent first.
 *
 * `sessions` in the result is the number of stored session keys (0 when the
 * stored entry has no session list), not the keys themselves.
 *
 * @returns {Array<{name: string, occurrences: number, sessions: number, firstSeen: string, lastSeen: string}>}
 */
function getDiscoveredTopics() {
  const { topics } = loadDiscoveredTopics();
  const summaries = [];

  for (const [name, info] of Object.entries(topics)) {
    const sessionCount = info.sessions?.length || 0;
    summaries.push({
      name,
      occurrences: info.occurrences,
      sessions: sessionCount,
      firstSeen: info.firstSeen,
      lastSeen: info.lastSeen,
    });
  }

  summaries.sort((left, right) => right.occurrences - left.occurrences);
  return summaries;
}

/**
 * Promote a discovered topic to the official topic list
 *
 * Removes the topic from the discovered-topics state file and returns its
 * stored data (plus a `name` member) for external handling. When the topic is
 * not stored, nothing is written and null is returned.
 *
 * @param {string} topicName - Topic to promote
 * @returns {Object|null} Topic data or null if not found
 */
function promoteDiscoveredTopic(topicName) {
  const data = loadDiscoveredTopics();

  // Own-property check: names like "toString" or "constructor" exist on every
  // plain object via the prototype chain and must not count as stored topics.
  const isStored =
    Object.prototype.hasOwnProperty.call(data.topics, topicName) && Boolean(data.topics[topicName]);
  if (!isStored) {
    return null;
  }

  const topicData = { ...data.topics[topicName], name: topicName };
  delete data.topics[topicName];
  saveDiscoveredTopics(data);
  return topicData;
}

// Public API: the classification entry point, the discovered-topic accessors,
// and the lower-level extractKeyTerms / matchTopics helpers.
module.exports = {
  classifyAndSuggestTopics,
  getDiscoveredTopics,
  promoteDiscoveredTopic,
  extractKeyTerms,
  matchTopics,
  // Export config for testing/tuning.
  // Both objects are exported by reference, so mutating them from outside
  // changes the behavior of the functions above.
  CONFIG,
  TOPIC_PATTERNS,
};