/** * PDF-Text-Extractor - Extract text from PDFs with OCR support * Vernox v1.0 - Autonomous Revenue Agent */ const fs = require('fs'); const path = require('path'); // Load configuration const configPath = path.join(__dirname, 'config.json'); const config = JSON.parse(fs.readFileSync(configPath, 'utf8')); // PDF.js will be loaded dynamically let pdfjs = null; /** * Extract text from a single PDF file */ function extractText(params) { const { pdfPath, options = {} } = params; if (!pdfPath) { throw new Error('pdfPath is required'); } // Lazy load PDF.js (only when needed) if (!pdfjs) { try { pdfjs = require('pdfjs-dist'); } catch (e) { throw new Error('PDF.js not available. Install with: npm install pdfjs-dist'); } } return new Promise((resolve, reject) => { const fileData = fs.readFileSync(pdfPath); const loadingTask = pdfjs.getDocument(fileData); loadingTask.promise.then((pdf) => { const pages = pdf.numPages; let fullText = ''; let pageCount = 0; const processPage = (pageNum) => { return pdf.getPage(pageNum).then((page) => { return page.getTextContent(); }).then((textContent) => { const text = textContent.items.map(item => item.str).join(' '); fullText += text + '\n\n'; pageCount++; if (pageCount === pages) { // All pages processed const wordCount = countWords(fullText); const charCount = fullText.length; const detectedLang = detectLanguage(fullText); const method = options.ocr ? 'ocr' : 'text'; resolve({ text: fullText, pages: pages, wordCount: wordCount, charCount: charCount, language: detectedLang, method: method, metadata: { title: pdf.info?.Title || '', author: pdf.info?.Author || '', creationDate: pdf.info?.CreationDate || '', creator: pdf.info?.Creator || '' } }); } }); }; // Process all pages for (let i = 1; i <= pages; i++) { processPage(i); } }).catch((error) => { reject({ error: `PDF parsing failed: ${error.message}`, suggestion: 'Check if file is a valid PDF' }); }); }); } /** * Extract text from multiple PDF files at once */ function extractBatch(params) { const { pdfFiles, options = {} } = params; if (!pdfFiles || !Array.isArray(pdfFiles)) { throw new Error('pdfFiles must be an array of file paths'); } const results = []; const errors = []; let successCount = 0; let failureCount = 0; let totalPages = 0; const processOne = (pdfPath) => { return extractText({ pdfPath, options }) .then((result) => { results.push(result); successCount++; totalPages += result.pages; }) .catch((error) => { errors.push({ file: pdfPath, error: error.message || error }); failureCount++; }); }; // Process files in batches (configurable concurrency) const batchSize = config.batch?.maxConcurrent || 3; const batches = []; for (let i = 0; i < pdfFiles.length; i += batchSize) { batches.push(pdfFiles.slice(i, i + batchSize)); } return batches.reduce((chain, batch) => { return chain.then(() => Promise.all(batch.map(processOne))); }, Promise.resolve()) .then(() => { return { results, totalPages, successCount, failureCount, errors }; }); } /** * Count words in text */ function countWords(params) { const { text, options = {} } = params; const { minWordLength = 3, excludeNumbers = false, countByPage = false } = options; // Split into words const pages = text.split(/\n\n/); // Assume double newline is page break let totalWords = 0; const pageCounts = []; pages.forEach((page, index) => { // Remove extra whitespace, split by spaces const words = page.trim() .replace(/\s+/g, ' ') .split(' ') .filter(word => { if (excludeNumbers) { // Check if word is mostly numbers const numericChars = word.replace(/[^0-9]/g, '').length; return word.length - numericChars >= minWordLength; } return word.length >= minWordLength; }); const pageCount = words.length; pageCounts.push(pageCount); totalWords += pageCount; }); if (countByPage) { return { wordCount: totalWords, charCount: text.length, pageCounts: pageCounts, averageWordsPerPage: totalWords / pageCounts.length }; } return { wordCount: totalWords, charCount: text.length }; } /** * Detect language of text (simple heuristic) */ function detectLanguage(text) { if (!text || text.length < 50) { return { language: 'unknown', languageName: 'Unknown', confidence: 0 }; } // Simple frequency analysis for common languages const langPatterns = { 'English': /\b(the|and|is|of|to|in)\b/i, 'Spanish': /\b(el|la|los|las|en|un|una|una|os|que|de|del|al|con)\b/i, 'French': /\b(le|la|les|des|de|du|un|une|que|et|en)\b/i, 'German': /\b(der|die|das|dem|den|ein|eine|einem|und|ich|hat|was|ist)\b/i }; let detectedLang = 'unknown'; let maxScore = 0; for (const [lang, pattern] of Object.entries(langPatterns)) { const matches = (text.match(pattern) || []).length; if (matches > maxScore) { maxScore = matches; detectedLang = lang; } } const confidence = Math.min(100, Math.round((maxScore / 100) * 100)); const langNames = { 'English': 'English', 'Spanish': 'Spanish', 'French': 'French', 'German': 'German' }; return { language: detectedLang, languageName: langNames[detectedLang] || 'Unknown', confidence: confidence }; } /** * Main function - handles tool invocations */ function main(action, params) { switch (action) { case 'extractText': return extractText(params); case 'extractBatch': return extractBatch(params); case 'countWords': return countWords(params); case 'detectLanguage': return detectLanguage(params.text); default: throw new Error(`Unknown action: ${action}`); } } // CLI interface if (require.main === module) { const args = process.argv.slice(2); const action = args[0]; try { const params = JSON.parse(args[1] || '{}'); const result = main(action, params); console.log(JSON.stringify(result, null, 2)); } catch (error) { console.error(JSON.stringify({ error: error.message || error, suggestion: 'Check your parameters and try again' }, null, 2)); process.exit(1); } } module.exports = { main, extractText, extractBatch, countWords, detectLanguage };