Initial commit with translated description
This commit is contained in:
277
index.js
Normal file
277
index.js
Normal file
@@ -0,0 +1,277 @@
|
||||
/**
 * PDF-Text-Extractor - Extract text from PDFs with OCR support
 * Vernox v1.0 - Autonomous Revenue Agent
 */
|
||||
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
|
||||
// Load configuration.
// config.json is resolved relative to this module's directory and read
// synchronously, once, when the module is first loaded.
const configPath = path.join(__dirname, 'config.json');
const rawConfig = fs.readFileSync(configPath, 'utf8');
const config = JSON.parse(rawConfig);

// Holds the pdfjs-dist module after extractText() has required it for the
// first time; stays null until then so the dependency is only loaded on demand.
let pdfjs = null;
|
||||
|
||||
/**
 * Extract text from a single PDF file.
 *
 * The file is read synchronously, parsed with PDF.js, and the text items of
 * every page are joined with single spaces. Pages are concatenated in page
 * order, each followed by a blank line ("\n\n").
 *
 * @param {Object} params
 * @param {string} params.pdfPath - Path of the PDF file to read (required).
 * @param {Object} [params.options]
 * @param {boolean} [params.options.ocr] - Only selects the `method` label in
 *   the result ('ocr' vs 'text'); this function does not run OCR itself.
 * @returns {Promise<Object>} Resolves to
 *   `{ text, pages, wordCount, charCount, language, method, metadata }`, where
 *   `wordCount` is the numeric count from countWords(), `language` is the
 *   object returned by detectLanguage(), and `metadata` holds
 *   `title`/`author`/`creationDate`/`creator` (empty string when absent).
 * @throws {Error} Synchronously, when `pdfPath` is missing or pdfjs-dist
 *   cannot be required.
 *
 * Rejections: a file-system error from reading `pdfPath` is passed through
 * unchanged; any PDF.js failure rejects with an Error whose `message` (and,
 * for older callers, `error` property) is "PDF parsing failed: ..." and which
 * carries a `suggestion` property.
 */
function extractText(params) {
  const { pdfPath, options = {} } = params;

  if (!pdfPath) {
    throw new Error('pdfPath is required');
  }

  // Lazy load PDF.js (only when needed)
  if (!pdfjs) {
    try {
      pdfjs = require('pdfjs-dist');
    } catch (e) {
      throw new Error('PDF.js not available. Install with: npm install pdfjs-dist');
    }
  }

  return (async () => {
    const fileData = fs.readFileSync(pdfPath);

    let loadingTask = null;
    let pageTexts;
    let info;

    try {
      // PDF.js expects a plain Uint8Array; recent versions reject Node Buffers.
      // Copying also keeps PDF.js from taking ownership of the Buffer's memory.
      loadingTask = pdfjs.getDocument({ data: new Uint8Array(fileData) });
      const pdf = await loadingTask.promise;

      // Pages are fetched concurrently, but Promise.all keeps the results in
      // page order regardless of which page finishes first.
      const pageNumbers = Array.from({ length: pdf.numPages }, (_, i) => i + 1);
      pageTexts = await Promise.all(
        pageNumbers.map(async (pageNum) => {
          const page = await pdf.getPage(pageNum);
          const textContent = await page.getTextContent();
          return textContent.items.map((item) => item.str).join(' ');
        })
      );

      // Document info lives behind getMetadata(). It is best-effort: a
      // metadata failure should not discard successfully extracted text.
      const meta = await pdf.getMetadata().catch(() => null);
      info = meta?.info ?? {};
    } catch (error) {
      const message = `PDF parsing failed: ${error?.message ?? error}`;
      // A real Error (so `.message` and the stack work) that still exposes the
      // `error`/`suggestion` fields of the previous plain-object rejection.
      throw Object.assign(new Error(message), {
        error: message,
        suggestion: 'Check if file is a valid PDF'
      });
    } finally {
      if (loadingTask) {
        // Release PDF.js resources; a cleanup failure must not mask the outcome.
        await Promise.resolve(loadingTask.destroy()).catch(() => {});
      }
    }

    const fullText = pageTexts.map((pageText) => `${pageText}\n\n`).join('');
    const { wordCount } = countWords({ text: fullText });

    return {
      text: fullText,
      pages: pageTexts.length,
      wordCount,
      charCount: fullText.length,
      language: detectLanguage(fullText),
      method: options.ocr ? 'ocr' : 'text',
      metadata: {
        title: info.Title || '',
        author: info.Author || '',
        creationDate: info.CreationDate || '',
        creator: info.Creator || ''
      }
    };
  })();
}
|
||||
|
||||
/**
 * Extract text from multiple PDF files at once.
 *
 * Files are processed in consecutive groups of `config.batch.maxConcurrent`
 * (default 3); the files of one group run concurrently and the next group
 * starts only after the previous one has finished. A failure of one file
 * never aborts the others.
 *
 * @param {Object} params
 * @param {string[]} params.pdfFiles - Paths of the PDF files to process.
 * @param {Object} [params.options] - Passed unchanged to extractText().
 * @returns {Promise<Object>} Resolves to
 *   `{ results, totalPages, successCount, failureCount, errors }`.
 *   `results` holds one extractText() result per successful file, in input
 *   order, each tagged with a `file` property; `errors` holds
 *   `{ file, error }` entries in input order.
 * @throws {Error} Synchronously, when `pdfFiles` is not an array.
 */
function extractBatch(params) {
  const { pdfFiles, options = {} } = params;

  if (!pdfFiles || !Array.isArray(pdfFiles)) {
    throw new Error('pdfFiles must be an array of file paths');
  }

  // Only a positive integer is a usable group size; anything else (missing,
  // zero, negative, non-numeric) would break the slicing loop below.
  const configured = config.batch?.maxConcurrent;
  const batchSize = Number.isInteger(configured) && configured > 0 ? configured : 3;

  // extractText() may fail with an Error, with a `{ error, suggestion }`
  // object, or with a bare value; surface the most readable text available.
  const describeError = (error) => error?.message || error?.error || error;

  // Starting from Promise.resolve() turns a synchronous throw from
  // extractText() into a per-file failure instead of aborting the batch.
  const processOne = (pdfPath) =>
    Promise.resolve()
      .then(() => extractText({ pdfPath, options }))
      .then(
        (result) => ({ ok: true, file: pdfPath, result }),
        (error) => ({ ok: false, file: pdfPath, error: describeError(error) })
      );

  return (async () => {
    const results = [];
    const errors = [];
    let totalPages = 0;

    for (let i = 0; i < pdfFiles.length; i += batchSize) {
      const group = pdfFiles.slice(i, i + batchSize);
      const outcomes = await Promise.all(group.map(processOne));

      // Outcomes arrive in input order, so the output does not depend on
      // which file happened to finish first.
      for (const outcome of outcomes) {
        if (outcome.ok) {
          results.push({ file: outcome.file, ...outcome.result });
          totalPages += outcome.result.pages;
        } else {
          errors.push({ file: outcome.file, error: outcome.error });
        }
      }
    }

    return {
      results,
      totalPages,
      successCount: results.length,
      failureCount: errors.length,
      errors
    };
  })();
}
|
||||
|
||||
/**
 * Count words in text.
 *
 * The text is split into "pages" on double newlines; within a page, words are
 * whitespace-separated tokens that pass the length filter.
 *
 * @param {Object|string} params - Either `{ text, options }` or, as a
 *   shorthand, the text itself.
 * @param {string} params.text - Text to analyse.
 * @param {Object} [params.options]
 * @param {number} [params.options.minWordLength=3] - Minimum length for a
 *   token to be counted as a word.
 * @param {boolean} [params.options.excludeNumbers=false] - When true, digits
 *   do not count towards a token's length, so mostly-numeric tokens drop out.
 * @param {boolean} [params.options.countByPage=false] - When true, the result
 *   also contains `pageCounts` and `averageWordsPerPage`.
 * @returns {Object} `{ wordCount, charCount }`, plus
 *   `{ pageCounts, averageWordsPerPage }` when `countByPage` is set.
 * @throws {TypeError} When no string `text` is supplied.
 */
function countWords(params) {
  // Accept a bare string as well as the documented object form.
  const input = typeof params === 'string' ? { text: params } : params;

  if (input == null || typeof input.text !== 'string') {
    throw new TypeError('countWords requires a string "text"');
  }

  const { text } = input;
  // `?? {}` (rather than a destructuring default) also tolerates options: null.
  const {
    minWordLength = 3,
    excludeNumbers = false,
    countByPage = false
  } = input.options ?? {};

  const isCountedWord = (word) => {
    if (!excludeNumbers) {
      return word.length >= minWordLength;
    }
    // Only the non-digit characters count towards the minimum length.
    const digitCount = word.replace(/[^0-9]/g, '').length;
    return word.length - digitCount >= minWordLength;
  };

  // Assume double newline is page break
  const pageCounts = text.split(/\n\n/).map((page) =>
    page
      .trim()
      .replace(/\s+/g, ' ')
      .split(' ')
      .filter(isCountedWord)
      .length
  );
  const totalWords = pageCounts.reduce((sum, count) => sum + count, 0);

  const summary = {
    wordCount: totalWords,
    charCount: text.length
  };

  if (countByPage) {
    return {
      ...summary,
      pageCounts,
      averageWordsPerPage: totalWords / pageCounts.length
    };
  }

  return summary;
}
|
||||
|
||||
/**
 * Detect language of text (simple heuristic).
 *
 * Counts occurrences of a few very common words per language and picks the
 * language with the most hits; ties go to the language listed first.
 *
 * @param {string} text - Text to classify.
 * @returns {{language: string, languageName: string, confidence: number}}
 *   `language` is 'English', 'Spanish', 'French', 'German' or 'unknown'.
 *   `confidence` is the winning hit count capped at 100 (a raw score, not a
 *   probability). Inputs that are not strings, are shorter than 50 characters,
 *   or contain no listed word yield 'unknown' with confidence 0.
 */
function detectLanguage(text) {
  if (typeof text !== 'string' || text.length < 50) {
    return { language: 'unknown', languageName: 'Unknown', confidence: 0 };
  }

  // Simple frequency analysis for common languages.
  // The `g` flag is essential: without it String.prototype.match returns
  // [match, captureGroup] and every language with any hit would score 2.
  const langPatterns = {
    'English': /\b(the|and|is|of|to|in)\b/gi,
    'Spanish': /\b(el|la|los|las|en|un|una|os|que|de|del|al|con)\b/gi,
    'French': /\b(le|la|les|des|de|du|un|une|que|et|en)\b/gi,
    'German': /\b(der|die|das|dem|den|ein|eine|einem|und|ich|hat|was|ist)\b/gi
  };

  let detectedLang = 'unknown';
  let maxScore = 0;

  for (const [lang, pattern] of Object.entries(langPatterns)) {
    const matches = (text.match(pattern) || []).length;
    if (matches > maxScore) {
      maxScore = matches;
      detectedLang = lang;
    }
  }

  const langNames = {
    'English': 'English',
    'Spanish': 'Spanish',
    'French': 'French',
    'German': 'German'
  };

  return {
    language: detectedLang,
    languageName: langNames[detectedLang] || 'Unknown',
    confidence: Math.min(100, maxScore)
  };
}
|
||||
|
||||
/**
 * Main function - handles tool invocations.
 *
 * Routes `action` to the matching handler and returns whatever that handler
 * returns: 'extractText', 'extractBatch' and 'countWords' receive `params`
 * as-is, while 'detectLanguage' receives `params.text`.
 *
 * @param {string} action - Name of the operation to run.
 * @param {Object} params - Parameters for the chosen operation.
 * @returns {*} The handler's return value.
 * @throws {Error} `Unknown action: <action>` when no handler matches.
 */
function main(action, params) {
  // Handlers are thunks so nothing is evaluated until an action is selected.
  const handlers = new Map([
    ['extractText', () => extractText(params)],
    ['extractBatch', () => extractBatch(params)],
    ['countWords', () => countWords(params)],
    ['detectLanguage', () => detectLanguage(params.text)]
  ]);

  const run = handlers.get(action);
  if (run === undefined) {
    throw new Error(`Unknown action: ${action}`);
  }

  return run();
}
|
||||
|
||||
// CLI interface: node index.js <action> '<JSON params>'
// Prints the action's result as pretty-printed JSON on stdout. On any failure
// it prints a JSON error object on stderr and exits with status 1.
if (require.main === module) {
  const args = process.argv.slice(2);
  const action = args[0];

  const reportFailure = (error) => {
    console.error(JSON.stringify({
      // Failures may be Errors, `{ error, suggestion }` objects or bare values.
      error: error?.message || error?.error || error,
      suggestion: 'Check your parameters and try again'
    }, null, 2));
    process.exit(1);
  };

  // main() returns a Promise for the extract* actions and a plain value for
  // the others. Running it inside a promise chain waits for the former (so
  // the real result is printed rather than a serialized Promise) and routes
  // both synchronous throws and rejections to the same failure handler.
  Promise.resolve()
    .then(() => main(action, JSON.parse(args[1] || '{}')))
    .then((result) => {
      console.log(JSON.stringify(result, null, 2));
    })
    .catch(reportFailure);
}

module.exports = { main, extractText, extractBatch, countWords, detectLanguage };
|
||||
Reference in New Issue
Block a user