2024-03-22 16:57:09 +08:00
|
|
|
/**
 * Parser configuration sent as the `parser_config` member of a
 * change-parser request.
 *
 * Every member is optional. Field names are snake_case because they are part
 * of the request payload and must not be renamed.
 */
export interface IChangeParserConfigRequestBody {
  /** List of number arrays. NOTE(review): presumably page ranges — confirm with the API. */
  pages?: number[][];
  chunk_token_num?: number;
  layout_recognize?: string;
  task_page_size?: number;
  delimiter?: string;
  /** Numeric setting (not a boolean flag). */
  auto_keywords?: number;
  /** Numeric setting (not a boolean flag). */
  auto_questions?: number;
  html4excel?: boolean;
  toc_extraction?: boolean;

  /*
   * Image/table context settings.
   *
   * Background, taken from the change notes of "Refa: improve image table
   * context (#12244)" (co-authored by Kevin Hu). The strategy below is
   * described there for `attach_media_context`, which is not part of this
   * file — verify against that implementation before relying on it:
   *
   * - Order by position when possible: if any chunk has page/position info,
   *   sort by (page, top, left); otherwise keep the original order.
   * - Apply only to media chunks: images use `image_context_size`, tables use
   *   `table_context_size`.
   * - Primary matching: on the same page, choose a text chunk whose vertical
   *   span overlaps the media, then pick the one with the closest vertical
   *   midpoint.
   * - Fallback matching: if nothing overlaps on that page, choose the nearest
   *   text chunk on the same page (page-head uses the next text; page-tail
   *   uses the previous text).
   * - Context extraction: inside the chosen text chunk, find a mid-sentence
   *   boundary near the text midpoint, then take `context_size` tokens split
   *   before/after (total budget).
   * - No multi-chunk stitching: context comes from a single text chunk to
   *   avoid mixing unrelated segments.
   */
  image_table_context_window?: number;
  /** Context size applied to image chunks (see the notes above). */
  image_context_size?: number;
  /** Context size applied to table chunks (see the notes above). */
  table_context_size?: number;

  /** Optional nested RAPTOR settings; every member is optional as well. */
  raptor?: {
    use_raptor?: boolean;
    prompt?: string;
    max_token?: number;
    threshold?: number;
    max_cluster?: number;
    random_seed?: number;
    scope?: string;
    /** Restricted to the literals `'gmm'` and `'ahc'`. */
    clustering_method?: 'gmm' | 'ahc';
    /** Restricted to the literals `'raptor'` and `'psi'`. */
    tree_builder?: 'raptor' | 'psi';
  };

  // Metadata fields
  /** Metadata field descriptors: a key, a description and an optional list of allowed string values. */
  metadata?: Array<{
    key?: string;
    description?: string;
    enum?: string[];
  }>;
  /** Same element shape as `metadata`. */
  built_in_metadata?: Array<{
    key?: string;
    description?: string;
    enum?: string[];
  }>;
  enable_metadata?: boolean;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Request body for changing a parser.
 *
 * `parser_id` and `parser_config` are required; `pipeline_id` and `doc_id`
 * are optional. Field names are snake_case because they are part of the
 * request payload.
 */
export interface IChangeParserRequestBody {
  parser_id: string;
  pipeline_id?: string;
  doc_id?: string;
  /** Parser configuration; all of its members are optional. */
  parser_config: IChangeParserConfigRequestBody;
}
|
2025-01-13 17:13:37 +08:00
|
|
|
|
|
|
|
|
/**
 * Request body carrying a document's meta information.
 *
 * Both members are required.
 */
export interface IDocumentMetaRequestBody {
  documentId: string;
  /** JSON-formatted string (serialized, not an object). */
  meta: string; // json format string
}
|