From 8723c3aa86476461a8821ba53e971cf9ffa4adaf Mon Sep 17 00:00:00 2001 From: Magicbook1108 Date: Tue, 14 Apr 2026 10:00:55 +0800 Subject: [PATCH] Feat: more templates (#14075) ### What problem does this PR solve? Feat: more templates image ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com> --- agent/templates/ingestion_pipeline_Book.json | 572 +++++++++++++++++ .../templates/ingestion_pipeline_General.json | 446 +++++++++++++ agent/templates/ingestion_pipeline_Laws.json | 600 ++++++++++++++++++ .../templates/ingestion_pipeline_Manual.json | 552 ++++++++++++++++ agent/templates/ingestion_pipeline_One.json | 443 +++++++++++++ agent/templates/ingestion_pipeline_Paper.json | 555 ++++++++++++++++ 6 files changed, 3168 insertions(+) create mode 100644 agent/templates/ingestion_pipeline_Book.json create mode 100644 agent/templates/ingestion_pipeline_General.json create mode 100644 agent/templates/ingestion_pipeline_Laws.json create mode 100644 agent/templates/ingestion_pipeline_Manual.json create mode 100644 agent/templates/ingestion_pipeline_One.json create mode 100644 agent/templates/ingestion_pipeline_Paper.json diff --git a/agent/templates/ingestion_pipeline_Book.json b/agent/templates/ingestion_pipeline_Book.json new file mode 100644 index 0000000000..6136bacc74 --- /dev/null +++ b/agent/templates/ingestion_pipeline_Book.json @@ -0,0 +1,572 @@ +{ + "id": 29, + "title": { + "en": "Book", + "de": "Buch", + "zh": "书籍" + }, + "description": { + "en": "This template segments parsed files based on the structure of a book. It is suitable for documents with clearly defined chapters and sections, such as books, long-form manuscripts, literary works, and other chapter-based texts.", + "de": "Diese Vorlage segmentiert die geparste Datei anhand der Struktur eines Buches. Sie eignet sich für Dokumente mit klar definierten Kapiteln und Abschnitten, wie Bücher, längere Manuskripte, literarische Werke und andere kapitelbasierte Texte.", + "zh": "此模板将解析后的文件按书籍结构进行切片,适用于具有清晰章节层级的文档类型,如书籍、长篇手稿、文学作品及其他按章节组织的文本。" + }, + "canvas_type": "Ingestion Pipeline", + "canvas_category": "dataflow_canvas", + "dsl": { + "components": { + "File": { + "downstream": [ + "Parser:HipSignsRhyme" + ], + "obj": { + "component_name": "File", + "params": {} + }, + "upstream": [] + }, + "Parser:HipSignsRhyme": { + "downstream": [ + "TitleChunker:RealEggsHope" + ], + "obj": { + "component_name": "Parser", + "params": { + "outputs": { + "html": { + "type": "string", + "value": "" + }, + "json": { + "type": "Array", + "value": [] + }, + "markdown": { + "type": "string", + "value": "" + }, + "text": { + "type": "string", + "value": "" + } + }, + "setups": { + "doc": { + "output_format": "json", + "preprocess": [ + "main_content" + ], + "suffix": [ + "doc" + ] + }, + "docx": { + "output_format": "json", + "preprocess": [ + "main_content" + ], + "suffix": [ + "docx" + ], + "vlm": {} + }, + "email": { + "fields": [ + "from", + "to", + "cc", + "bcc", + "date", + "subject", + "body", + "attachments" + ], + "output_format": "text", + "preprocess": [ + "main_content" + ], + "suffix": [ + "eml", + "msg" + ] + }, + "html": { + "output_format": "json", + "preprocess": [ + "main_content" + ], + "suffix": [ + "htm", + "html" + ] + }, + "image": { + "output_format": "text", + "parse_method": "ocr", + "preprocess": [ + "main_content" + ], + "suffix": [ + "jpg", + "jpeg", + "png", + "gif" + ] + }, + "markdown": { + "output_format": "json", + "preprocess": [ + "main_content" + ], + "suffix": [ + "md", + "markdown", + "mdx" + ], + "vlm": {} + }, + "pdf": { + "output_format": "json", + "parse_method": "DeepDOC", + "preprocess": [ + "main_content" + ], + "remove_toc": true, + "suffix": [ + "pdf" + ], + "vlm": {} + }, + "text&code": { + "output_format": "json", + "preprocess": [ + "main_content" + ], + "suffix": [ + "txt", + "py", + "js", + "java", + "c", + "cpp", + "h", + "php", + "go", + "ts", + "sh", + "cs", + "kt", + "sql" + ] + } + } + } + }, + "upstream": [ + "File" + ] + }, + "TitleChunker:RealEggsHope": { + "downstream": [ + "Tokenizer:FunkyBucketsReport" + ], + "obj": { + "component_name": "TitleChunker", + "params": { + "hierarchy": 5, + "include_heading_content": true, + "levels": [ + [ + "^#[^#]", + "^##[^#]", + "^###[^#]", + "^####[^#]" + ], + [ + "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+(\u5206?\u7f16|\u90e8\u5206)", + "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u7ae0", + "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u8282", + "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u6761", + "[\\(\uff08][\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e]+[\\)\uff09]" + ], + [ + "\u7b2c[0-9]+\u7ae0", + "\u7b2c[0-9]+\u8282", + "[0-9]{1,2}[\\. \u3001]", + "[0-9]{1,2}\\.[0-9]{1,2}($|[^a-zA-Z/%~.-])", + "[0-9]{1,2}\\.[0-9]{1,2}\\.[0-9]{1,2}" + ], + [ + "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u7ae0", + "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u8282", + "[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e]+[ \u3001]", + "[\\(\uff08][\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e]+[\\)\uff09]", + "[\\(\uff08][0-9]{,2}[\\)\uff09]" + ], + [ + "PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)", + "Chapter (I+V?|VI*|XI|IX|X)", + "Section [0-9]+", + "Article [0-9]+" + ] + ], + "method": "hierarchy" + } + }, + "upstream": [ + "Parser:HipSignsRhyme" + ] + }, + "Tokenizer:FunkyBucketsReport": { + "downstream": [], + "obj": { + "component_name": "Tokenizer", + "params": { + "fields": "text", + "filename_embd_weight": 0.1, + "outputs": {}, + "search_method": [ + "embedding", + "full_text" + ] + } + }, + "upstream": [ + "TitleChunker:RealEggsHope" + ] + } + }, + "globals": { + "sys.conversation_turns": 0, + "sys.date": "", + "sys.files": [], + "sys.history": [], + "sys.query": "", + "sys.user_id": "" + }, + "graph": { + "edges": [ + { + "data": { + "isHovered": false + }, + "id": "xy-edge__Filestart-Parser:HipSignsRhymeend", + "source": "File", + "sourceHandle": "start", + "target": "Parser:HipSignsRhyme", + "targetHandle": "end" + }, + { + "data": { + "isHovered": false + }, + "id": "xy-edge__Parser:HipSignsRhymestart-TitleChunker:RealEggsHopeend", + "source": "Parser:HipSignsRhyme", + "sourceHandle": "start", + "target": "TitleChunker:RealEggsHope", + "targetHandle": "end" + }, + { + "data": { + "isHovered": false + }, + "id": "xy-edge__TitleChunker:RealEggsHopestart-Tokenizer:FunkyBucketsReportend", + "source": "TitleChunker:RealEggsHope", + "sourceHandle": "start", + "target": "Tokenizer:FunkyBucketsReport", + "targetHandle": "end" + } + ], + "nodes": [ + { + "data": { + "label": "File", + "name": "File" + }, + "id": "File", + "measured": { + "height": 49, + "width": 200 + }, + "position": { + "x": 50, + "y": 200 + }, + "sourcePosition": "left", + "targetPosition": "right", + "type": "beginNode" + }, + { + "data": { + "form": { + "outputs": { + "html": { + "type": "string", + "value": "" + }, + "json": { + "type": "Array", + "value": [] + }, + "markdown": { + "type": "string", + "value": "" + }, + "text": { + "type": "string", + "value": "" + } + }, + "setups": [ + { + "fileFormat": "pdf", + "output_format": "json", + "parse_method": "DeepDOC", + "preprocess": [ + "main_content" + ], + "remove_toc": true, + "vlm": {} + }, + { + "fileFormat": "image", + "output_format": "text", + "parse_method": "ocr", + "preprocess": [ + "main_content" + ] + }, + { + "fields": [ + "from", + "to", + "cc", + "bcc", + "date", + "subject", + "body", + "attachments" + ], + "fileFormat": "email", + "output_format": "text", + "preprocess": [ + "main_content" + ] + }, + { + "fileFormat": "markdown", + "output_format": "json", + "preprocess": [ + "main_content" + ], + "remove_toc": true, + "vlm": {} + }, + { + "fileFormat": "text&code", + "output_format": "json", + "preprocess": [ + "main_content" + ] + }, + { + "fileFormat": "html", + "output_format": "json", + "preprocess": [ + "main_content" + ], + "remove_toc": true + }, + { + "fileFormat": "doc", + "output_format": "json", + "preprocess": [ + "main_content" + ] + }, + { + "fileFormat": "docx", + "output_format": "json", + "preprocess": [ + "main_content" + ], + "remove_toc": true, + "vlm": {} + } + ] + }, + "label": "Parser", + "name": "Parser_0" + }, + "dragging": false, + "id": "Parser:HipSignsRhyme", + "measured": { + "height": 57, + "width": 200 + }, + "position": { + "x": 316.99524094206413, + "y": 195.39629819663406 + }, + "selected": true, + "sourcePosition": "right", + "targetPosition": "left", + "type": "parserNode" + }, + { + "data": { + "form": { + "hierarchy": "5", + "include_heading_content": true, + "method": "hierarchy", + "outputs": { + "chunks": { + "type": "Array", + "value": [] + } + }, + "rules": [ + { + "levels": [ + { + "expression": "^#[^#]" + }, + { + "expression": "^##[^#]" + }, + { + "expression": "^###[^#]" + }, + { + "expression": "^####[^#]" + } + ] + }, + { + "levels": [ + { + "expression": "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+(\u5206?\u7f16|\u90e8\u5206)" + }, + { + "expression": "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u7ae0" + }, + { + "expression": "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u8282" + }, + { + "expression": "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u6761" + }, + { + "expression": "[\\(\uff08][\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e]+[\\)\uff09]" + } + ] + }, + { + "levels": [ + { + "expression": "\u7b2c[0-9]+\u7ae0" + }, + { + "expression": "\u7b2c[0-9]+\u8282" + }, + { + "expression": "[0-9]{1,2}[\\. \u3001]" + }, + { + "expression": "[0-9]{1,2}\\.[0-9]{1,2}($|[^a-zA-Z/%~.-])" + }, + { + "expression": "[0-9]{1,2}\\.[0-9]{1,2}\\.[0-9]{1,2}" + } + ] + }, + { + "levels": [ + { + "expression": "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u7ae0" + }, + { + "expression": "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u8282" + }, + { + "expression": "[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e]+[ \u3001]" + }, + { + "expression": "[\\(\uff08][\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e]+[\\)\uff09]" + }, + { + "expression": "[\\(\uff08][0-9]{,2}[\\)\uff09]" + } + ] + }, + { + "levels": [ + { + "expression": "PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)" + }, + { + "expression": "Chapter (I+V?|VI*|XI|IX|X)" + }, + { + "expression": "Section [0-9]+" + }, + { + "expression": "Article [0-9]+" + } + ] + } + ] + }, + "label": "TitleChunker", + "name": "Title Chunker_0" + }, + "id": "TitleChunker:RealEggsHope", + "measured": { + "height": 74, + "width": 200 + }, + "position": { + "x": 616.9952409420641, + "y": 195.39629819663406 + }, + "selected": false, + "sourcePosition": "right", + "targetPosition": "left", + "type": "chunkerNode" + }, + { + "data": { + "form": { + "fields": "text", + "filename_embd_weight": 0.1, + "outputs": {}, + "search_method": [ + "embedding", + "full_text" + ] + }, + "label": "Tokenizer", + "name": "Indexer_0" + }, + "id": "Tokenizer:FunkyBucketsReport", + "measured": { + "height": 114, + "width": 200 + }, + "position": { + "x": 916.9952409420641, + "y": 195.39629819663406 + }, + "selected": false, + "sourcePosition": "right", + "targetPosition": "left", + "type": "tokenizerNode" + } + ] + }, + "history": [], + "path": [], + "retrieval": [], + "variables": [] + }, + "avatar": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAADAAAAAwCAYAAABXAvmHAAAACXBIWXMAABYlAAAWJQFJUiTwAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAABMaSURBVHgBbVprjF3Vdf72Puc+5s7Lnhm/x/ZgCHWAFLdVKEqogJY2ikIJEUSBhiZFIaVRKZj+KFHa1G7apBS1KlGpkh99EEUgykMiKlFpFCUIyS1pSTFCQLADHmNjjz3jec99nXP27rfW3ufOtZOxru/cc8/Zez2+9a3HHoMLfv7skjuuy/Li4wbmZn6c8t7rdWMMnHOw1kKuyWf56f9dfuSe8rO8y/fyjFyXn/7fy+/llSRJeNbyWW5pZQ2+jOzvcbiW1A67wv3FwelHp/vl7e18cOr3NrTQOcCb9/cLVQpcblIUxfrDZl0B2UffRSlVQp5z5ylXGqEnbFSg/935gsIn4bmoPC8i4T+qw//xMJ8URRZ7CqjwvvVDY5J9pcVE0J4l5Tbjoxg+COc8wtdiUW6a2AuEkZco7HrrqTAXeKp8uai0tWKsILzjq5Kk+rvhtcRENbw/nAPXixK6Yst1DvChfaUAsplYqTSzRxGFFeFsUEldbPW65b1BIYuAOLm5vGZU6XK9fkiWgpfXre69rlgqcKMXfbhB1ynkszH7qkgP6DpfnLptiiIc+xlXRryLRZwzUSgfrBE3UGGCsYKgIkwvNoJFXQ+O8nLhnd7xbh3Fcm+aprqn/B7ihEpH+NJMVM6qBxROGhv8nNrr6R97oMR4vwLBwtS4KGXyKoKJiyo+uai8yz9ZmDsHTIoBihA/Vr9yEXrBa/3wRLAvCnlAl0j5lqvysk4JOlnDYj3WqCJcbm5Oucw+9LmzDFoIbHxF9lPLCkxUEB/CIeDZB6bgPy6G7rJHa1UepZUq0BvTmkc3p7I0XrfNbQuiV7DM640hoFI3PYsHAnJBaBNgozKlia7psG7kQgnGfTylCL3ADewQAtCaCq3iVFi57n0ZxxSKgeW4YLaWYHm+wMQu4PIbEuy5po6pq6sY2lxFbUiYhJtXJOC76sW800a3VaC1lGH+WAevfx+YfsngnVcdOh1gYrMoI1okEQnB43mWhz0FlqqZwInXvZ9KSyT6qK08ldBcOYOpYgOG1SCBj8kKFcyeLDC2HfitL6b41Ts3YnCsyuclaMUTEqxpQIawTyYWr/P5nJasoz7kUB8BxnYWeN+vZ7ylzRubOPGywXNfK/Dysx5jVKQ2ZHpxkiZJdEjwgigpwSxBzyD+Hd+fUKzC2vQeEOsrZBKDueMeOy73+My/jGPylwaVZfIOnxNLwcYArfHBNCakAM3CdSKfFxSgUHgqMn0WmAti2Sbh1ETW9njij7p44Z8Mtl7MjQuD3NEDgkFCz0aCSAQRjBfzJ7tu8/38LIuLNW0P38TumlGP7P+Pcey+aphYLoKwIqCuWGbeCoUSxqgqjeqLMIKwmI15hEIrNjTKhXWohFKjC5jPm6gNLmPpdII//UAbI42UMPS9xBYBQcNW4POuGGZd+BC8gS4Fe0nF4MxJh6tur+Dvz+3C9l+s00KF4tBqLhAz2kiGRtlIc7FxCGGWR5bxwUO6vHjIqjVBkoCv8zKvoaoGSNJBenUbGmMpHjlXR23MIWtCvR1ShlcP5HmHy9DQJfbLNG5iGqd3cOJoji+9MIpbH96C9mre5yWjTlcLK3UmgszoCeFNocGcG7XVo/QhBchDHhBgm1TjAvSUsXUuMUAv1PhkVVJeVHgcnZUBHHi1hsYWmqMLJQUJYMn6gXByiYP1pJQkAcc2yXH6bY+HT2zC7g+O0OqZJI31hCWL9FQxIT0pZYQrmhlc4H7hdNqL75n+rt4RZeQVadkrxGp8Mg0wdMryvDSK9nJQIpfMzF0phvyvDAlFZn+1KfydOJx6exUPvTNAOqxrginpUwJTaBa+QAB/zF8avEmkuBjLNigCTUBiUXmGQUxKVY+YbogPUUrjqIjJi+ukkn8SVdRWGXOrA/jq0RSnT9gASGI/JUTSpBo8UP6kaQ0nj5/CAz8YwshWWi4vgj1NSaX0DuPTVEJG1LRvArVJUJbcDR/hHmsmX8a5QFUh5VRgR3ZSr/iuKufFMLoW73Oh9lRMpKM0bB33fzfFwntSQlQDCRRFTHpRifbaKq65NcHe62twmVjcoxfWQia1HN3/ZoJ7ja6uZqqQjym+lFxVtSYKG12EsqI1amWtVK1Rz3ifa5wInuV3Lx6PPtPiORQQvD6GvR9hsvwVq/obk4ZCslf703ULi0v43GMJceeDlWghNZaEaIMlwWND8Ic3onP8MnS+OwE7UESRgyAmVnaa9MI+uk6onxxClZwHWq1KVoSSRcgLwWCI0AtwsiGe4nveGcPvfstg4WQwqhBOWtJnc6GN2/+S6bnDJCG1h9zilkk0tUCRtEz97Xk8Pd3G8OvL+MgHh9Tlap+kiAFMjs9imS0QkDRQkYDshAAVtDDJoc79/nqNXmzAjo7x+Qz1O5cgpKV0K0nLB8MZG+GictYwfkkVu65sY+1cTPbhC2BluYXr/jBRo5sB1i4NWm6IbFOZ0x5AIr+5N8dH3RFcu4MiTS2jLIWz13IUR8gUL5HTc6+ZVZKUm8/R+ocM2ROb0f7HDXAzRulVY0NMVyO77RyDHaoBfQSgHtLE63WdkrYlfopsENfur6C1EAwfPZDgihtaqDQqrCgrWLyXbDTWgFmqo3LTGmrXnIVrT6DxKSaZD+/hQh0kO7h4h0ueoIBP8769m+iwFtIhMsQVThMP2ty8VdUaCBsp1/wqksk8JLm61yDUGHL1YEUtF6R65B0nqMQa8wLvSd/Hmollp2ed5fIBXPnJJTx2l0ddiEcy8dKcwyfvJKF1+fACrfpTJpbLKrDDQzDkYVTP8Muz3HAUlUlCB51o5Rqtx70Z3KgSTiMENStU1yT9DZP5zwSaLLot3joA985GZPMrsFe1lNkMS2tTSbRg1PpLvHaOBnl8mEmaMBwkfCfG0P7PWaQ3riK9lFDM2IvVath2aYHOWgzi1lqGi65OlGmKFaJqwCvmpW5xJ+vIn98KP78RrecXsPild9H+9jA6j04qBTYfYj0yxCTVIqZrxPvRceSPjCP7cYLOE23kSRN+cQEFGc4WTFJHqfFZ0uM4PdIlVbeoXB7YSj43H+Q64/x+iJ9ZelvmjfTGK+AObaMsbHZMoeX2jl9mvHZDjkGNTcXoFqkMCix9jiuPtAiZDvKVReQnV5AdGoQ7shHFu6TQt7jpm6S9I07rFzfIjZjn3fwZZEvzsFwLW1kWMJvaBkOSybForwlHI1+ch1tqo5ghxCq55kLDfZyGboLuczTGFqv5xxF+tVt4f5fB/cpRXk+QvUg4VmU/i0t+zbC8EQ8wG6YSTLSe9Jm+0QrU2WqH9q5FBWbm4Igiu0XqfRfoT8KnWEb3++eo5CotTAaiIvnyIpVaQcJEiPdzHSEFF5KTN02kt88i2dcOI5p2F4WYX5NhB8Xb9KRlXHQpzy8Q95u4z5UrVIJ7Dg+gOEuSyGSa0ozVrQmdh7R1RlxT5YXrFlH83wjSDXGKQLkGPk+B9tAb38tC+8fOyjpudsZg4sU65m+it5hDLC2bvr/A0KcyureKwU/XsPoAE15DWIV7NBh0O0KKwTDTVJUlA+PDJbEIIft4ZvmCcPT/W0XruYwC0rugTG+cRn6Ke5/LVTHJF1IBSAeNTsvp5MGteIwfaGHmY4Mx6TAWWBvVrpqhcTfpgwVd6+n2bG0O1VzqGEKCyhta1Fa5wYzIRwt1aZBkBe3DVHa8huomQmZMatY1LZ+zmSY6nO7U5hhJZ1mHfrTNzy0tVQRCtcmthNgy8qECFRF4fB5VNlF2AzSTv/tKjsqADhbYrGRJgIYMpyQ1DBKXnSxMJ5h4Cr/C9zmt31Uxuth3GaCLzBcp7006yn6av4tQWkhPI6ySWyojGUzY5mzIzKJ0MsHnzzaVJs0EKZuUbHc4rY/ABqbzkzkM3gts+HNS9LWDKN4cw+Bt/J6VsWTs4y8TahVSbMYFOq0EK7MOg6NpUEA5N9dJgF8KDbSrrdGSpL+V4ZBsBIPLRtO8tWSrLgMwJ94W6fRXKSyhVv8AP9cC3+fzC8wRG7D85RFSLK9/aE55X/KFXWshf9dj+L4aZm9hgtxtUB1vYPFuqVAp8KAQAjD72Rzj3yIdM1n+9MccJmyXYk/qbGaEk69IH+C0bExHwzApW1lCZdRg9cuXYvUru6kA+b8IjY2tMXxcNXQGjULHJfnCImG3htW/IX4fMRpi9YtZGHZYPmcM8MU5ZCcW0ZleQTrFpMkmqXN6Bu2ZU8jfY098kUXjrgbzRY5sgcw1wudG2wyNDtpvNVH/zdB9vPF8V9vMnHuS5XIMb/T40TMDSOqxq9oiKZva89UkPeYn1tB9nYsRw55VanduHt1zM+i8tSLRiIGbqsinGYz0QLG8yoXPoZhd1e5q8H4q80YHnRmuwXqrOMWAn2a9VR2Af69AdqqF7hnCrE64tXOMfmEQm57agOoV5HnXQpEwe+9rYvxpzpFuJ4ngFJ56IMGGTVYJJfVW23b86Nk67vrXeQqQYPR+BvJVzMZ7ibGBFNkiLXySw6h6Fdm79FLKYGMKrjJJOTYXw5/eSGWXsPJkUz1XmRrEwC0VrcEqmxrYfsig9b22xk5lNwP3QyPaju48MsEYypU65bNrGe17k51VjH+NuUTgIx2ddGhNBn97Ecff3IFT73SxeZsJY8t7dn/Cy3zl7HsGX32xicnLCRG2lORBrD7J4Fqp090G9RsYuCkZaI3ZsUFGkdGbVJorYT5qBya08gzkRmxnQrdhhqOznKrT6tUwi0pTZmUWKgFsE633VRiUg0YTSEEqPu3g5hW6tdEE909uRqPO/JE77Y1TmdNI1pzYZvHsQzX88dMdjlGUXTH0GVIgs7OXEqAttQvdNsDsm8vMk4tnTvldq8XOLJlEMtxGhBrRxQTldH6jVW457BWl8kJrYR9LZc0vvtRAUjQfKGaVIArOhqrDBZ55YAK+SdarlCP+MJlRHvekukPfSTB7rBYsJZVhu07XyVDT9pr28J1Z3zj0mvqbNi3ClcWMDqpCH1kJgwgbuiw5vNB3W1HYGCvUnMR2knRsyELFe3xmVntmR0Ml9N7xlzfh3/+WcbDBhi6tPLu4e/dv+3I2L8YsWN9849gSOsuxK4oNj5YVYSxRdvKhrkec8dv13tqqpcNMSEoOJ7WHvvrH7OGl4xcd+BZxkFaeIwSDWXqiubwV93CUObmHeMnXzxaw3tAEDJL2sTTfwTNfGUGN5XARRx9hcp2tN8cWsdU0ejpTCu/j8ZPTd6tDrzDxliZmVV9MHrxR3ltctxuE13lmGsUxITdYozmgm+/CfuaFyT0IBaI9/yTIlick2pny95Ex4Nt/1cUP/nmCg9jQcSHCpsQ14kymHK+XjZQJg1VVSC0kQy8bsjNC4RuGYOWYMHpOJ0s2Ch8hadjWrs7vwhcmOti2KxyIlOdz60dS8fzgvPEib9zGbuvv7l7DoSfHGPmpWlmtKzjtwcWFXjXOs1x5nGTCJFvrKHlOsRbGgWH86OKcqZxkmHh0EkoQySXV4XG8yqHBvbta2Dkl4/Vg5DDm9+cdT6nk/SePckPOVL2bDz702Sa++fvDqG8YZeYLxz/heChbh1KRx+MkH87ElP6CYOKR0u4hiONJjonHqAFwMW5yJIMkDTeJB1nrf/OODmFDGOcXHGn1/Zzngf6jVR2J88Ht2y0OkVbv2EoWeG0CtZEG+wYXBDExEJM45NVhfpxOqLfitDlu6uMYMdBuPIlh4EoZUx0awOyJSXzj1gYh08UCx/gbNzmNof7J4XnHX/Ga+fzOjx2jAlMXnpP1/yTc9PSMxIfFJ+6p4Dfu7HByF4a2RbfQDkkSlMjv4sFGSEZGPSOFn01lEhEn1aTOrF3F6TfqePmpKv7rcYf50w6bthht3vvZr1+W/jPr+PmwuXvqpocZTPdpIpEpQZ+25an6uubkjhWDRTb+W5mdJy9OcNm1wJ4rqdxmh6Ex1u41E6bIjqc8rJvaqyzaWJedOcoDErakM0cSHPsf/n7SabW7YQJ6jlYeHQHnH7/2jz4vPKLl50fNH0zeeB3d+sPeGXHfue3Pw14/A8hkrLPKcrwjR0m836WKnCSOyKUMlya8SkapsdlJGUfVmnRiYVyox6fW9M4lfCwngggmkFyk6PKwvPxrgbC/v0i/vWfq5od5NnBfUSYqH2ecfVr/vL9x6FkG0q8k8ShUYJ7G46UQJjaumejELfxuje0dbocgNnF+2r9vaPZ1vBgR0Qf1rz84/fh+BRu3O0gMH05NEvNgSfP+PDf2w6o/qGTzXIdUIQY4CeIJYhZO2TW7h3O03Mv1XGdCPl5DzOQuFnQlZNEL9aKnzPofkrjDdVQPanzKfy8t/qT94Y17/43pm1Mse3UYnOK8aD8/Ftb/aOPCv4HQQ27JpElpufL0PemdQ2sxF1mqnFgrw8bEmJh+9oFO8kIO0Cz99QHbvvPg9BPrf+zR/7N/6uap3JmD9N2VXHKfFnsuTJh7R1F97/0e6o+Tdag5hY4QRJpUNCsnNhyb2pgvUAQFbYn7+Hyyvs40vfYdDoyffXD6iRf69/p/CbMWUUVYM2EAAAAASUVORK5CYII=" +} diff --git a/agent/templates/ingestion_pipeline_General.json b/agent/templates/ingestion_pipeline_General.json new file mode 100644 index 0000000000..940b5d3c07 --- /dev/null +++ b/agent/templates/ingestion_pipeline_General.json @@ -0,0 +1,446 @@ +{ + "id": 33, + "title": { + "en": "General", + "de": "Allgemein", + "zh": "通用" + }, + "description": { + "en": "This template uses a general segmentation strategy and splits parsed files based on token count. It is suitable for a wide range of document types with no fixed structural patterns.", + "de": "Diese Vorlage verwendet eine allgemeine Segmentierungsstrategie und teilt die geparste Datei anhand der Token-Anzahl auf. Sie eignet sich für verschiedenste Dokumenttypen, wenn kein spezifisches Strukturmuster erforderlich ist.", + "zh": "此模板采用通用切分逻辑,按照 token 数量对解析后的文件进行切片,适用于不依赖特定结构模式的各类文档。" + }, + "canvas_type": "Ingestion Pipeline", + "canvas_category": "dataflow_canvas", + "dsl": { + "components": { + "File": { + "downstream": [ + "Parser:HipSignsRhyme" + ], + "obj": { + "component_name": "File", + "params": {} + }, + "upstream": [] + }, + "Parser:HipSignsRhyme": { + "downstream": [ + "TokenChunker:TameMangosKick" + ], + "obj": { + "component_name": "Parser", + "params": { + "outputs": { + "html": { + "type": "string", + "value": "" + }, + "json": { + "type": "Array", + "value": [] + }, + "markdown": { + "type": "string", + "value": "" + }, + "text": { + "type": "string", + "value": "" + } + }, + "setups": { + "doc": { + "output_format": "json", + "preprocess": "main_content", + "suffix": [ + "doc" + ] + }, + "docx": { + "output_format": "json", + "preprocess": "main_content", + "suffix": [ + "docx" + ], + "vlm": {} + }, + "email": { + "fields": [ + "from", + "to", + "cc", + "bcc", + "date", + "subject", + "body", + "attachments" + ], + "output_format": "text", + "preprocess": "main_content", + "suffix": [ + "eml", + "msg" + ] + }, + "html": { + "output_format": "json", + "preprocess": "main_content", + "suffix": [ + "htm", + "html" + ] + }, + "image": { + "output_format": "text", + "parse_method": "ocr", + "preprocess": "main_content", + "suffix": [ + "jpg", + "jpeg", + "png", + "gif" + ], + "system_prompt": "" + }, + "markdown": { + "output_format": "json", + "preprocess": "main_content", + "suffix": [ + "md", + "markdown", + "mdx" + ], + "vlm": {} + }, + "pdf": { + "output_format": "json", + "parse_method": "DeepDOC", + "preprocess": "main_content", + "suffix": [ + "pdf" + ], + "vlm": {} + }, + "slides": { + "output_format": "json", + "parse_method": "DeepDOC", + "preprocess": "main_content", + "suffix": [ + "pptx", + "ppt" + ] + }, + "spreadsheet": { + "output_format": "html", + "parse_method": "DeepDOC", + "preprocess": "main_content", + "suffix": [ + "xls", + "xlsx", + "csv" + ], + "vlm": {} + }, + "text&code": { + "output_format": "json", + "preprocess": "main_content", + "suffix": [ + "txt", + "py", + "js", + "java", + "c", + "cpp", + "h", + "php", + "go", + "ts", + "sh", + "cs", + "kt", + "sql" + ] + } + } + } + }, + "upstream": [ + "File" + ] + }, + "TokenChunker:TameMangosKick": { + "downstream": [ + "Tokenizer:StrongCoatsTalk" + ], + "obj": { + "component_name": "TokenChunker", + "params": { + "children_delimiters": [], + "chunk_token_size": 512, + "delimiter_mode": "token_size", + "delimiters": [], + "image_context_size": 0, + "outputs": { + "chunks": { + "type": "Array", + "value": [] + } + }, + "overlapped_percent": 0, + "table_context_size": 0 + } + }, + "upstream": [ + "Parser:HipSignsRhyme" + ] + }, + "Tokenizer:StrongCoatsTalk": { + "downstream": [], + "obj": { + "component_name": "Tokenizer", + "params": { + "fields": "text", + "filename_embd_weight": 0.1, + "outputs": {}, + "search_method": [ + "embedding", + "full_text" + ] + } + }, + "upstream": [ + "TokenChunker:TameMangosKick" + ] + } + }, + "globals": { + "sys.history": [] + }, + "graph": { + "edges": [ + { + "data": { + "isHovered": false + }, + "id": "xy-edge__Filestart-Parser:HipSignsRhymeend", + "source": "File", + "sourceHandle": "start", + "target": "Parser:HipSignsRhyme", + "targetHandle": "end" + }, + { + "id": "xy-edge__Parser:HipSignsRhymestart-TokenChunker:TameMangosKickend", + "source": "Parser:HipSignsRhyme", + "sourceHandle": "start", + "target": "TokenChunker:TameMangosKick", + "targetHandle": "end" + }, + { + "id": "xy-edge__TokenChunker:TameMangosKickstart-Tokenizer:StrongCoatsTalkend", + "source": "TokenChunker:TameMangosKick", + "sourceHandle": "start", + "target": "Tokenizer:StrongCoatsTalk", + "targetHandle": "end" + } + ], + "nodes": [ + { + "data": { + "label": "File", + "name": "File" + }, + "id": "File", + "measured": { + "height": 49, + "width": 200 + }, + "position": { + "x": 50, + "y": 200 + }, + "sourcePosition": "left", + "targetPosition": "right", + "type": "beginNode" + }, + { + "data": { + "form": { + "outputs": { + "html": { + "type": "string", + "value": "" + }, + "json": { + "type": "Array", + "value": [] + }, + "markdown": { + "type": "string", + "value": "" + }, + "text": { + "type": "string", + "value": "" + } + }, + "setups": [ + { + "fileFormat": "pdf", + "output_format": "json", + "parse_method": "DeepDOC", + "preprocess": "main_content" + }, + { + "fileFormat": "spreadsheet", + "output_format": "html", + "parse_method": "DeepDOC", + "preprocess": "main_content" + }, + { + "fileFormat": "image", + "output_format": "text", + "parse_method": "ocr", + "preprocess": "main_content", + "system_prompt": "" + }, + { + "fields": [ + "from", + "to", + "cc", + "bcc", + "date", + "subject", + "body", + "attachments" + ], + "fileFormat": "email", + "output_format": "text", + "preprocess": "main_content" + }, + { + "fileFormat": "markdown", + "output_format": "json", + "preprocess": "main_content" + }, + { + "fileFormat": "text&code", + "output_format": "json", + "preprocess": "main_content" + }, + { + "fileFormat": "html", + "output_format": "json", + "preprocess": "main_content" + }, + { + "fileFormat": "doc", + "output_format": "json", + "preprocess": "main_content" + }, + { + "fileFormat": "docx", + "output_format": "json", + "preprocess": "main_content" + }, + { + "fileFormat": "slides", + "output_format": "json", + "parse_method": "DeepDOC", + "preprocess": "main_content" + } + ] + }, + "label": "Parser", + "name": "Parser_0" + }, + "dragging": false, + "id": "Parser:HipSignsRhyme", + "measured": { + "height": 57, + "width": 200 + }, + "position": { + "x": 316.99524094206413, + "y": 195.39629819663406 + }, + "selected": false, + "sourcePosition": "right", + "targetPosition": "left", + "type": "parserNode" + }, + { + "data": { + "form": { + "children_delimiters": [], + "chunk_token_size": 512, + "delimiter_mode": "token_size", + "delimiters": [ + { + "value": "\n" + } + ], + "image_table_context_window": 0, + "outputs": { + "chunks": { + "type": "Array", + "value": [] + } + }, + "overlapped_percent": 0 + }, + "label": "TokenChunker", + "name": "Token Chunker_0" + }, + "id": "TokenChunker:TameMangosKick", + "measured": { + "height": 74, + "width": 200 + }, + "position": { + "x": 616.9952409420641, + "y": 195.39629819663406 + }, + "selected": true, + "sourcePosition": "right", + "targetPosition": "left", + "type": "chunkerNode" + }, + { + "data": { + "form": { + "fields": "text", + "filename_embd_weight": 0.1, + "outputs": {}, + "search_method": [ + "embedding", + "full_text" + ] + }, + "label": "Tokenizer", + "name": "Indexer_0" + }, + "id": "Tokenizer:StrongCoatsTalk", + "measured": { + "height": 114, + "width": 200 + }, + "position": { + "x": 916.9952409420641, + "y": 195.39629819663406 + }, + "sourcePosition": "right", + "targetPosition": "left", + "type": "tokenizerNode" + } + ] + }, + "history": [], + "messages": [], + "path": [], + "retrieval": [], + "variables": [] + }, + "avatar": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAADAAAAAwCAYAAABXAvmHAAAACXBIWXMAABYlAAAWJQFJUiTwAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAABMaSURBVHgBbVprjF3Vdf72Puc+5s7Lnhm/x/ZgCHWAFLdVKEqogJY2ikIJEUSBhiZFIaVRKZj+KFHa1G7apBS1KlGpkh99EEUgykMiKlFpFCUIyS1pSTFCQLADHmNjjz3jec99nXP27rfW3ufOtZOxru/cc8/Zez2+9a3HHoMLfv7skjuuy/Li4wbmZn6c8t7rdWMMnHOw1kKuyWf56f9dfuSe8rO8y/fyjFyXn/7fy+/llSRJeNbyWW5pZQ2+jOzvcbiW1A67wv3FwelHp/vl7e18cOr3NrTQOcCb9/cLVQpcblIUxfrDZl0B2UffRSlVQp5z5ylXGqEnbFSg/935gsIn4bmoPC8i4T+qw//xMJ8URRZ7CqjwvvVDY5J9pcVE0J4l5Tbjoxg+COc8wtdiUW6a2AuEkZco7HrrqTAXeKp8uai0tWKsILzjq5Kk+rvhtcRENbw/nAPXixK6Yst1DvChfaUAsplYqTSzRxGFFeFsUEldbPW65b1BIYuAOLm5vGZU6XK9fkiWgpfXre69rlgqcKMXfbhB1ynkszH7qkgP6DpfnLptiiIc+xlXRryLRZwzUSgfrBE3UGGCsYKgIkwvNoJFXQ+O8nLhnd7xbh3Fcm+aprqn/B7ihEpH+NJMVM6qBxROGhv8nNrr6R97oMR4vwLBwtS4KGXyKoKJiyo+uai8yz9ZmDsHTIoBihA/Vr9yEXrBa/3wRLAvCnlAl0j5lqvysk4JOlnDYj3WqCJcbm5Oucw+9LmzDFoIbHxF9lPLCkxUEB/CIeDZB6bgPy6G7rJHa1UepZUq0BvTmkc3p7I0XrfNbQuiV7DM640hoFI3PYsHAnJBaBNgozKlia7psG7kQgnGfTylCL3ADewQAtCaCq3iVFi57n0ZxxSKgeW4YLaWYHm+wMQu4PIbEuy5po6pq6sY2lxFbUiYhJtXJOC76sW800a3VaC1lGH+WAevfx+YfsngnVcdOh1gYrMoI1okEQnB43mWhz0FlqqZwInXvZ9KSyT6qK08ldBcOYOpYgOG1SCBj8kKFcyeLDC2HfitL6b41Ts3YnCsyuclaMUTEqxpQIawTyYWr/P5nJasoz7kUB8BxnYWeN+vZ7ylzRubOPGywXNfK/Dysx5jVKQ2ZHpxkiZJdEjwgigpwSxBzyD+Hd+fUKzC2vQeEOsrZBKDueMeOy73+My/jGPylwaVZfIOnxNLwcYArfHBNCakAM3CdSKfFxSgUHgqMn0WmAti2Sbh1ETW9njij7p44Z8Mtl7MjQuD3NEDgkFCz0aCSAQRjBfzJ7tu8/38LIuLNW0P38TumlGP7P+Pcey+aphYLoKwIqCuWGbeCoUSxqgqjeqLMIKwmI15hEIrNjTKhXWohFKjC5jPm6gNLmPpdII//UAbI42UMPS9xBYBQcNW4POuGGZd+BC8gS4Fe0nF4MxJh6tur+Dvz+3C9l+s00KF4tBqLhAz2kiGRtlIc7FxCGGWR5bxwUO6vHjIqjVBkoCv8zKvoaoGSNJBenUbGmMpHjlXR23MIWtCvR1ShlcP5HmHy9DQJfbLNG5iGqd3cOJoji+9MIpbH96C9mre5yWjTlcLK3UmgszoCeFNocGcG7XVo/QhBchDHhBgm1TjAvSUsXUuMUAv1PhkVVJeVHgcnZUBHHi1hsYWmqMLJQUJYMn6gXByiYP1pJQkAcc2yXH6bY+HT2zC7g+O0OqZJI31hCWL9FQxIT0pZYQrmhlc4H7hdNqL75n+rt4RZeQVadkrxGp8Mg0wdMryvDSK9nJQIpfMzF0phvyvDAlFZn+1KfydOJx6exUPvTNAOqxrginpUwJTaBa+QAB/zF8avEmkuBjLNigCTUBiUXmGQUxKVY+YbogPUUrjqIjJi+ukkn8SVdRWGXOrA/jq0RSnT9gASGI/JUTSpBo8UP6kaQ0nj5/CAz8YwshWWi4vgj1NSaX0DuPTVEJG1LRvArVJUJbcDR/hHmsmX8a5QFUh5VRgR3ZSr/iuKufFMLoW73Oh9lRMpKM0bB33fzfFwntSQlQDCRRFTHpRifbaKq65NcHe62twmVjcoxfWQia1HN3/ZoJ7ja6uZqqQjym+lFxVtSYKG12EsqI1amWtVK1Rz3ifa5wInuV3Lx6PPtPiORQQvD6GvR9hsvwVq/obk4ZCslf703ULi0v43GMJceeDlWghNZaEaIMlwWND8Ic3onP8MnS+OwE7UESRgyAmVnaa9MI+uk6onxxClZwHWq1KVoSSRcgLwWCI0AtwsiGe4nveGcPvfstg4WQwqhBOWtJnc6GN2/+S6bnDJCG1h9zilkk0tUCRtEz97Xk8Pd3G8OvL+MgHh9Tlap+kiAFMjs9imS0QkDRQkYDshAAVtDDJoc79/nqNXmzAjo7x+Qz1O5cgpKV0K0nLB8MZG+GictYwfkkVu65sY+1cTPbhC2BluYXr/jBRo5sB1i4NWm6IbFOZ0x5AIr+5N8dH3RFcu4MiTS2jLIWz13IUR8gUL5HTc6+ZVZKUm8/R+ocM2ROb0f7HDXAzRulVY0NMVyO77RyDHaoBfQSgHtLE63WdkrYlfopsENfur6C1EAwfPZDgihtaqDQqrCgrWLyXbDTWgFmqo3LTGmrXnIVrT6DxKSaZD+/hQh0kO7h4h0ueoIBP8769m+iwFtIhMsQVThMP2ty8VdUaCBsp1/wqksk8JLm61yDUGHL1YEUtF6R65B0nqMQa8wLvSd/Hmollp2ed5fIBXPnJJTx2l0ddiEcy8dKcwyfvJKF1+fACrfpTJpbLKrDDQzDkYVTP8Muz3HAUlUlCB51o5Rqtx70Z3KgSTiMENStU1yT9DZP5zwSaLLot3joA985GZPMrsFe1lNkMS2tTSbRg1PpLvHaOBnl8mEmaMBwkfCfG0P7PWaQ3riK9lFDM2IvVath2aYHOWgzi1lqGi65OlGmKFaJqwCvmpW5xJ+vIn98KP78RrecXsPild9H+9jA6j04qBTYfYj0yxCTVIqZrxPvRceSPjCP7cYLOE23kSRN+cQEFGc4WTFJHqfFZ0uM4PdIlVbeoXB7YSj43H+Q64/x+iJ9ZelvmjfTGK+AObaMsbHZMoeX2jl9mvHZDjkGNTcXoFqkMCix9jiuPtAiZDvKVReQnV5AdGoQ7shHFu6TQt7jpm6S9I07rFzfIjZjn3fwZZEvzsFwLW1kWMJvaBkOSybForwlHI1+ch1tqo5ghxCq55kLDfZyGboLuczTGFqv5xxF+tVt4f5fB/cpRXk+QvUg4VmU/i0t+zbC8EQ8wG6YSTLSe9Jm+0QrU2WqH9q5FBWbm4Igiu0XqfRfoT8KnWEb3++eo5CotTAaiIvnyIpVaQcJEiPdzHSEFF5KTN02kt88i2dcOI5p2F4WYX5NhB8Xb9KRlXHQpzy8Q95u4z5UrVIJ7Dg+gOEuSyGSa0ozVrQmdh7R1RlxT5YXrFlH83wjSDXGKQLkGPk+B9tAb38tC+8fOyjpudsZg4sU65m+it5hDLC2bvr/A0KcyureKwU/XsPoAE15DWIV7NBh0O0KKwTDTVJUlA+PDJbEIIft4ZvmCcPT/W0XruYwC0rugTG+cRn6Ke5/LVTHJF1IBSAeNTsvp5MGteIwfaGHmY4Mx6TAWWBvVrpqhcTfpgwVd6+n2bG0O1VzqGEKCyhta1Fa5wYzIRwt1aZBkBe3DVHa8huomQmZMatY1LZ+zmSY6nO7U5hhJZ1mHfrTNzy0tVQRCtcmthNgy8qECFRF4fB5VNlF2AzSTv/tKjsqADhbYrGRJgIYMpyQ1DBKXnSxMJ5h4Cr/C9zmt31Uxuth3GaCLzBcp7006yn6av4tQWkhPI6ySWyojGUzY5mzIzKJ0MsHnzzaVJs0EKZuUbHc4rY/ABqbzkzkM3gts+HNS9LWDKN4cw+Bt/J6VsWTs4y8TahVSbMYFOq0EK7MOg6NpUEA5N9dJgF8KDbSrrdGSpL+V4ZBsBIPLRtO8tWSrLgMwJ94W6fRXKSyhVv8AP9cC3+fzC8wRG7D85RFSLK9/aE55X/KFXWshf9dj+L4aZm9hgtxtUB1vYPFuqVAp8KAQAjD72Rzj3yIdM1n+9MccJmyXYk/qbGaEk69IH+C0bExHwzApW1lCZdRg9cuXYvUru6kA+b8IjY2tMXxcNXQGjULHJfnCImG3htW/IX4fMRpi9YtZGHZYPmcM8MU5ZCcW0ZleQTrFpMkmqXN6Bu2ZU8jfY098kUXjrgbzRY5sgcw1wudG2wyNDtpvNVH/zdB9vPF8V9vMnHuS5XIMb/T40TMDSOqxq9oiKZva89UkPeYn1tB9nYsRw55VanduHt1zM+i8tSLRiIGbqsinGYz0QLG8yoXPoZhd1e5q8H4q80YHnRmuwXqrOMWAn2a9VR2Af69AdqqF7hnCrE64tXOMfmEQm57agOoV5HnXQpEwe+9rYvxpzpFuJ4ngFJ56IMGGTVYJJfVW23b86Nk67vrXeQqQYPR+BvJVzMZ7ibGBFNkiLXySw6h6Fdm79FLKYGMKrjJJOTYXw5/eSGWXsPJkUz1XmRrEwC0VrcEqmxrYfsig9b22xk5lNwP3QyPaju48MsEYypU65bNrGe17k51VjH+NuUTgIx2ddGhNBn97Ecff3IFT73SxeZsJY8t7dn/Cy3zl7HsGX32xicnLCRG2lORBrD7J4Fqp090G9RsYuCkZaI3ZsUFGkdGbVJorYT5qBya08gzkRmxnQrdhhqOznKrT6tUwi0pTZmUWKgFsE633VRiUg0YTSEEqPu3g5hW6tdEE909uRqPO/JE77Y1TmdNI1pzYZvHsQzX88dMdjlGUXTH0GVIgs7OXEqAttQvdNsDsm8vMk4tnTvldq8XOLJlEMtxGhBrRxQTldH6jVW457BWl8kJrYR9LZc0vvtRAUjQfKGaVIArOhqrDBZ55YAK+SdarlCP+MJlRHvekukPfSTB7rBYsJZVhu07XyVDT9pr28J1Z3zj0mvqbNi3ClcWMDqpCH1kJgwgbuiw5vNB3W1HYGCvUnMR2knRsyELFe3xmVntmR0Ml9N7xlzfh3/+WcbDBhi6tPLu4e/dv+3I2L8YsWN9849gSOsuxK4oNj5YVYSxRdvKhrkec8dv13tqqpcNMSEoOJ7WHvvrH7OGl4xcd+BZxkFaeIwSDWXqiubwV93CUObmHeMnXzxaw3tAEDJL2sTTfwTNfGUGN5XARRx9hcp2tN8cWsdU0ejpTCu/j8ZPTd6tDrzDxliZmVV9MHrxR3ltctxuE13lmGsUxITdYozmgm+/CfuaFyT0IBaI9/yTIlick2pny95Ex4Nt/1cUP/nmCg9jQcSHCpsQ14kymHK+XjZQJg1VVSC0kQy8bsjNC4RuGYOWYMHpOJ0s2Ch8hadjWrs7vwhcmOti2KxyIlOdz60dS8fzgvPEib9zGbuvv7l7DoSfHGPmpWlmtKzjtwcWFXjXOs1x5nGTCJFvrKHlOsRbGgWH86OKcqZxkmHh0EkoQySXV4XG8yqHBvbta2Dkl4/Vg5DDm9+cdT6nk/SePckPOVL2bDz702Sa++fvDqG8YZeYLxz/heChbh1KRx+MkH87ElP6CYOKR0u4hiONJjonHqAFwMW5yJIMkDTeJB1nrf/OODmFDGOcXHGn1/Zzngf6jVR2J88Ht2y0OkVbv2EoWeG0CtZEG+wYXBDExEJM45NVhfpxOqLfitDlu6uMYMdBuPIlh4EoZUx0awOyJSXzj1gYh08UCx/gbNzmNof7J4XnHX/Ga+fzOjx2jAlMXnpP1/yTc9PSMxIfFJ+6p4Dfu7HByF4a2RbfQDkkSlMjv4sFGSEZGPSOFn01lEhEn1aTOrF3F6TfqePmpKv7rcYf50w6bthht3vvZr1+W/jPr+PmwuXvqpocZTPdpIpEpQZ+25an6uubkjhWDRTb+W5mdJy9OcNm1wJ4rqdxmh6Ex1u41E6bIjqc8rJvaqyzaWJedOcoDErakM0cSHPsf/n7SabW7YQJ6jlYeHQHnH7/2jz4vPKLl50fNH0zeeB3d+sPeGXHfue3Pw14/A8hkrLPKcrwjR0m836WKnCSOyKUMlya8SkapsdlJGUfVmnRiYVyox6fW9M4lfCwngggmkFyk6PKwvPxrgbC/v0i/vWfq5od5NnBfUSYqH2ecfVr/vL9x6FkG0q8k8ShUYJ7G46UQJjaumejELfxuje0dbocgNnF+2r9vaPZ1vBgR0Qf1rz84/fh+BRu3O0gMH05NEvNgSfP+PDf2w6o/qGTzXIdUIQY4CeIJYhZO2TW7h3O03Mv1XGdCPl5DzOQuFnQlZNEL9aKnzPofkrjDdVQPanzKfy8t/qT94Y17/43pm1Mse3UYnOK8aD8/Ftb/aOPCv4HQQ27JpElpufL0PemdQ2sxF1mqnFgrw8bEmJh+9oFO8kIO0Cz99QHbvvPg9BPrf+zR/7N/6uap3JmD9N2VXHKfFnsuTJh7R1F97/0e6o+Tdag5hY4QRJpUNCsnNhyb2pgvUAQFbYn7+Hyyvs40vfYdDoyffXD6iRf69/p/CbMWUUVYM2EAAAAASUVORK5CYII=" +} diff --git a/agent/templates/ingestion_pipeline_Laws.json b/agent/templates/ingestion_pipeline_Laws.json new file mode 100644 index 0000000000..b2b86f6f04 --- /dev/null +++ b/agent/templates/ingestion_pipeline_Laws.json @@ -0,0 +1,600 @@ +{ + "id": 28, + "title": { + "en": "Laws", + "de": "Gesetzesbestimmungen", + "zh": "法律条文" + }, + "description": { + "en": "This template segments parsed files based on the structure of legal provisions. It is suitable for documents with clearly defined article hierarchies, such as laws, regulations, judicial interpretations, compliance policies, and other legal texts.", + "de": "Diese Vorlage segmentiert die geparste Datei anhand der Struktur von Gesetzesbestimmungen. Sie eignet sich für Dokumente mit klar definierten Artikel- und Paragraphenhierarchien, wie Gesetze, Verordnungen, richterliche Auslegungen, Compliance-Richtlinien und andere Rechtstexte.", + "zh": "此模板将解析后的文件按法律条文结构进行切片,适用于具有清晰条、款、项层级的文档类型,如法律、法规、司法解释、合规制度及其他法律文本。" + }, + "canvas_type": "Ingestion Pipeline", + "canvas_category": "dataflow_canvas", + "dsl": { + "components": { + "File": { + "downstream": [ + "Parser:HipSignsRhyme" + ], + "obj": { + "component_name": "File", + "params": {} + }, + "upstream": [] + }, + "Parser:HipSignsRhyme": { + "downstream": [ + "TitleChunker:SlimyPapayasAttend" + ], + "obj": { + "component_name": "Parser", + "params": { + "outputs": { + "html": { + "type": "string", + "value": "" + }, + "json": { + "type": "Array", + "value": [] + }, + "markdown": { + "type": "string", + "value": "" + }, + "text": { + "type": "string", + "value": "" + } + }, + "setups": { + "doc": { + "output_format": "json", + "preprocess": [ + "main_content" + ], + "suffix": [ + "doc" + ] + }, + "docx": { + "output_format": "json", + "preprocess": [ + "main_content" + ], + "suffix": [ + "docx" + ], + "vlm": {} + }, + "email": { + "fields": [ + "from", + "to", + "cc", + "bcc", + "date", + "subject", + "body", + "attachments" + ], + "output_format": "text", + "preprocess": [ + "main_content" + ], + "suffix": [ + "eml", + "msg" + ] + }, + "html": { + "output_format": "json", + "preprocess": [ + "main_content" + ], + "suffix": [ + "htm", + "html" + ] + }, + "image": { + "output_format": "text", + "parse_method": "ocr", + "preprocess": [ + "main_content" + ], + "suffix": [ + "jpg", + "jpeg", + "png", + "gif" + ] + }, + "markdown": { + "output_format": "json", + "preprocess": [ + "main_content" + ], + "suffix": [ + "md", + "markdown", + "mdx" + ], + "vlm": {} + }, + "pdf": { + "output_format": "json", + "parse_method": "DeepDOC", + "preprocess": [ + "main_content" + ], + "suffix": [ + "pdf" + ], + "vlm": {} + }, + "slides": { + "output_format": "json", + "parse_method": "DeepDOC", + "preprocess": [ + "main_content" + ], + "suffix": [ + "pptx", + "ppt" + ] + }, + "spreadsheet": { + "output_format": "html", + "parse_method": "DeepDOC", + "preprocess": [ + "main_content" + ], + "suffix": [ + "xls", + "xlsx", + "csv" + ], + "vlm": {} + }, + "text&code": { + "output_format": "json", + "preprocess": [ + "main_content" + ], + "suffix": [ + "txt", + "py", + "js", + "java", + "c", + "cpp", + "h", + "php", + "go", + "ts", + "sh", + "cs", + "kt", + "sql" + ] + } + } + } + }, + "upstream": [ + "File" + ] + }, + "TitleChunker:SlimyPapayasAttend": { + "downstream": [ + "Tokenizer:RipeSchoolsFetch" + ], + "obj": { + "component_name": "TitleChunker", + "params": { + "hierarchy": 2, + "include_heading_content": false, + "levels": [ + [ + "^#[^#]", + "^##[^#]", + "^###[^#]", + "^####[^#]" + ], + [ + "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+(\u5206?\u7f16|\u90e8\u5206)", + "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u7ae0", + "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u8282", + "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u6761", + "[\\(\uff08][\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e]+[\\)\uff09]" + ], + [ + "\u7b2c[0-9]+\u7ae0", + "\u7b2c[0-9]+\u8282", + "[0-9]{1,2}[\\. \u3001]", + "[0-9]{1,2}\\.[0-9]{1,2}($|[^a-zA-Z/%~.-])", + "[0-9]{1,2}\\.[0-9]{1,2}\\.[0-9]{1,2}" + ], + [ + "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u7ae0", + "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u8282", + "[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e]+[ \u3001]", + "[\\(\uff08][\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e]+[\\)\uff09]", + "[\\(\uff08][0-9]{,2}[\\)\uff09]" + ], + [ + "PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)", + "Chapter (I+V?|VI*|XI|IX|X)", + "Section [0-9]+", + "Article [0-9]+" + ] + ], + "method": "hierarchy" + } + }, + "upstream": [ + "Parser:HipSignsRhyme" + ] + }, + "Tokenizer:RipeSchoolsFetch": { + "downstream": [], + "obj": { + "component_name": "Tokenizer", + "params": { + "fields": "text", + "filename_embd_weight": 0.1, + "outputs": {}, + "search_method": [ + "embedding", + "full_text" + ] + } + }, + "upstream": [ + "TitleChunker:SlimyPapayasAttend" + ] + } + }, + "globals": { + "sys.history": [] + }, + "graph": { + "edges": [ + { + "id": "xy-edge__Filestart-Parser:HipSignsRhymeend", + "source": "File", + "sourceHandle": "start", + "target": "Parser:HipSignsRhyme", + "targetHandle": "end" + }, + { + "data": { + "isHovered": false + }, + "id": "xy-edge__Parser:HipSignsRhymestart-TitleChunker:SlimyPapayasAttendend", + "source": "Parser:HipSignsRhyme", + "sourceHandle": "start", + "target": "TitleChunker:SlimyPapayasAttend", + "targetHandle": "end" + }, + { + "data": { + "isHovered": false + }, + "id": "xy-edge__TitleChunker:SlimyPapayasAttendstart-Tokenizer:RipeSchoolsFetchend", + "source": "TitleChunker:SlimyPapayasAttend", + "sourceHandle": "start", + "target": "Tokenizer:RipeSchoolsFetch", + "targetHandle": "end" + } + ], + "nodes": [ + { + "data": { + "label": "File", + "name": "File" + }, + "id": "File", + "measured": { + "height": 49, + "width": 200 + }, + "position": { + "x": 50, + "y": 200 + }, + "sourcePosition": "left", + "targetPosition": "right", + "type": "beginNode" + }, + { + "data": { + "form": { + "outputs": { + "html": { + "type": "string", + "value": "" + }, + "json": { + "type": "Array", + "value": [] + }, + "markdown": { + "type": "string", + "value": "" + }, + "text": { + "type": "string", + "value": "" + } + }, + "setups": [ + { + "fileFormat": "pdf", + "output_format": "json", + "parse_method": "DeepDOC", + "preprocess": [ + "main_content" + ], + "vlm": {} + }, + { + "fileFormat": "spreadsheet", + "output_format": "html", + "parse_method": "DeepDOC", + "preprocess": [ + "main_content" + ], + "vlm": {} + }, + { + "fileFormat": "image", + "output_format": "text", + "parse_method": "ocr", + "preprocess": [ + "main_content" + ] + }, + { + "fields": [ + "from", + "to", + "cc", + "bcc", + "date", + "subject", + "body", + "attachments" + ], + "fileFormat": "email", + "output_format": "text", + "preprocess": [ + "main_content" + ] + }, + { + "fileFormat": "markdown", + "output_format": "json", + "preprocess": [ + "main_content" + ], + "vlm": {} + }, + { + "fileFormat": "text&code", + "output_format": "json", + "preprocess": [ + "main_content" + ] + }, + { + "fileFormat": "html", + "output_format": "json", + "preprocess": [ + "main_content" + ] + }, + { + "fileFormat": "doc", + "output_format": "json", + "preprocess": [ + "main_content" + ] + }, + { + "fileFormat": "docx", + "output_format": "json", + "preprocess": [ + "main_content" + ], + "vlm": {} + }, + { + "fileFormat": "slides", + "output_format": "json", + "parse_method": "DeepDOC", + "preprocess": [ + "main_content" + ] + } + ] + }, + "label": "Parser", + "name": "Parser_0" + }, + "dragging": false, + "id": "Parser:HipSignsRhyme", + "measured": { + "height": 57, + "width": 200 + }, + "position": { + "x": 316.99524094206413, + "y": 195.39629819663406 + }, + "selected": true, + "sourcePosition": "right", + "targetPosition": "left", + "type": "parserNode" + }, + { + "data": { + "form": { + "hierarchy": "2", + "include_heading_content": false, + "method": "hierarchy", + "outputs": { + "chunks": { + "type": "Array", + "value": [] + } + }, + "rules": [ + { + "levels": [ + { + "expression": "^#[^#]" + }, + { + "expression": "^##[^#]" + }, + { + "expression": "^###[^#]" + }, + { + "expression": "^####[^#]" + } + ] + }, + { + "levels": [ + { + "expression": "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+(\u5206?\u7f16|\u90e8\u5206)" + }, + { + "expression": "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u7ae0" + }, + { + "expression": "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u8282" + }, + { + "expression": "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u6761" + }, + { + "expression": "[\\(\uff08][\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e]+[\\)\uff09]" + } + ] + }, + { + "levels": [ + { + "expression": "\u7b2c[0-9]+\u7ae0" + }, + { + "expression": "\u7b2c[0-9]+\u8282" + }, + { + "expression": "[0-9]{1,2}[\\. \u3001]" + }, + { + "expression": "[0-9]{1,2}\\.[0-9]{1,2}($|[^a-zA-Z/%~.-])" + }, + { + "expression": "[0-9]{1,2}\\.[0-9]{1,2}\\.[0-9]{1,2}" + } + ] + }, + { + "levels": [ + { + "expression": "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u7ae0" + }, + { + "expression": "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u8282" + }, + { + "expression": "[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e]+[ \u3001]" + }, + { + "expression": "[\\(\uff08][\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e]+[\\)\uff09]" + }, + { + "expression": "[\\(\uff08][0-9]{,2}[\\)\uff09]" + } + ] + }, + { + "levels": [ + { + "expression": "PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)" + }, + { + "expression": "Chapter (I+V?|VI*|XI|IX|X)" + }, + { + "expression": "Section [0-9]+" + }, + { + "expression": "Article [0-9]+" + } + ] + } + ] + }, + "label": "TitleChunker", + "name": "Title Chunker_0" + }, + "id": "TitleChunker:SlimyPapayasAttend", + "measured": { + "height": 74, + "width": 200 + }, + "position": { + "x": 616.9952409420641, + "y": 195.39629819663406 + }, + "selected": false, + "sourcePosition": "right", + "targetPosition": "left", + "type": "chunkerNode" + }, + { + "data": { + "form": { + "fields": "text", + "filename_embd_weight": 0.1, + "outputs": {}, + "search_method": [ + "embedding", + "full_text" + ] + }, + "label": "Tokenizer", + "name": "Indexer_0" + }, + "id": "Tokenizer:RipeSchoolsFetch", + "measured": { + "height": 114, + "width": 200 + }, + "position": { + "x": 916.9952409420641, + "y": 195.39629819663406 + }, + "sourcePosition": "right", + "targetPosition": "left", + "type": "tokenizerNode" + } + ] + }, + "history": [], + "messages": [], + "path": [], + "retrieval": [], + "variables": [] + }, + "avatar": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAADAAAAAwCAYAAABXAvmHAAAACXBIWXMAABYlAAAWJQFJUiTwAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAABMaSURBVHgBbVprjF3Vdf72Puc+5s7Lnhm/x/ZgCHWAFLdVKEqogJY2ikIJEUSBhiZFIaVRKZj+KFHa1G7apBS1KlGpkh99EEUgykMiKlFpFCUIyS1pSTFCQLADHmNjjz3jec99nXP27rfW3ufOtZOxru/cc8/Zez2+9a3HHoMLfv7skjuuy/Li4wbmZn6c8t7rdWMMnHOw1kKuyWf56f9dfuSe8rO8y/fyjFyXn/7fy+/llSRJeNbyWW5pZQ2+jOzvcbiW1A67wv3FwelHp/vl7e18cOr3NrTQOcCb9/cLVQpcblIUxfrDZl0B2UffRSlVQp5z5ylXGqEnbFSg/935gsIn4bmoPC8i4T+qw//xMJ8URRZ7CqjwvvVDY5J9pcVE0J4l5Tbjoxg+COc8wtdiUW6a2AuEkZco7HrrqTAXeKp8uai0tWKsILzjq5Kk+rvhtcRENbw/nAPXixK6Yst1DvChfaUAsplYqTSzRxGFFeFsUEldbPW65b1BIYuAOLm5vGZU6XK9fkiWgpfXre69rlgqcKMXfbhB1ynkszH7qkgP6DpfnLptiiIc+xlXRryLRZwzUSgfrBE3UGGCsYKgIkwvNoJFXQ+O8nLhnd7xbh3Fcm+aprqn/B7ihEpH+NJMVM6qBxROGhv8nNrr6R97oMR4vwLBwtS4KGXyKoKJiyo+uai8yz9ZmDsHTIoBihA/Vr9yEXrBa/3wRLAvCnlAl0j5lqvysk4JOlnDYj3WqCJcbm5Oucw+9LmzDFoIbHxF9lPLCkxUEB/CIeDZB6bgPy6G7rJHa1UepZUq0BvTmkc3p7I0XrfNbQuiV7DM640hoFI3PYsHAnJBaBNgozKlia7psG7kQgnGfTylCL3ADewQAtCaCq3iVFi57n0ZxxSKgeW4YLaWYHm+wMQu4PIbEuy5po6pq6sY2lxFbUiYhJtXJOC76sW800a3VaC1lGH+WAevfx+YfsngnVcdOh1gYrMoI1okEQnB43mWhz0FlqqZwInXvZ9KSyT6qK08ldBcOYOpYgOG1SCBj8kKFcyeLDC2HfitL6b41Ts3YnCsyuclaMUTEqxpQIawTyYWr/P5nJasoz7kUB8BxnYWeN+vZ7ylzRubOPGywXNfK/Dysx5jVKQ2ZHpxkiZJdEjwgigpwSxBzyD+Hd+fUKzC2vQeEOsrZBKDueMeOy73+My/jGPylwaVZfIOnxNLwcYArfHBNCakAM3CdSKfFxSgUHgqMn0WmAti2Sbh1ETW9njij7p44Z8Mtl7MjQuD3NEDgkFCz0aCSAQRjBfzJ7tu8/38LIuLNW0P38TumlGP7P+Pcey+aphYLoKwIqCuWGbeCoUSxqgqjeqLMIKwmI15hEIrNjTKhXWohFKjC5jPm6gNLmPpdII//UAbI42UMPS9xBYBQcNW4POuGGZd+BC8gS4Fe0nF4MxJh6tur+Dvz+3C9l+s00KF4tBqLhAz2kiGRtlIc7FxCGGWR5bxwUO6vHjIqjVBkoCv8zKvoaoGSNJBenUbGmMpHjlXR23MIWtCvR1ShlcP5HmHy9DQJfbLNG5iGqd3cOJoji+9MIpbH96C9mre5yWjTlcLK3UmgszoCeFNocGcG7XVo/QhBchDHhBgm1TjAvSUsXUuMUAv1PhkVVJeVHgcnZUBHHi1hsYWmqMLJQUJYMn6gXByiYP1pJQkAcc2yXH6bY+HT2zC7g+O0OqZJI31hCWL9FQxIT0pZYQrmhlc4H7hdNqL75n+rt4RZeQVadkrxGp8Mg0wdMryvDSK9nJQIpfMzF0phvyvDAlFZn+1KfydOJx6exUPvTNAOqxrginpUwJTaBa+QAB/zF8avEmkuBjLNigCTUBiUXmGQUxKVY+YbogPUUrjqIjJi+ukkn8SVdRWGXOrA/jq0RSnT9gASGI/JUTSpBo8UP6kaQ0nj5/CAz8YwshWWi4vgj1NSaX0DuPTVEJG1LRvArVJUJbcDR/hHmsmX8a5QFUh5VRgR3ZSr/iuKufFMLoW73Oh9lRMpKM0bB33fzfFwntSQlQDCRRFTHpRifbaKq65NcHe62twmVjcoxfWQia1HN3/ZoJ7ja6uZqqQjym+lFxVtSYKG12EsqI1amWtVK1Rz3ifa5wInuV3Lx6PPtPiORQQvD6GvR9hsvwVq/obk4ZCslf703ULi0v43GMJceeDlWghNZaEaIMlwWND8Ic3onP8MnS+OwE7UESRgyAmVnaa9MI+uk6onxxClZwHWq1KVoSSRcgLwWCI0AtwsiGe4nveGcPvfstg4WQwqhBOWtJnc6GN2/+S6bnDJCG1h9zilkk0tUCRtEz97Xk8Pd3G8OvL+MgHh9Tlap+kiAFMjs9imS0QkDRQkYDshAAVtDDJoc79/nqNXmzAjo7x+Qz1O5cgpKV0K0nLB8MZG+GictYwfkkVu65sY+1cTPbhC2BluYXr/jBRo5sB1i4NWm6IbFOZ0x5AIr+5N8dH3RFcu4MiTS2jLIWz13IUR8gUL5HTc6+ZVZKUm8/R+ocM2ROb0f7HDXAzRulVY0NMVyO77RyDHaoBfQSgHtLE63WdkrYlfopsENfur6C1EAwfPZDgihtaqDQqrCgrWLyXbDTWgFmqo3LTGmrXnIVrT6DxKSaZD+/hQh0kO7h4h0ueoIBP8769m+iwFtIhMsQVThMP2ty8VdUaCBsp1/wqksk8JLm61yDUGHL1YEUtF6R65B0nqMQa8wLvSd/Hmollp2ed5fIBXPnJJTx2l0ddiEcy8dKcwyfvJKF1+fACrfpTJpbLKrDDQzDkYVTP8Muz3HAUlUlCB51o5Rqtx70Z3KgSTiMENStU1yT9DZP5zwSaLLot3joA985GZPMrsFe1lNkMS2tTSbRg1PpLvHaOBnl8mEmaMBwkfCfG0P7PWaQ3riK9lFDM2IvVath2aYHOWgzi1lqGi65OlGmKFaJqwCvmpW5xJ+vIn98KP78RrecXsPild9H+9jA6j04qBTYfYj0yxCTVIqZrxPvRceSPjCP7cYLOE23kSRN+cQEFGc4WTFJHqfFZ0uM4PdIlVbeoXB7YSj43H+Q64/x+iJ9ZelvmjfTGK+AObaMsbHZMoeX2jl9mvHZDjkGNTcXoFqkMCix9jiuPtAiZDvKVReQnV5AdGoQ7shHFu6TQt7jpm6S9I07rFzfIjZjn3fwZZEvzsFwLW1kWMJvaBkOSybForwlHI1+ch1tqo5ghxCq55kLDfZyGboLuczTGFqv5xxF+tVt4f5fB/cpRXk+QvUg4VmU/i0t+zbC8EQ8wG6YSTLSe9Jm+0QrU2WqH9q5FBWbm4Igiu0XqfRfoT8KnWEb3++eo5CotTAaiIvnyIpVaQcJEiPdzHSEFF5KTN02kt88i2dcOI5p2F4WYX5NhB8Xb9KRlXHQpzy8Q95u4z5UrVIJ7Dg+gOEuSyGSa0ozVrQmdh7R1RlxT5YXrFlH83wjSDXGKQLkGPk+B9tAb38tC+8fOyjpudsZg4sU65m+it5hDLC2bvr/A0KcyureKwU/XsPoAE15DWIV7NBh0O0KKwTDTVJUlA+PDJbEIIft4ZvmCcPT/W0XruYwC0rugTG+cRn6Ke5/LVTHJF1IBSAeNTsvp5MGteIwfaGHmY4Mx6TAWWBvVrpqhcTfpgwVd6+n2bG0O1VzqGEKCyhta1Fa5wYzIRwt1aZBkBe3DVHa8huomQmZMatY1LZ+zmSY6nO7U5hhJZ1mHfrTNzy0tVQRCtcmthNgy8qECFRF4fB5VNlF2AzSTv/tKjsqADhbYrGRJgIYMpyQ1DBKXnSxMJ5h4Cr/C9zmt31Uxuth3GaCLzBcp7006yn6av4tQWkhPI6ySWyojGUzY5mzIzKJ0MsHnzzaVJs0EKZuUbHc4rY/ABqbzkzkM3gts+HNS9LWDKN4cw+Bt/J6VsWTs4y8TahVSbMYFOq0EK7MOg6NpUEA5N9dJgF8KDbSrrdGSpL+V4ZBsBIPLRtO8tWSrLgMwJ94W6fRXKSyhVv8AP9cC3+fzC8wRG7D85RFSLK9/aE55X/KFXWshf9dj+L4aZm9hgtxtUB1vYPFuqVAp8KAQAjD72Rzj3yIdM1n+9MccJmyXYk/qbGaEk69IH+C0bExHwzApW1lCZdRg9cuXYvUru6kA+b8IjY2tMXxcNXQGjULHJfnCImG3htW/IX4fMRpi9YtZGHZYPmcM8MU5ZCcW0ZleQTrFpMkmqXN6Bu2ZU8jfY098kUXjrgbzRY5sgcw1wudG2wyNDtpvNVH/zdB9vPF8V9vMnHuS5XIMb/T40TMDSOqxq9oiKZva89UkPeYn1tB9nYsRw55VanduHt1zM+i8tSLRiIGbqsinGYz0QLG8yoXPoZhd1e5q8H4q80YHnRmuwXqrOMWAn2a9VR2Af69AdqqF7hnCrE64tXOMfmEQm57agOoV5HnXQpEwe+9rYvxpzpFuJ4ngFJ56IMGGTVYJJfVW23b86Nk67vrXeQqQYPR+BvJVzMZ7ibGBFNkiLXySw6h6Fdm79FLKYGMKrjJJOTYXw5/eSGWXsPJkUz1XmRrEwC0VrcEqmxrYfsig9b22xk5lNwP3QyPaju48MsEYypU65bNrGe17k51VjH+NuUTgIx2ddGhNBn97Ecff3IFT73SxeZsJY8t7dn/Cy3zl7HsGX32xicnLCRG2lORBrD7J4Fqp090G9RsYuCkZaI3ZsUFGkdGbVJorYT5qBya08gzkRmxnQrdhhqOznKrT6tUwi0pTZmUWKgFsE633VRiUg0YTSEEqPu3g5hW6tdEE909uRqPO/JE77Y1TmdNI1pzYZvHsQzX88dMdjlGUXTH0GVIgs7OXEqAttQvdNsDsm8vMk4tnTvldq8XOLJlEMtxGhBrRxQTldH6jVW457BWl8kJrYR9LZc0vvtRAUjQfKGaVIArOhqrDBZ55YAK+SdarlCP+MJlRHvekukPfSTB7rBYsJZVhu07XyVDT9pr28J1Z3zj0mvqbNi3ClcWMDqpCH1kJgwgbuiw5vNB3W1HYGCvUnMR2knRsyELFe3xmVntmR0Ml9N7xlzfh3/+WcbDBhi6tPLu4e/dv+3I2L8YsWN9849gSOsuxK4oNj5YVYSxRdvKhrkec8dv13tqqpcNMSEoOJ7WHvvrH7OGl4xcd+BZxkFaeIwSDWXqiubwV93CUObmHeMnXzxaw3tAEDJL2sTTfwTNfGUGN5XARRx9hcp2tN8cWsdU0ejpTCu/j8ZPTd6tDrzDxliZmVV9MHrxR3ltctxuE13lmGsUxITdYozmgm+/CfuaFyT0IBaI9/yTIlick2pny95Ex4Nt/1cUP/nmCg9jQcSHCpsQ14kymHK+XjZQJg1VVSC0kQy8bsjNC4RuGYOWYMHpOJ0s2Ch8hadjWrs7vwhcmOti2KxyIlOdz60dS8fzgvPEib9zGbuvv7l7DoSfHGPmpWlmtKzjtwcWFXjXOs1x5nGTCJFvrKHlOsRbGgWH86OKcqZxkmHh0EkoQySXV4XG8yqHBvbta2Dkl4/Vg5DDm9+cdT6nk/SePckPOVL2bDz702Sa++fvDqG8YZeYLxz/heChbh1KRx+MkH87ElP6CYOKR0u4hiONJjonHqAFwMW5yJIMkDTeJB1nrf/OODmFDGOcXHGn1/Zzngf6jVR2J88Ht2y0OkVbv2EoWeG0CtZEG+wYXBDExEJM45NVhfpxOqLfitDlu6uMYMdBuPIlh4EoZUx0awOyJSXzj1gYh08UCx/gbNzmNof7J4XnHX/Ga+fzOjx2jAlMXnpP1/yTc9PSMxIfFJ+6p4Dfu7HByF4a2RbfQDkkSlMjv4sFGSEZGPSOFn01lEhEn1aTOrF3F6TfqePmpKv7rcYf50w6bthht3vvZr1+W/jPr+PmwuXvqpocZTPdpIpEpQZ+25an6uubkjhWDRTb+W5mdJy9OcNm1wJ4rqdxmh6Ex1u41E6bIjqc8rJvaqyzaWJedOcoDErakM0cSHPsf/n7SabW7YQJ6jlYeHQHnH7/2jz4vPKLl50fNH0zeeB3d+sPeGXHfue3Pw14/A8hkrLPKcrwjR0m836WKnCSOyKUMlya8SkapsdlJGUfVmnRiYVyox6fW9M4lfCwngggmkFyk6PKwvPxrgbC/v0i/vWfq5od5NnBfUSYqH2ecfVr/vL9x6FkG0q8k8ShUYJ7G46UQJjaumejELfxuje0dbocgNnF+2r9vaPZ1vBgR0Qf1rz84/fh+BRu3O0gMH05NEvNgSfP+PDf2w6o/qGTzXIdUIQY4CeIJYhZO2TW7h3O03Mv1XGdCPl5DzOQuFnQlZNEL9aKnzPofkrjDdVQPanzKfy8t/qT94Y17/43pm1Mse3UYnOK8aD8/Ftb/aOPCv4HQQ27JpElpufL0PemdQ2sxF1mqnFgrw8bEmJh+9oFO8kIO0Cz99QHbvvPg9BPrf+zR/7N/6uap3JmD9N2VXHKfFnsuTJh7R1F97/0e6o+Tdag5hY4QRJpUNCsnNhyb2pgvUAQFbYn7+Hyyvs40vfYdDoyffXD6iRf69/p/CbMWUUVYM2EAAAAASUVORK5CYII=" +} diff --git a/agent/templates/ingestion_pipeline_Manual.json b/agent/templates/ingestion_pipeline_Manual.json new file mode 100644 index 0000000000..46bc02ba6d --- /dev/null +++ b/agent/templates/ingestion_pipeline_Manual.json @@ -0,0 +1,552 @@ +{ + "id": 35, + "title": { + "en": "Manual", + "de": "Handbuch", + "zh": "手册" + }, + "description": { + "en": "This template segments parsed files based on the structure of a manual. It is suitable for documents with clearly defined sections and operational guidance, such as product manuals, user guides, installation instructions, and technical documentation.", + "de": "Diese Vorlage segmentiert die geparste Datei anhand der Struktur eines Handbuchs. Sie eignet sich für Dokumente mit klar definierten Abschnitten und Handlungsanweisungen, wie Produkthandbücher, Benutzerhandbücher, Installationsanleitungen und technische Dokumentationen.", + "zh": "此模板将解析后的文件按手册结构进行切片,适用于具有清晰章节层级和操作说明的文档类型,如产品手册、用户指南、安装说明和技术文档。" + }, + "canvas_type": "Ingestion Pipeline", + "canvas_category": "dataflow_canvas", + "dsl": { + "components": { + "File": { + "downstream": [ + "Parser:HipSignsRhyme" + ], + "obj": { + "component_name": "File", + "params": {} + }, + "upstream": [] + }, + "Parser:HipSignsRhyme": { + "downstream": [ + "TitleChunker:WideSwansSmoke" + ], + "obj": { + "component_name": "Parser", + "params": { + "outputs": { + "html": { + "type": "string", + "value": "" + }, + "json": { + "type": "Array", + "value": [] + }, + "markdown": { + "type": "string", + "value": "" + }, + "text": { + "type": "string", + "value": "" + } + }, + "setups": { + "doc": { + "output_format": "json", + "preprocess": "main_content", + "suffix": [ + "doc" + ] + }, + "docx": { + "output_format": "json", + "preprocess": "main_content", + "suffix": [ + "docx" + ], + "vlm": {} + }, + "email": { + "fields": [ + "from", + "to", + "cc", + "bcc", + "date", + "subject", + "body", + "attachments" + ], + "output_format": "text", + "preprocess": "main_content", + "suffix": [ + "eml", + "msg" + ] + }, + "html": { + "output_format": "json", + "preprocess": "main_content", + "suffix": [ + "htm", + "html" + ] + }, + "image": { + "output_format": "text", + "parse_method": "ocr", + "preprocess": "main_content", + "suffix": [ + "jpg", + "jpeg", + "png", + "gif" + ], + "system_prompt": "" + }, + "markdown": { + "output_format": "json", + "preprocess": "main_content", + "suffix": [ + "md", + "markdown", + "mdx" + ], + "vlm": {} + }, + "pdf": { + "output_format": "json", + "parse_method": "DeepDOC", + "preprocess": "main_content", + "suffix": [ + "pdf" + ], + "vlm": {} + }, + "slides": { + "output_format": "json", + "parse_method": "DeepDOC", + "preprocess": "main_content", + "suffix": [ + "pptx", + "ppt" + ] + }, + "spreadsheet": { + "output_format": "html", + "parse_method": "DeepDOC", + "preprocess": "main_content", + "suffix": [ + "xls", + "xlsx", + "csv" + ], + "vlm": {} + }, + "text&code": { + "output_format": "json", + "preprocess": "main_content", + "suffix": [ + "txt", + "py", + "js", + "java", + "c", + "cpp", + "h", + "php", + "go", + "ts", + "sh", + "cs", + "kt", + "sql" + ] + } + } + } + }, + "upstream": [ + "File" + ] + }, + "TitleChunker:WideSwansSmoke": { + "downstream": [ + "Tokenizer:FloppyCrewsRelax" + ], + "obj": { + "component_name": "TitleChunker", + "params": { + "hierarchy": 0, + "include_heading_content": false, + "levels": [ + [ + "^#[^#]", + "^##[^#]", + "^###[^#]", + "^####[^#]" + ], + [ + "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+(\u5206?\u7f16|\u90e8\u5206)", + "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u7ae0", + "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u8282", + "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u6761", + "[\\(\uff08][\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e]+[\\)\uff09]" + ], + [ + "\u7b2c[0-9]+\u7ae0", + "\u7b2c[0-9]+\u8282", + "[0-9]{1,2}[\\. \u3001]", + "[0-9]{1,2}\\.[0-9]{1,2}($|[^a-zA-Z/%~.-])", + "[0-9]{1,2}\\.[0-9]{1,2}\\.[0-9]{1,2}" + ], + [ + "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u7ae0", + "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u8282", + "[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e]+[ \u3001]", + "[\\(\uff08][\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e]+[\\)\uff09]", + "[\\(\uff08][0-9]{,2}[\\)\uff09]" + ], + [ + "PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)", + "Chapter (I+V?|VI*|XI|IX|X)", + "Section [0-9]+", + "Article [0-9]+" + ] + ], + "method": "group" + } + }, + "upstream": [ + "Parser:HipSignsRhyme" + ] + }, + "Tokenizer:FloppyCrewsRelax": { + "downstream": [], + "obj": { + "component_name": "Tokenizer", + "params": { + "fields": "text", + "filename_embd_weight": 0.1, + "outputs": {}, + "search_method": [ + "embedding", + "full_text" + ] + } + }, + "upstream": [ + "TitleChunker:WideSwansSmoke" + ] + } + }, + "globals": { + "sys.history": [] + }, + "graph": { + "edges": [ + { + "id": "xy-edge__Filestart-Parser:HipSignsRhymeend", + "source": "File", + "sourceHandle": "start", + "target": "Parser:HipSignsRhyme", + "targetHandle": "end" + }, + { + "id": "xy-edge__Parser:HipSignsRhymestart-TitleChunker:WideSwansSmokeend", + "source": "Parser:HipSignsRhyme", + "sourceHandle": "start", + "target": "TitleChunker:WideSwansSmoke", + "targetHandle": "end" + }, + { + "id": "xy-edge__TitleChunker:WideSwansSmokestart-Tokenizer:FloppyCrewsRelaxend", + "source": "TitleChunker:WideSwansSmoke", + "sourceHandle": "start", + "target": "Tokenizer:FloppyCrewsRelax", + "targetHandle": "end" + } + ], + "nodes": [ + { + "data": { + "label": "File", + "name": "File" + }, + "id": "File", + "measured": { + "height": 49, + "width": 200 + }, + "position": { + "x": 50, + "y": 200 + }, + "sourcePosition": "left", + "targetPosition": "right", + "type": "beginNode" + }, + { + "data": { + "form": { + "outputs": { + "html": { + "type": "string", + "value": "" + }, + "json": { + "type": "Array", + "value": [] + }, + "markdown": { + "type": "string", + "value": "" + }, + "text": { + "type": "string", + "value": "" + } + }, + "setups": [ + { + "fileFormat": "pdf", + "output_format": "json", + "parse_method": "DeepDOC", + "preprocess": "main_content" + }, + { + "fileFormat": "spreadsheet", + "output_format": "html", + "parse_method": "DeepDOC", + "preprocess": "main_content" + }, + { + "fileFormat": "image", + "output_format": "text", + "parse_method": "ocr", + "preprocess": "main_content", + "system_prompt": "" + }, + { + "fields": [ + "from", + "to", + "cc", + "bcc", + "date", + "subject", + "body", + "attachments" + ], + "fileFormat": "email", + "output_format": "text", + "preprocess": "main_content" + }, + { + "fileFormat": "markdown", + "output_format": "json", + "preprocess": "main_content" + }, + { + "fileFormat": "text&code", + "output_format": "json", + "preprocess": "main_content" + }, + { + "fileFormat": "html", + "output_format": "json", + "preprocess": "main_content" + }, + { + "fileFormat": "doc", + "output_format": "json", + "preprocess": "main_content" + }, + { + "fileFormat": "docx", + "output_format": "json", + "preprocess": "main_content" + }, + { + "fileFormat": "slides", + "output_format": "json", + "parse_method": "DeepDOC", + "preprocess": "main_content" + } + ] + }, + "label": "Parser", + "name": "Parser_0" + }, + "dragging": false, + "id": "Parser:HipSignsRhyme", + "measured": { + "height": 57, + "width": 200 + }, + "position": { + "x": 316.99524094206413, + "y": 195.39629819663406 + }, + "selected": false, + "sourcePosition": "right", + "targetPosition": "left", + "type": "parserNode" + }, + { + "data": { + "form": { + "hierarchy": "0", + "include_heading_content": false, + "method": "group", + "outputs": { + "chunks": { + "type": "Array", + "value": [] + } + }, + "rules": [ + { + "levels": [ + { + "expression": "^#[^#]" + }, + { + "expression": "^##[^#]" + }, + { + "expression": "^###[^#]" + }, + { + "expression": "^####[^#]" + } + ] + }, + { + "levels": [ + { + "expression": "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+(\u5206?\u7f16|\u90e8\u5206)" + }, + { + "expression": "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u7ae0" + }, + { + "expression": "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u8282" + }, + { + "expression": "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u6761" + }, + { + "expression": "[\\(\uff08][\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e]+[\\)\uff09]" + } + ] + }, + { + "levels": [ + { + "expression": "\u7b2c[0-9]+\u7ae0" + }, + { + "expression": "\u7b2c[0-9]+\u8282" + }, + { + "expression": "[0-9]{1,2}[\\. \u3001]" + }, + { + "expression": "[0-9]{1,2}\\.[0-9]{1,2}($|[^a-zA-Z/%~.-])" + }, + { + "expression": "[0-9]{1,2}\\.[0-9]{1,2}\\.[0-9]{1,2}" + } + ] + }, + { + "levels": [ + { + "expression": "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u7ae0" + }, + { + "expression": "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u8282" + }, + { + "expression": "[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e]+[ \u3001]" + }, + { + "expression": "[\\(\uff08][\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e]+[\\)\uff09]" + }, + { + "expression": "[\\(\uff08][0-9]{,2}[\\)\uff09]" + } + ] + }, + { + "levels": [ + { + "expression": "PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)" + }, + { + "expression": "Chapter (I+V?|VI*|XI|IX|X)" + }, + { + "expression": "Section [0-9]+" + }, + { + "expression": "Article [0-9]+" + } + ] + } + ] + }, + "label": "TitleChunker", + "name": "Title Chunker_0" + }, + "id": "TitleChunker:WideSwansSmoke", + "measured": { + "height": 73, + "width": 200 + }, + "position": { + "x": 616.9952409420641, + "y": 195.39629819663406 + }, + "selected": false, + "sourcePosition": "right", + "targetPosition": "left", + "type": "chunkerNode" + }, + { + "data": { + "form": { + "fields": "text", + "filename_embd_weight": 0.1, + "outputs": {}, + "search_method": [ + "embedding", + "full_text" + ] + }, + "label": "Tokenizer", + "name": "Indexer_0" + }, + "id": "Tokenizer:FloppyCrewsRelax", + "measured": { + "height": 113, + "width": 200 + }, + "position": { + "x": 916.9952409420641, + "y": 195.39629819663406 + }, + "sourcePosition": "right", + "targetPosition": "left", + "type": "tokenizerNode" + } + ] + }, + "history": [], + "messages": [], + "path": [], + "retrieval": [], + "variables": [] + }, + "avatar": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAADAAAAAwCAYAAABXAvmHAAAACXBIWXMAABYlAAAWJQFJUiTwAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAABMaSURBVHgBbVprjF3Vdf72Puc+5s7Lnhm/x/ZgCHWAFLdVKEqogJY2ikIJEUSBhiZFIaVRKZj+KFHa1G7apBS1KlGpkh99EEUgykMiKlFpFCUIyS1pSTFCQLADHmNjjz3jec99nXP27rfW3ufOtZOxru/cc8/Zez2+9a3HHoMLfv7skjuuy/Li4wbmZn6c8t7rdWMMnHOw1kKuyWf56f9dfuSe8rO8y/fyjFyXn/7fy+/llSRJeNbyWW5pZQ2+jOzvcbiW1A67wv3FwelHp/vl7e18cOr3NrTQOcCb9/cLVQpcblIUxfrDZl0B2UffRSlVQp5z5ylXGqEnbFSg/935gsIn4bmoPC8i4T+qw//xMJ8URRZ7CqjwvvVDY5J9pcVE0J4l5Tbjoxg+COc8wtdiUW6a2AuEkZco7HrrqTAXeKp8uai0tWKsILzjq5Kk+rvhtcRENbw/nAPXixK6Yst1DvChfaUAsplYqTSzRxGFFeFsUEldbPW65b1BIYuAOLm5vGZU6XK9fkiWgpfXre69rlgqcKMXfbhB1ynkszH7qkgP6DpfnLptiiIc+xlXRryLRZwzUSgfrBE3UGGCsYKgIkwvNoJFXQ+O8nLhnd7xbh3Fcm+aprqn/B7ihEpH+NJMVM6qBxROGhv8nNrr6R97oMR4vwLBwtS4KGXyKoKJiyo+uai8yz9ZmDsHTIoBihA/Vr9yEXrBa/3wRLAvCnlAl0j5lqvysk4JOlnDYj3WqCJcbm5Oucw+9LmzDFoIbHxF9lPLCkxUEB/CIeDZB6bgPy6G7rJHa1UepZUq0BvTmkc3p7I0XrfNbQuiV7DM640hoFI3PYsHAnJBaBNgozKlia7psG7kQgnGfTylCL3ADewQAtCaCq3iVFi57n0ZxxSKgeW4YLaWYHm+wMQu4PIbEuy5po6pq6sY2lxFbUiYhJtXJOC76sW800a3VaC1lGH+WAevfx+YfsngnVcdOh1gYrMoI1okEQnB43mWhz0FlqqZwInXvZ9KSyT6qK08ldBcOYOpYgOG1SCBj8kKFcyeLDC2HfitL6b41Ts3YnCsyuclaMUTEqxpQIawTyYWr/P5nJasoz7kUB8BxnYWeN+vZ7ylzRubOPGywXNfK/Dysx5jVKQ2ZHpxkiZJdEjwgigpwSxBzyD+Hd+fUKzC2vQeEOsrZBKDueMeOy73+My/jGPylwaVZfIOnxNLwcYArfHBNCakAM3CdSKfFxSgUHgqMn0WmAti2Sbh1ETW9njij7p44Z8Mtl7MjQuD3NEDgkFCz0aCSAQRjBfzJ7tu8/38LIuLNW0P38TumlGP7P+Pcey+aphYLoKwIqCuWGbeCoUSxqgqjeqLMIKwmI15hEIrNjTKhXWohFKjC5jPm6gNLmPpdII//UAbI42UMPS9xBYBQcNW4POuGGZd+BC8gS4Fe0nF4MxJh6tur+Dvz+3C9l+s00KF4tBqLhAz2kiGRtlIc7FxCGGWR5bxwUO6vHjIqjVBkoCv8zKvoaoGSNJBenUbGmMpHjlXR23MIWtCvR1ShlcP5HmHy9DQJfbLNG5iGqd3cOJoji+9MIpbH96C9mre5yWjTlcLK3UmgszoCeFNocGcG7XVo/QhBchDHhBgm1TjAvSUsXUuMUAv1PhkVVJeVHgcnZUBHHi1hsYWmqMLJQUJYMn6gXByiYP1pJQkAcc2yXH6bY+HT2zC7g+O0OqZJI31hCWL9FQxIT0pZYQrmhlc4H7hdNqL75n+rt4RZeQVadkrxGp8Mg0wdMryvDSK9nJQIpfMzF0phvyvDAlFZn+1KfydOJx6exUPvTNAOqxrginpUwJTaBa+QAB/zF8avEmkuBjLNigCTUBiUXmGQUxKVY+YbogPUUrjqIjJi+ukkn8SVdRWGXOrA/jq0RSnT9gASGI/JUTSpBo8UP6kaQ0nj5/CAz8YwshWWi4vgj1NSaX0DuPTVEJG1LRvArVJUJbcDR/hHmsmX8a5QFUh5VRgR3ZSr/iuKufFMLoW73Oh9lRMpKM0bB33fzfFwntSQlQDCRRFTHpRifbaKq65NcHe62twmVjcoxfWQia1HN3/ZoJ7ja6uZqqQjym+lFxVtSYKG12EsqI1amWtVK1Rz3ifa5wInuV3Lx6PPtPiORQQvD6GvR9hsvwVq/obk4ZCslf703ULi0v43GMJceeDlWghNZaEaIMlwWND8Ic3onP8MnS+OwE7UESRgyAmVnaa9MI+uk6onxxClZwHWq1KVoSSRcgLwWCI0AtwsiGe4nveGcPvfstg4WQwqhBOWtJnc6GN2/+S6bnDJCG1h9zilkk0tUCRtEz97Xk8Pd3G8OvL+MgHh9Tlap+kiAFMjs9imS0QkDRQkYDshAAVtDDJoc79/nqNXmzAjo7x+Qz1O5cgpKV0K0nLB8MZG+GictYwfkkVu65sY+1cTPbhC2BluYXr/jBRo5sB1i4NWm6IbFOZ0x5AIr+5N8dH3RFcu4MiTS2jLIWz13IUR8gUL5HTc6+ZVZKUm8/R+ocM2ROb0f7HDXAzRulVY0NMVyO77RyDHaoBfQSgHtLE63WdkrYlfopsENfur6C1EAwfPZDgihtaqDQqrCgrWLyXbDTWgFmqo3LTGmrXnIVrT6DxKSaZD+/hQh0kO7h4h0ueoIBP8769m+iwFtIhMsQVThMP2ty8VdUaCBsp1/wqksk8JLm61yDUGHL1YEUtF6R65B0nqMQa8wLvSd/Hmollp2ed5fIBXPnJJTx2l0ddiEcy8dKcwyfvJKF1+fACrfpTJpbLKrDDQzDkYVTP8Muz3HAUlUlCB51o5Rqtx70Z3KgSTiMENStU1yT9DZP5zwSaLLot3joA985GZPMrsFe1lNkMS2tTSbRg1PpLvHaOBnl8mEmaMBwkfCfG0P7PWaQ3riK9lFDM2IvVath2aYHOWgzi1lqGi65OlGmKFaJqwCvmpW5xJ+vIn98KP78RrecXsPild9H+9jA6j04qBTYfYj0yxCTVIqZrxPvRceSPjCP7cYLOE23kSRN+cQEFGc4WTFJHqfFZ0uM4PdIlVbeoXB7YSj43H+Q64/x+iJ9ZelvmjfTGK+AObaMsbHZMoeX2jl9mvHZDjkGNTcXoFqkMCix9jiuPtAiZDvKVReQnV5AdGoQ7shHFu6TQt7jpm6S9I07rFzfIjZjn3fwZZEvzsFwLW1kWMJvaBkOSybForwlHI1+ch1tqo5ghxCq55kLDfZyGboLuczTGFqv5xxF+tVt4f5fB/cpRXk+QvUg4VmU/i0t+zbC8EQ8wG6YSTLSe9Jm+0QrU2WqH9q5FBWbm4Igiu0XqfRfoT8KnWEb3++eo5CotTAaiIvnyIpVaQcJEiPdzHSEFF5KTN02kt88i2dcOI5p2F4WYX5NhB8Xb9KRlXHQpzy8Q95u4z5UrVIJ7Dg+gOEuSyGSa0ozVrQmdh7R1RlxT5YXrFlH83wjSDXGKQLkGPk+B9tAb38tC+8fOyjpudsZg4sU65m+it5hDLC2bvr/A0KcyureKwU/XsPoAE15DWIV7NBh0O0KKwTDTVJUlA+PDJbEIIft4ZvmCcPT/W0XruYwC0rugTG+cRn6Ke5/LVTHJF1IBSAeNTsvp5MGteIwfaGHmY4Mx6TAWWBvVrpqhcTfpgwVd6+n2bG0O1VzqGEKCyhta1Fa5wYzIRwt1aZBkBe3DVHa8huomQmZMatY1LZ+zmSY6nO7U5hhJZ1mHfrTNzy0tVQRCtcmthNgy8qECFRF4fB5VNlF2AzSTv/tKjsqADhbYrGRJgIYMpyQ1DBKXnSxMJ5h4Cr/C9zmt31Uxuth3GaCLzBcp7006yn6av4tQWkhPI6ySWyojGUzY5mzIzKJ0MsHnzzaVJs0EKZuUbHc4rY/ABqbzkzkM3gts+HNS9LWDKN4cw+Bt/J6VsWTs4y8TahVSbMYFOq0EK7MOg6NpUEA5N9dJgF8KDbSrrdGSpL+V4ZBsBIPLRtO8tWSrLgMwJ94W6fRXKSyhVv8AP9cC3+fzC8wRG7D85RFSLK9/aE55X/KFXWshf9dj+L4aZm9hgtxtUB1vYPFuqVAp8KAQAjD72Rzj3yIdM1n+9MccJmyXYk/qbGaEk69IH+C0bExHwzApW1lCZdRg9cuXYvUru6kA+b8IjY2tMXxcNXQGjULHJfnCImG3htW/IX4fMRpi9YtZGHZYPmcM8MU5ZCcW0ZleQTrFpMkmqXN6Bu2ZU8jfY098kUXjrgbzRY5sgcw1wudG2wyNDtpvNVH/zdB9vPF8V9vMnHuS5XIMb/T40TMDSOqxq9oiKZva89UkPeYn1tB9nYsRw55VanduHt1zM+i8tSLRiIGbqsinGYz0QLG8yoXPoZhd1e5q8H4q80YHnRmuwXqrOMWAn2a9VR2Af69AdqqF7hnCrE64tXOMfmEQm57agOoV5HnXQpEwe+9rYvxpzpFuJ4ngFJ56IMGGTVYJJfVW23b86Nk67vrXeQqQYPR+BvJVzMZ7ibGBFNkiLXySw6h6Fdm79FLKYGMKrjJJOTYXw5/eSGWXsPJkUz1XmRrEwC0VrcEqmxrYfsig9b22xk5lNwP3QyPaju48MsEYypU65bNrGe17k51VjH+NuUTgIx2ddGhNBn97Ecff3IFT73SxeZsJY8t7dn/Cy3zl7HsGX32xicnLCRG2lORBrD7J4Fqp090G9RsYuCkZaI3ZsUFGkdGbVJorYT5qBya08gzkRmxnQrdhhqOznKrT6tUwi0pTZmUWKgFsE633VRiUg0YTSEEqPu3g5hW6tdEE909uRqPO/JE77Y1TmdNI1pzYZvHsQzX88dMdjlGUXTH0GVIgs7OXEqAttQvdNsDsm8vMk4tnTvldq8XOLJlEMtxGhBrRxQTldH6jVW457BWl8kJrYR9LZc0vvtRAUjQfKGaVIArOhqrDBZ55YAK+SdarlCP+MJlRHvekukPfSTB7rBYsJZVhu07XyVDT9pr28J1Z3zj0mvqbNi3ClcWMDqpCH1kJgwgbuiw5vNB3W1HYGCvUnMR2knRsyELFe3xmVntmR0Ml9N7xlzfh3/+WcbDBhi6tPLu4e/dv+3I2L8YsWN9849gSOsuxK4oNj5YVYSxRdvKhrkec8dv13tqqpcNMSEoOJ7WHvvrH7OGl4xcd+BZxkFaeIwSDWXqiubwV93CUObmHeMnXzxaw3tAEDJL2sTTfwTNfGUGN5XARRx9hcp2tN8cWsdU0ejpTCu/j8ZPTd6tDrzDxliZmVV9MHrxR3ltctxuE13lmGsUxITdYozmgm+/CfuaFyT0IBaI9/yTIlick2pny95Ex4Nt/1cUP/nmCg9jQcSHCpsQ14kymHK+XjZQJg1VVSC0kQy8bsjNC4RuGYOWYMHpOJ0s2Ch8hadjWrs7vwhcmOti2KxyIlOdz60dS8fzgvPEib9zGbuvv7l7DoSfHGPmpWlmtKzjtwcWFXjXOs1x5nGTCJFvrKHlOsRbGgWH86OKcqZxkmHh0EkoQySXV4XG8yqHBvbta2Dkl4/Vg5DDm9+cdT6nk/SePckPOVL2bDz702Sa++fvDqG8YZeYLxz/heChbh1KRx+MkH87ElP6CYOKR0u4hiONJjonHqAFwMW5yJIMkDTeJB1nrf/OODmFDGOcXHGn1/Zzngf6jVR2J88Ht2y0OkVbv2EoWeG0CtZEG+wYXBDExEJM45NVhfpxOqLfitDlu6uMYMdBuPIlh4EoZUx0awOyJSXzj1gYh08UCx/gbNzmNof7J4XnHX/Ga+fzOjx2jAlMXnpP1/yTc9PSMxIfFJ+6p4Dfu7HByF4a2RbfQDkkSlMjv4sFGSEZGPSOFn01lEhEn1aTOrF3F6TfqePmpKv7rcYf50w6bthht3vvZr1+W/jPr+PmwuXvqpocZTPdpIpEpQZ+25an6uubkjhWDRTb+W5mdJy9OcNm1wJ4rqdxmh6Ex1u41E6bIjqc8rJvaqyzaWJedOcoDErakM0cSHPsf/n7SabW7YQJ6jlYeHQHnH7/2jz4vPKLl50fNH0zeeB3d+sPeGXHfue3Pw14/A8hkrLPKcrwjR0m836WKnCSOyKUMlya8SkapsdlJGUfVmnRiYVyox6fW9M4lfCwngggmkFyk6PKwvPxrgbC/v0i/vWfq5od5NnBfUSYqH2ecfVr/vL9x6FkG0q8k8ShUYJ7G46UQJjaumejELfxuje0dbocgNnF+2r9vaPZ1vBgR0Qf1rz84/fh+BRu3O0gMH05NEvNgSfP+PDf2w6o/qGTzXIdUIQY4CeIJYhZO2TW7h3O03Mv1XGdCPl5DzOQuFnQlZNEL9aKnzPofkrjDdVQPanzKfy8t/qT94Y17/43pm1Mse3UYnOK8aD8/Ftb/aOPCv4HQQ27JpElpufL0PemdQ2sxF1mqnFgrw8bEmJh+9oFO8kIO0Cz99QHbvvPg9BPrf+zR/7N/6uap3JmD9N2VXHKfFnsuTJh7R1F97/0e6o+Tdag5hY4QRJpUNCsnNhyb2pgvUAQFbYn7+Hyyvs40vfYdDoyffXD6iRf69/p/CbMWUUVYM2EAAAAASUVORK5CYII=" +} diff --git a/agent/templates/ingestion_pipeline_One.json b/agent/templates/ingestion_pipeline_One.json new file mode 100644 index 0000000000..0e6a1599bc --- /dev/null +++ b/agent/templates/ingestion_pipeline_One.json @@ -0,0 +1,443 @@ +{ + "id": 36, + "title": { + "en": "One", + "de": "Ganzes Dokument", + "zh": "全文" + }, + "description": { + "en": "This template treats the entire parsed file as a single segment without further splitting. It is suitable for short documents or highly coherent texts where preserving full context is essential.", + "de": "Diese Vorlage behandelt die gesamte geparste Datei als ein einzelnes Segment, ohne sie weiter aufzuteilen. Sie eignet sich für kurze Dokumente, stark zusammenhängende Texte oder Fälle, in denen der vollständige Kontext erhalten bleiben soll.", + "zh": "此模板将解析后的文件作为一个完整片段处理,不再进一步切分,适用于篇幅较短、内容连贯性较强,或需要保留全文上下文的文档。" + }, + "canvas_type": "Ingestion Pipeline", + "canvas_category": "dataflow_canvas", + "dsl": { + "components": { + "File": { + "downstream": [ + "Parser:HipSignsRhyme" + ], + "obj": { + "component_name": "File", + "params": {} + }, + "upstream": [] + }, + "Parser:HipSignsRhyme": { + "downstream": [ + "TokenChunker:LongRadiosNotice" + ], + "obj": { + "component_name": "Parser", + "params": { + "outputs": { + "html": { + "type": "string", + "value": "" + }, + "json": { + "type": "Array", + "value": [] + }, + "markdown": { + "type": "string", + "value": "" + }, + "text": { + "type": "string", + "value": "" + } + }, + "setups": { + "doc": { + "output_format": "json", + "preprocess": "main_content", + "suffix": [ + "doc" + ] + }, + "docx": { + "output_format": "json", + "preprocess": "main_content", + "suffix": [ + "docx" + ], + "vlm": {} + }, + "email": { + "fields": [ + "from", + "to", + "cc", + "bcc", + "date", + "subject", + "body", + "attachments" + ], + "output_format": "text", + "preprocess": "main_content", + "suffix": [ + "eml", + "msg" + ] + }, + "html": { + "output_format": "json", + "preprocess": "main_content", + "suffix": [ + "htm", + "html" + ] + }, + "image": { + "output_format": "text", + "parse_method": "ocr", + "preprocess": "main_content", + "suffix": [ + "jpg", + "jpeg", + "png", + "gif" + ], + "system_prompt": "" + }, + "markdown": { + "output_format": "json", + "preprocess": "main_content", + "suffix": [ + "md", + "markdown", + "mdx" + ], + "vlm": {} + }, + "pdf": { + "output_format": "json", + "parse_method": "DeepDOC", + "preprocess": "main_content", + "suffix": [ + "pdf" + ], + "vlm": {} + }, + "slides": { + "output_format": "json", + "parse_method": "DeepDOC", + "preprocess": "main_content", + "suffix": [ + "pptx", + "ppt" + ] + }, + "spreadsheet": { + "output_format": "html", + "parse_method": "DeepDOC", + "preprocess": "main_content", + "suffix": [ + "xls", + "xlsx", + "csv" + ], + "vlm": {} + }, + "text&code": { + "output_format": "json", + "preprocess": "main_content", + "suffix": [ + "txt", + "py", + "js", + "java", + "c", + "cpp", + "h", + "php", + "go", + "ts", + "sh", + "cs", + "kt", + "sql" + ] + } + } + } + }, + "upstream": [ + "File" + ] + }, + "TokenChunker:LongRadiosNotice": { + "downstream": [ + "Tokenizer:SixtyChickenSell" + ], + "obj": { + "component_name": "TokenChunker", + "params": { + "children_delimiters": [], + "chunk_token_size": 512, + "delimiter_mode": "one", + "delimiters": [], + "image_context_size": 0, + "outputs": { + "chunks": { + "type": "Array", + "value": [] + } + }, + "overlapped_percent": 0, + "table_context_size": 0 + } + }, + "upstream": [ + "Parser:HipSignsRhyme" + ] + }, + "Tokenizer:SixtyChickenSell": { + "downstream": [], + "obj": { + "component_name": "Tokenizer", + "params": { + "fields": "text", + "filename_embd_weight": 0.1, + "outputs": {}, + "search_method": [ + "embedding", + "full_text" + ] + } + }, + "upstream": [ + "TokenChunker:LongRadiosNotice" + ] + } + }, + "globals": { + "sys.history": [] + }, + "graph": { + "edges": [ + { + "id": "xy-edge__Filestart-Parser:HipSignsRhymeend", + "source": "File", + "sourceHandle": "start", + "target": "Parser:HipSignsRhyme", + "targetHandle": "end" + }, + { + "id": "xy-edge__Parser:HipSignsRhymestart-TokenChunker:LongRadiosNoticeend", + "source": "Parser:HipSignsRhyme", + "sourceHandle": "start", + "target": "TokenChunker:LongRadiosNotice", + "targetHandle": "end" + }, + { + "id": "xy-edge__TokenChunker:LongRadiosNoticestart-Tokenizer:SixtyChickenSellend", + "source": "TokenChunker:LongRadiosNotice", + "sourceHandle": "start", + "target": "Tokenizer:SixtyChickenSell", + "targetHandle": "end" + } + ], + "nodes": [ + { + "data": { + "label": "File", + "name": "File" + }, + "id": "File", + "measured": { + "height": 49, + "width": 200 + }, + "position": { + "x": 50, + "y": 200 + }, + "sourcePosition": "left", + "targetPosition": "right", + "type": "beginNode" + }, + { + "data": { + "form": { + "outputs": { + "html": { + "type": "string", + "value": "" + }, + "json": { + "type": "Array", + "value": [] + }, + "markdown": { + "type": "string", + "value": "" + }, + "text": { + "type": "string", + "value": "" + } + }, + "setups": [ + { + "fileFormat": "pdf", + "output_format": "json", + "parse_method": "DeepDOC", + "preprocess": "main_content" + }, + { + "fileFormat": "spreadsheet", + "output_format": "html", + "parse_method": "DeepDOC", + "preprocess": "main_content" + }, + { + "fileFormat": "image", + "output_format": "text", + "parse_method": "ocr", + "preprocess": "main_content", + "system_prompt": "" + }, + { + "fields": [ + "from", + "to", + "cc", + "bcc", + "date", + "subject", + "body", + "attachments" + ], + "fileFormat": "email", + "output_format": "text", + "preprocess": "main_content" + }, + { + "fileFormat": "markdown", + "output_format": "json", + "preprocess": "main_content" + }, + { + "fileFormat": "text&code", + "output_format": "json", + "preprocess": "main_content" + }, + { + "fileFormat": "html", + "output_format": "json", + "preprocess": "main_content" + }, + { + "fileFormat": "doc", + "output_format": "json", + "preprocess": "main_content" + }, + { + "fileFormat": "docx", + "output_format": "json", + "preprocess": "main_content" + }, + { + "fileFormat": "slides", + "output_format": "json", + "parse_method": "DeepDOC", + "preprocess": "main_content" + } + ] + }, + "label": "Parser", + "name": "Parser_0" + }, + "dragging": false, + "id": "Parser:HipSignsRhyme", + "measured": { + "height": 197, + "width": 200 + }, + "position": { + "x": 316.99524094206413, + "y": 195.39629819663406 + }, + "selected": false, + "sourcePosition": "right", + "targetPosition": "left", + "type": "parserNode" + }, + { + "data": { + "form": { + "children_delimiters": [], + "chunk_token_size": 512, + "delimiter_mode": "one", + "delimiters": [ + { + "value": "\n" + } + ], + "image_table_context_window": 0, + "outputs": { + "chunks": { + "type": "Array", + "value": [] + } + }, + "overlapped_percent": 0 + }, + "label": "TokenChunker", + "name": "Token Chunker_0" + }, + "id": "TokenChunker:LongRadiosNotice", + "measured": { + "height": 73, + "width": 200 + }, + "position": { + "x": 616.9952409420641, + "y": 195.39629819663406 + }, + "selected": false, + "sourcePosition": "right", + "targetPosition": "left", + "type": "chunkerNode" + }, + { + "data": { + "form": { + "fields": "text", + "filename_embd_weight": 0.1, + "outputs": {}, + "search_method": [ + "embedding", + "full_text" + ] + }, + "label": "Tokenizer", + "name": "Indexer_0" + }, + "id": "Tokenizer:SixtyChickenSell", + "measured": { + "height": 113, + "width": 200 + }, + "position": { + "x": 916.9952409420641, + "y": 195.39629819663406 + }, + "sourcePosition": "right", + "targetPosition": "left", + "type": "tokenizerNode" + } + ] + }, + "history": [], + "messages": [], + "path": [], + "retrieval": [], + "variables": [] + }, + "avatar": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAADAAAAAwCAYAAABXAvmHAAAACXBIWXMAABYlAAAWJQFJUiTwAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAABMaSURBVHgBbVprjF3Vdf72Puc+5s7Lnhm/x/ZgCHWAFLdVKEqogJY2ikIJEUSBhiZFIaVRKZj+KFHa1G7apBS1KlGpkh99EEUgykMiKlFpFCUIyS1pSTFCQLADHmNjjz3jec99nXP27rfW3ufOtZOxru/cc8/Zez2+9a3HHoMLfv7skjuuy/Li4wbmZn6c8t7rdWMMnHOw1kKuyWf56f9dfuSe8rO8y/fyjFyXn/7fy+/llSRJeNbyWW5pZQ2+jOzvcbiW1A67wv3FwelHp/vl7e18cOr3NrTQOcCb9/cLVQpcblIUxfrDZl0B2UffRSlVQp5z5ylXGqEnbFSg/935gsIn4bmoPC8i4T+qw//xMJ8URRZ7CqjwvvVDY5J9pcVE0J4l5Tbjoxg+COc8wtdiUW6a2AuEkZco7HrrqTAXeKp8uai0tWKsILzjq5Kk+rvhtcRENbw/nAPXixK6Yst1DvChfaUAsplYqTSzRxGFFeFsUEldbPW65b1BIYuAOLm5vGZU6XK9fkiWgpfXre69rlgqcKMXfbhB1ynkszH7qkgP6DpfnLptiiIc+xlXRryLRZwzUSgfrBE3UGGCsYKgIkwvNoJFXQ+O8nLhnd7xbh3Fcm+aprqn/B7ihEpH+NJMVM6qBxROGhv8nNrr6R97oMR4vwLBwtS4KGXyKoKJiyo+uai8yz9ZmDsHTIoBihA/Vr9yEXrBa/3wRLAvCnlAl0j5lqvysk4JOlnDYj3WqCJcbm5Oucw+9LmzDFoIbHxF9lPLCkxUEB/CIeDZB6bgPy6G7rJHa1UepZUq0BvTmkc3p7I0XrfNbQuiV7DM640hoFI3PYsHAnJBaBNgozKlia7psG7kQgnGfTylCL3ADewQAtCaCq3iVFi57n0ZxxSKgeW4YLaWYHm+wMQu4PIbEuy5po6pq6sY2lxFbUiYhJtXJOC76sW800a3VaC1lGH+WAevfx+YfsngnVcdOh1gYrMoI1okEQnB43mWhz0FlqqZwInXvZ9KSyT6qK08ldBcOYOpYgOG1SCBj8kKFcyeLDC2HfitL6b41Ts3YnCsyuclaMUTEqxpQIawTyYWr/P5nJasoz7kUB8BxnYWeN+vZ7ylzRubOPGywXNfK/Dysx5jVKQ2ZHpxkiZJdEjwgigpwSxBzyD+Hd+fUKzC2vQeEOsrZBKDueMeOy73+My/jGPylwaVZfIOnxNLwcYArfHBNCakAM3CdSKfFxSgUHgqMn0WmAti2Sbh1ETW9njij7p44Z8Mtl7MjQuD3NEDgkFCz0aCSAQRjBfzJ7tu8/38LIuLNW0P38TumlGP7P+Pcey+aphYLoKwIqCuWGbeCoUSxqgqjeqLMIKwmI15hEIrNjTKhXWohFKjC5jPm6gNLmPpdII//UAbI42UMPS9xBYBQcNW4POuGGZd+BC8gS4Fe0nF4MxJh6tur+Dvz+3C9l+s00KF4tBqLhAz2kiGRtlIc7FxCGGWR5bxwUO6vHjIqjVBkoCv8zKvoaoGSNJBenUbGmMpHjlXR23MIWtCvR1ShlcP5HmHy9DQJfbLNG5iGqd3cOJoji+9MIpbH96C9mre5yWjTlcLK3UmgszoCeFNocGcG7XVo/QhBchDHhBgm1TjAvSUsXUuMUAv1PhkVVJeVHgcnZUBHHi1hsYWmqMLJQUJYMn6gXByiYP1pJQkAcc2yXH6bY+HT2zC7g+O0OqZJI31hCWL9FQxIT0pZYQrmhlc4H7hdNqL75n+rt4RZeQVadkrxGp8Mg0wdMryvDSK9nJQIpfMzF0phvyvDAlFZn+1KfydOJx6exUPvTNAOqxrginpUwJTaBa+QAB/zF8avEmkuBjLNigCTUBiUXmGQUxKVY+YbogPUUrjqIjJi+ukkn8SVdRWGXOrA/jq0RSnT9gASGI/JUTSpBo8UP6kaQ0nj5/CAz8YwshWWi4vgj1NSaX0DuPTVEJG1LRvArVJUJbcDR/hHmsmX8a5QFUh5VRgR3ZSr/iuKufFMLoW73Oh9lRMpKM0bB33fzfFwntSQlQDCRRFTHpRifbaKq65NcHe62twmVjcoxfWQia1HN3/ZoJ7ja6uZqqQjym+lFxVtSYKG12EsqI1amWtVK1Rz3ifa5wInuV3Lx6PPtPiORQQvD6GvR9hsvwVq/obk4ZCslf703ULi0v43GMJceeDlWghNZaEaIMlwWND8Ic3onP8MnS+OwE7UESRgyAmVnaa9MI+uk6onxxClZwHWq1KVoSSRcgLwWCI0AtwsiGe4nveGcPvfstg4WQwqhBOWtJnc6GN2/+S6bnDJCG1h9zilkk0tUCRtEz97Xk8Pd3G8OvL+MgHh9Tlap+kiAFMjs9imS0QkDRQkYDshAAVtDDJoc79/nqNXmzAjo7x+Qz1O5cgpKV0K0nLB8MZG+GictYwfkkVu65sY+1cTPbhC2BluYXr/jBRo5sB1i4NWm6IbFOZ0x5AIr+5N8dH3RFcu4MiTS2jLIWz13IUR8gUL5HTc6+ZVZKUm8/R+ocM2ROb0f7HDXAzRulVY0NMVyO77RyDHaoBfQSgHtLE63WdkrYlfopsENfur6C1EAwfPZDgihtaqDQqrCgrWLyXbDTWgFmqo3LTGmrXnIVrT6DxKSaZD+/hQh0kO7h4h0ueoIBP8769m+iwFtIhMsQVThMP2ty8VdUaCBsp1/wqksk8JLm61yDUGHL1YEUtF6R65B0nqMQa8wLvSd/Hmollp2ed5fIBXPnJJTx2l0ddiEcy8dKcwyfvJKF1+fACrfpTJpbLKrDDQzDkYVTP8Muz3HAUlUlCB51o5Rqtx70Z3KgSTiMENStU1yT9DZP5zwSaLLot3joA985GZPMrsFe1lNkMS2tTSbRg1PpLvHaOBnl8mEmaMBwkfCfG0P7PWaQ3riK9lFDM2IvVath2aYHOWgzi1lqGi65OlGmKFaJqwCvmpW5xJ+vIn98KP78RrecXsPild9H+9jA6j04qBTYfYj0yxCTVIqZrxPvRceSPjCP7cYLOE23kSRN+cQEFGc4WTFJHqfFZ0uM4PdIlVbeoXB7YSj43H+Q64/x+iJ9ZelvmjfTGK+AObaMsbHZMoeX2jl9mvHZDjkGNTcXoFqkMCix9jiuPtAiZDvKVReQnV5AdGoQ7shHFu6TQt7jpm6S9I07rFzfIjZjn3fwZZEvzsFwLW1kWMJvaBkOSybForwlHI1+ch1tqo5ghxCq55kLDfZyGboLuczTGFqv5xxF+tVt4f5fB/cpRXk+QvUg4VmU/i0t+zbC8EQ8wG6YSTLSe9Jm+0QrU2WqH9q5FBWbm4Igiu0XqfRfoT8KnWEb3++eo5CotTAaiIvnyIpVaQcJEiPdzHSEFF5KTN02kt88i2dcOI5p2F4WYX5NhB8Xb9KRlXHQpzy8Q95u4z5UrVIJ7Dg+gOEuSyGSa0ozVrQmdh7R1RlxT5YXrFlH83wjSDXGKQLkGPk+B9tAb38tC+8fOyjpudsZg4sU65m+it5hDLC2bvr/A0KcyureKwU/XsPoAE15DWIV7NBh0O0KKwTDTVJUlA+PDJbEIIft4ZvmCcPT/W0XruYwC0rugTG+cRn6Ke5/LVTHJF1IBSAeNTsvp5MGteIwfaGHmY4Mx6TAWWBvVrpqhcTfpgwVd6+n2bG0O1VzqGEKCyhta1Fa5wYzIRwt1aZBkBe3DVHa8huomQmZMatY1LZ+zmSY6nO7U5hhJZ1mHfrTNzy0tVQRCtcmthNgy8qECFRF4fB5VNlF2AzSTv/tKjsqADhbYrGRJgIYMpyQ1DBKXnSxMJ5h4Cr/C9zmt31Uxuth3GaCLzBcp7006yn6av4tQWkhPI6ySWyojGUzY5mzIzKJ0MsHnzzaVJs0EKZuUbHc4rY/ABqbzkzkM3gts+HNS9LWDKN4cw+Bt/J6VsWTs4y8TahVSbMYFOq0EK7MOg6NpUEA5N9dJgF8KDbSrrdGSpL+V4ZBsBIPLRtO8tWSrLgMwJ94W6fRXKSyhVv8AP9cC3+fzC8wRG7D85RFSLK9/aE55X/KFXWshf9dj+L4aZm9hgtxtUB1vYPFuqVAp8KAQAjD72Rzj3yIdM1n+9MccJmyXYk/qbGaEk69IH+C0bExHwzApW1lCZdRg9cuXYvUru6kA+b8IjY2tMXxcNXQGjULHJfnCImG3htW/IX4fMRpi9YtZGHZYPmcM8MU5ZCcW0ZleQTrFpMkmqXN6Bu2ZU8jfY098kUXjrgbzRY5sgcw1wudG2wyNDtpvNVH/zdB9vPF8V9vMnHuS5XIMb/T40TMDSOqxq9oiKZva89UkPeYn1tB9nYsRw55VanduHt1zM+i8tSLRiIGbqsinGYz0QLG8yoXPoZhd1e5q8H4q80YHnRmuwXqrOMWAn2a9VR2Af69AdqqF7hnCrE64tXOMfmEQm57agOoV5HnXQpEwe+9rYvxpzpFuJ4ngFJ56IMGGTVYJJfVW23b86Nk67vrXeQqQYPR+BvJVzMZ7ibGBFNkiLXySw6h6Fdm79FLKYGMKrjJJOTYXw5/eSGWXsPJkUz1XmRrEwC0VrcEqmxrYfsig9b22xk5lNwP3QyPaju48MsEYypU65bNrGe17k51VjH+NuUTgIx2ddGhNBn97Ecff3IFT73SxeZsJY8t7dn/Cy3zl7HsGX32xicnLCRG2lORBrD7J4Fqp090G9RsYuCkZaI3ZsUFGkdGbVJorYT5qBya08gzkRmxnQrdhhqOznKrT6tUwi0pTZmUWKgFsE633VRiUg0YTSEEqPu3g5hW6tdEE909uRqPO/JE77Y1TmdNI1pzYZvHsQzX88dMdjlGUXTH0GVIgs7OXEqAttQvdNsDsm8vMk4tnTvldq8XOLJlEMtxGhBrRxQTldH6jVW457BWl8kJrYR9LZc0vvtRAUjQfKGaVIArOhqrDBZ55YAK+SdarlCP+MJlRHvekukPfSTB7rBYsJZVhu07XyVDT9pr28J1Z3zj0mvqbNi3ClcWMDqpCH1kJgwgbuiw5vNB3W1HYGCvUnMR2knRsyELFe3xmVntmR0Ml9N7xlzfh3/+WcbDBhi6tPLu4e/dv+3I2L8YsWN9849gSOsuxK4oNj5YVYSxRdvKhrkec8dv13tqqpcNMSEoOJ7WHvvrH7OGl4xcd+BZxkFaeIwSDWXqiubwV93CUObmHeMnXzxaw3tAEDJL2sTTfwTNfGUGN5XARRx9hcp2tN8cWsdU0ejpTCu/j8ZPTd6tDrzDxliZmVV9MHrxR3ltctxuE13lmGsUxITdYozmgm+/CfuaFyT0IBaI9/yTIlick2pny95Ex4Nt/1cUP/nmCg9jQcSHCpsQ14kymHK+XjZQJg1VVSC0kQy8bsjNC4RuGYOWYMHpOJ0s2Ch8hadjWrs7vwhcmOti2KxyIlOdz60dS8fzgvPEib9zGbuvv7l7DoSfHGPmpWlmtKzjtwcWFXjXOs1x5nGTCJFvrKHlOsRbGgWH86OKcqZxkmHh0EkoQySXV4XG8yqHBvbta2Dkl4/Vg5DDm9+cdT6nk/SePckPOVL2bDz702Sa++fvDqG8YZeYLxz/heChbh1KRx+MkH87ElP6CYOKR0u4hiONJjonHqAFwMW5yJIMkDTeJB1nrf/OODmFDGOcXHGn1/Zzngf6jVR2J88Ht2y0OkVbv2EoWeG0CtZEG+wYXBDExEJM45NVhfpxOqLfitDlu6uMYMdBuPIlh4EoZUx0awOyJSXzj1gYh08UCx/gbNzmNof7J4XnHX/Ga+fzOjx2jAlMXnpP1/yTc9PSMxIfFJ+6p4Dfu7HByF4a2RbfQDkkSlMjv4sFGSEZGPSOFn01lEhEn1aTOrF3F6TfqePmpKv7rcYf50w6bthht3vvZr1+W/jPr+PmwuXvqpocZTPdpIpEpQZ+25an6uubkjhWDRTb+W5mdJy9OcNm1wJ4rqdxmh6Ex1u41E6bIjqc8rJvaqyzaWJedOcoDErakM0cSHPsf/n7SabW7YQJ6jlYeHQHnH7/2jz4vPKLl50fNH0zeeB3d+sPeGXHfue3Pw14/A8hkrLPKcrwjR0m836WKnCSOyKUMlya8SkapsdlJGUfVmnRiYVyox6fW9M4lfCwngggmkFyk6PKwvPxrgbC/v0i/vWfq5od5NnBfUSYqH2ecfVr/vL9x6FkG0q8k8ShUYJ7G46UQJjaumejELfxuje0dbocgNnF+2r9vaPZ1vBgR0Qf1rz84/fh+BRu3O0gMH05NEvNgSfP+PDf2w6o/qGTzXIdUIQY4CeIJYhZO2TW7h3O03Mv1XGdCPl5DzOQuFnQlZNEL9aKnzPofkrjDdVQPanzKfy8t/qT94Y17/43pm1Mse3UYnOK8aD8/Ftb/aOPCv4HQQ27JpElpufL0PemdQ2sxF1mqnFgrw8bEmJh+9oFO8kIO0Cz99QHbvvPg9BPrf+zR/7N/6uap3JmD9N2VXHKfFnsuTJh7R1F97/0e6o+Tdag5hY4QRJpUNCsnNhyb2pgvUAQFbYn7+Hyyvs40vfYdDoyffXD6iRf69/p/CbMWUUVYM2EAAAAASUVORK5CYII=" +} diff --git a/agent/templates/ingestion_pipeline_Paper.json b/agent/templates/ingestion_pipeline_Paper.json new file mode 100644 index 0000000000..31c78067af --- /dev/null +++ b/agent/templates/ingestion_pipeline_Paper.json @@ -0,0 +1,555 @@ +{ + "id": 32, + "title": { + "en": "Paper", + "de": "Wissenschaftliche Arbeit", + "zh": "论文" + }, + "description": { + "en": "This template segments parsed files based on the structure of a paper. It is suitable for documents with clearly defined sections, such as academic papers, research articles, conference papers, and technical studies.", + "de": "Diese Vorlage segmentiert die geparste Datei anhand der Struktur einer wissenschaftlichen Arbeit. Sie eignet sich für Dokumente mit klar definierten Abschnitten, wie wissenschaftliche Aufsätze, Forschungsartikel, Konferenzbeiträge und technische Studien.", + "zh": "此模板将解析后的文件按论文结构进行切片,适用于具有清晰章节和学术结构的文档类型,如学术论文、研究文章、会议论文和技术研究报告。" + }, + "canvas_type": "Ingestion Pipeline", + "canvas_category": "dataflow_canvas", + "dsl": { + "components": { + "File": { + "downstream": [ + "Parser:HipSignsRhyme" + ], + "obj": { + "component_name": "File", + "params": {} + }, + "upstream": [] + }, + "Parser:HipSignsRhyme": { + "downstream": [ + "TitleChunker:BeigeSheepTravel" + ], + "obj": { + "component_name": "Parser", + "params": { + "outputs": { + "html": { + "type": "string", + "value": "" + }, + "json": { + "type": "Array", + "value": [] + }, + "markdown": { + "type": "string", + "value": "" + }, + "text": { + "type": "string", + "value": "" + } + }, + "setups": { + "doc": { + "output_format": "json", + "preprocess": "main_content", + "suffix": [ + "doc" + ] + }, + "docx": { + "output_format": "json", + "preprocess": "main_content", + "suffix": [ + "docx" + ], + "vlm": {} + }, + "email": { + "fields": [ + "from", + "to", + "cc", + "bcc", + "date", + "subject", + "body", + "attachments" + ], + "output_format": "text", + "preprocess": "main_content", + "suffix": [ + "eml", + "msg" + ] + }, + "html": { + "output_format": "json", + "preprocess": "main_content", + "suffix": [ + "htm", + "html" + ] + }, + "image": { + "output_format": "text", + "parse_method": "ocr", + "preprocess": "main_content", + "suffix": [ + "jpg", + "jpeg", + "png", + "gif" + ], + "system_prompt": "" + }, + "markdown": { + "output_format": "json", + "preprocess": "main_content", + "suffix": [ + "md", + "markdown", + "mdx" + ], + "vlm": {} + }, + "pdf": { + "output_format": "json", + "parse_method": "DeepDOC", + "preprocess": "main_content", + "suffix": [ + "pdf" + ], + "vlm": {} + }, + "slides": { + "output_format": "json", + "parse_method": "DeepDOC", + "preprocess": "main_content", + "suffix": [ + "pptx", + "ppt" + ] + }, + "spreadsheet": { + "output_format": "html", + "parse_method": "DeepDOC", + "preprocess": "main_content", + "suffix": [ + "xls", + "xlsx", + "csv" + ], + "vlm": {} + }, + "text&code": { + "output_format": "json", + "preprocess": "main_content", + "suffix": [ + "txt", + "py", + "js", + "java", + "c", + "cpp", + "h", + "php", + "go", + "ts", + "sh", + "cs", + "kt", + "sql" + ] + } + } + } + }, + "upstream": [ + "File" + ] + }, + "TitleChunker:BeigeSheepTravel": { + "downstream": [ + "Tokenizer:EveryBushesStop" + ], + "obj": { + "component_name": "TitleChunker", + "params": { + "hierarchy": 0, + "include_heading_content": false, + "levels": [ + [ + "^#[^#]", + "^##[^#]", + "^###[^#]", + "^####[^#]" + ], + [ + "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+(\u5206?\u7f16|\u90e8\u5206)", + "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u7ae0", + "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u8282", + "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u6761", + "[\\(\uff08][\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e]+[\\)\uff09]" + ], + [ + "\u7b2c[0-9]+\u7ae0", + "\u7b2c[0-9]+\u8282", + "[0-9]{1,2}[\\. \u3001]", + "[0-9]{1,2}\\.[0-9]{1,2}($|[^a-zA-Z/%~.-])", + "[0-9]{1,2}\\.[0-9]{1,2}\\.[0-9]{1,2}" + ], + [ + "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u7ae0", + "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u8282", + "[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e]+[ \u3001]", + "[\\(\uff08][\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e]+[\\)\uff09]", + "[\\(\uff08][0-9]{,2}[\\)\uff09]" + ], + [ + "PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)", + "Chapter (I+V?|VI*|XI|IX|X)", + "Section [0-9]+", + "Article [0-9]+" + ] + ], + "method": "group" + } + }, + "upstream": [ + "Parser:HipSignsRhyme" + ] + }, + "Tokenizer:EveryBushesStop": { + "downstream": [], + "obj": { + "component_name": "Tokenizer", + "params": { + "fields": "text", + "filename_embd_weight": 0.1, + "outputs": {}, + "search_method": [ + "embedding", + "full_text" + ] + } + }, + "upstream": [ + "TitleChunker:BeigeSheepTravel" + ] + } + }, + "globals": { + "sys.history": [] + }, + "graph": { + "edges": [ + { + "id": "xy-edge__Filestart-Parser:HipSignsRhymeend", + "source": "File", + "sourceHandle": "start", + "target": "Parser:HipSignsRhyme", + "targetHandle": "end" + }, + { + "data": { + "isHovered": false + }, + "id": "xy-edge__Parser:HipSignsRhymestart-TitleChunker:BeigeSheepTravelend", + "source": "Parser:HipSignsRhyme", + "sourceHandle": "start", + "target": "TitleChunker:BeigeSheepTravel", + "targetHandle": "end" + }, + { + "id": "xy-edge__TitleChunker:BeigeSheepTravelstart-Tokenizer:EveryBushesStopend", + "source": "TitleChunker:BeigeSheepTravel", + "sourceHandle": "start", + "target": "Tokenizer:EveryBushesStop", + "targetHandle": "end" + } + ], + "nodes": [ + { + "data": { + "label": "File", + "name": "File" + }, + "id": "File", + "measured": { + "height": 49, + "width": 200 + }, + "position": { + "x": 50, + "y": 200 + }, + "sourcePosition": "left", + "targetPosition": "right", + "type": "beginNode" + }, + { + "data": { + "form": { + "outputs": { + "html": { + "type": "string", + "value": "" + }, + "json": { + "type": "Array", + "value": [] + }, + "markdown": { + "type": "string", + "value": "" + }, + "text": { + "type": "string", + "value": "" + } + }, + "setups": [ + { + "fileFormat": "pdf", + "output_format": "json", + "parse_method": "DeepDOC", + "preprocess": "main_content" + }, + { + "fileFormat": "spreadsheet", + "output_format": "html", + "parse_method": "DeepDOC", + "preprocess": "main_content" + }, + { + "fileFormat": "image", + "output_format": "text", + "parse_method": "ocr", + "preprocess": "main_content", + "system_prompt": "" + }, + { + "fields": [ + "from", + "to", + "cc", + "bcc", + "date", + "subject", + "body", + "attachments" + ], + "fileFormat": "email", + "output_format": "text", + "preprocess": "main_content" + }, + { + "fileFormat": "markdown", + "output_format": "json", + "preprocess": "main_content" + }, + { + "fileFormat": "text&code", + "output_format": "json", + "preprocess": "main_content" + }, + { + "fileFormat": "html", + "output_format": "json", + "preprocess": "main_content" + }, + { + "fileFormat": "doc", + "output_format": "json", + "preprocess": "main_content" + }, + { + "fileFormat": "docx", + "output_format": "json", + "preprocess": "main_content" + }, + { + "fileFormat": "slides", + "output_format": "json", + "parse_method": "DeepDOC", + "preprocess": "main_content" + } + ] + }, + "label": "Parser", + "name": "Parser_0" + }, + "dragging": false, + "id": "Parser:HipSignsRhyme", + "measured": { + "height": 57, + "width": 200 + }, + "position": { + "x": 316.99524094206413, + "y": 195.39629819663406 + }, + "selected": false, + "sourcePosition": "right", + "targetPosition": "left", + "type": "parserNode" + }, + { + "data": { + "form": { + "hierarchy": "0", + "include_heading_content": false, + "method": "group", + "outputs": { + "chunks": { + "type": "Array", + "value": [] + } + }, + "rules": [ + { + "levels": [ + { + "expression": "^#[^#]" + }, + { + "expression": "^##[^#]" + }, + { + "expression": "^###[^#]" + }, + { + "expression": "^####[^#]" + } + ] + }, + { + "levels": [ + { + "expression": "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+(\u5206?\u7f16|\u90e8\u5206)" + }, + { + "expression": "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u7ae0" + }, + { + "expression": "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u8282" + }, + { + "expression": "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u6761" + }, + { + "expression": "[\\(\uff08][\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e]+[\\)\uff09]" + } + ] + }, + { + "levels": [ + { + "expression": "\u7b2c[0-9]+\u7ae0" + }, + { + "expression": "\u7b2c[0-9]+\u8282" + }, + { + "expression": "[0-9]{1,2}[\\. \u3001]" + }, + { + "expression": "[0-9]{1,2}\\.[0-9]{1,2}($|[^a-zA-Z/%~.-])" + }, + { + "expression": "[0-9]{1,2}\\.[0-9]{1,2}\\.[0-9]{1,2}" + } + ] + }, + { + "levels": [ + { + "expression": "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u7ae0" + }, + { + "expression": "\u7b2c[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e0-9]+\u8282" + }, + { + "expression": "[\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e]+[ \u3001]" + }, + { + "expression": "[\\(\uff08][\u96f6\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e]+[\\)\uff09]" + }, + { + "expression": "[\\(\uff08][0-9]{,2}[\\)\uff09]" + } + ] + }, + { + "levels": [ + { + "expression": "PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)" + }, + { + "expression": "Chapter (I+V?|VI*|XI|IX|X)" + }, + { + "expression": "Section [0-9]+" + }, + { + "expression": "Article [0-9]+" + } + ] + } + ] + }, + "label": "TitleChunker", + "name": "Title Chunker_0" + }, + "id": "TitleChunker:BeigeSheepTravel", + "measured": { + "height": 73, + "width": 200 + }, + "position": { + "x": 616.9952409420641, + "y": 195.39629819663406 + }, + "selected": true, + "sourcePosition": "right", + "targetPosition": "left", + "type": "chunkerNode" + }, + { + "data": { + "form": { + "fields": "text", + "filename_embd_weight": 0.1, + "outputs": {}, + "search_method": [ + "embedding", + "full_text" + ] + }, + "label": "Tokenizer", + "name": "Indexer_0" + }, + "id": "Tokenizer:EveryBushesStop", + "measured": { + "height": 113, + "width": 200 + }, + "position": { + "x": 916.9952409420641, + "y": 195.39629819663406 + }, + "sourcePosition": "right", + "targetPosition": "left", + "type": "tokenizerNode" + } + ] + }, + "history": [], + "messages": [], + "path": [], + "retrieval": [], + "variables": [] + }, + "avatar": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAADAAAAAwCAYAAABXAvmHAAAACXBIWXMAABYlAAAWJQFJUiTwAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAABMaSURBVHgBbVprjF3Vdf72Puc+5s7Lnhm/x/ZgCHWAFLdVKEqogJY2ikIJEUSBhiZFIaVRKZj+KFHa1G7apBS1KlGpkh99EEUgykMiKlFpFCUIyS1pSTFCQLADHmNjjz3jec99nXP27rfW3ufOtZOxru/cc8/Zez2+9a3HHoMLfv7skjuuy/Li4wbmZn6c8t7rdWMMnHOw1kKuyWf56f9dfuSe8rO8y/fyjFyXn/7fy+/llSRJeNbyWW5pZQ2+jOzvcbiW1A67wv3FwelHp/vl7e18cOr3NrTQOcCb9/cLVQpcblIUxfrDZl0B2UffRSlVQp5z5ylXGqEnbFSg/935gsIn4bmoPC8i4T+qw//xMJ8URRZ7CqjwvvVDY5J9pcVE0J4l5Tbjoxg+COc8wtdiUW6a2AuEkZco7HrrqTAXeKp8uai0tWKsILzjq5Kk+rvhtcRENbw/nAPXixK6Yst1DvChfaUAsplYqTSzRxGFFeFsUEldbPW65b1BIYuAOLm5vGZU6XK9fkiWgpfXre69rlgqcKMXfbhB1ynkszH7qkgP6DpfnLptiiIc+xlXRryLRZwzUSgfrBE3UGGCsYKgIkwvNoJFXQ+O8nLhnd7xbh3Fcm+aprqn/B7ihEpH+NJMVM6qBxROGhv8nNrr6R97oMR4vwLBwtS4KGXyKoKJiyo+uai8yz9ZmDsHTIoBihA/Vr9yEXrBa/3wRLAvCnlAl0j5lqvysk4JOlnDYj3WqCJcbm5Oucw+9LmzDFoIbHxF9lPLCkxUEB/CIeDZB6bgPy6G7rJHa1UepZUq0BvTmkc3p7I0XrfNbQuiV7DM640hoFI3PYsHAnJBaBNgozKlia7psG7kQgnGfTylCL3ADewQAtCaCq3iVFi57n0ZxxSKgeW4YLaWYHm+wMQu4PIbEuy5po6pq6sY2lxFbUiYhJtXJOC76sW800a3VaC1lGH+WAevfx+YfsngnVcdOh1gYrMoI1okEQnB43mWhz0FlqqZwInXvZ9KSyT6qK08ldBcOYOpYgOG1SCBj8kKFcyeLDC2HfitL6b41Ts3YnCsyuclaMUTEqxpQIawTyYWr/P5nJasoz7kUB8BxnYWeN+vZ7ylzRubOPGywXNfK/Dysx5jVKQ2ZHpxkiZJdEjwgigpwSxBzyD+Hd+fUKzC2vQeEOsrZBKDueMeOy73+My/jGPylwaVZfIOnxNLwcYArfHBNCakAM3CdSKfFxSgUHgqMn0WmAti2Sbh1ETW9njij7p44Z8Mtl7MjQuD3NEDgkFCz0aCSAQRjBfzJ7tu8/38LIuLNW0P38TumlGP7P+Pcey+aphYLoKwIqCuWGbeCoUSxqgqjeqLMIKwmI15hEIrNjTKhXWohFKjC5jPm6gNLmPpdII//UAbI42UMPS9xBYBQcNW4POuGGZd+BC8gS4Fe0nF4MxJh6tur+Dvz+3C9l+s00KF4tBqLhAz2kiGRtlIc7FxCGGWR5bxwUO6vHjIqjVBkoCv8zKvoaoGSNJBenUbGmMpHjlXR23MIWtCvR1ShlcP5HmHy9DQJfbLNG5iGqd3cOJoji+9MIpbH96C9mre5yWjTlcLK3UmgszoCeFNocGcG7XVo/QhBchDHhBgm1TjAvSUsXUuMUAv1PhkVVJeVHgcnZUBHHi1hsYWmqMLJQUJYMn6gXByiYP1pJQkAcc2yXH6bY+HT2zC7g+O0OqZJI31hCWL9FQxIT0pZYQrmhlc4H7hdNqL75n+rt4RZeQVadkrxGp8Mg0wdMryvDSK9nJQIpfMzF0phvyvDAlFZn+1KfydOJx6exUPvTNAOqxrginpUwJTaBa+QAB/zF8avEmkuBjLNigCTUBiUXmGQUxKVY+YbogPUUrjqIjJi+ukkn8SVdRWGXOrA/jq0RSnT9gASGI/JUTSpBo8UP6kaQ0nj5/CAz8YwshWWi4vgj1NSaX0DuPTVEJG1LRvArVJUJbcDR/hHmsmX8a5QFUh5VRgR3ZSr/iuKufFMLoW73Oh9lRMpKM0bB33fzfFwntSQlQDCRRFTHpRifbaKq65NcHe62twmVjcoxfWQia1HN3/ZoJ7ja6uZqqQjym+lFxVtSYKG12EsqI1amWtVK1Rz3ifa5wInuV3Lx6PPtPiORQQvD6GvR9hsvwVq/obk4ZCslf703ULi0v43GMJceeDlWghNZaEaIMlwWND8Ic3onP8MnS+OwE7UESRgyAmVnaa9MI+uk6onxxClZwHWq1KVoSSRcgLwWCI0AtwsiGe4nveGcPvfstg4WQwqhBOWtJnc6GN2/+S6bnDJCG1h9zilkk0tUCRtEz97Xk8Pd3G8OvL+MgHh9Tlap+kiAFMjs9imS0QkDRQkYDshAAVtDDJoc79/nqNXmzAjo7x+Qz1O5cgpKV0K0nLB8MZG+GictYwfkkVu65sY+1cTPbhC2BluYXr/jBRo5sB1i4NWm6IbFOZ0x5AIr+5N8dH3RFcu4MiTS2jLIWz13IUR8gUL5HTc6+ZVZKUm8/R+ocM2ROb0f7HDXAzRulVY0NMVyO77RyDHaoBfQSgHtLE63WdkrYlfopsENfur6C1EAwfPZDgihtaqDQqrCgrWLyXbDTWgFmqo3LTGmrXnIVrT6DxKSaZD+/hQh0kO7h4h0ueoIBP8769m+iwFtIhMsQVThMP2ty8VdUaCBsp1/wqksk8JLm61yDUGHL1YEUtF6R65B0nqMQa8wLvSd/Hmollp2ed5fIBXPnJJTx2l0ddiEcy8dKcwyfvJKF1+fACrfpTJpbLKrDDQzDkYVTP8Muz3HAUlUlCB51o5Rqtx70Z3KgSTiMENStU1yT9DZP5zwSaLLot3joA985GZPMrsFe1lNkMS2tTSbRg1PpLvHaOBnl8mEmaMBwkfCfG0P7PWaQ3riK9lFDM2IvVath2aYHOWgzi1lqGi65OlGmKFaJqwCvmpW5xJ+vIn98KP78RrecXsPild9H+9jA6j04qBTYfYj0yxCTVIqZrxPvRceSPjCP7cYLOE23kSRN+cQEFGc4WTFJHqfFZ0uM4PdIlVbeoXB7YSj43H+Q64/x+iJ9ZelvmjfTGK+AObaMsbHZMoeX2jl9mvHZDjkGNTcXoFqkMCix9jiuPtAiZDvKVReQnV5AdGoQ7shHFu6TQt7jpm6S9I07rFzfIjZjn3fwZZEvzsFwLW1kWMJvaBkOSybForwlHI1+ch1tqo5ghxCq55kLDfZyGboLuczTGFqv5xxF+tVt4f5fB/cpRXk+QvUg4VmU/i0t+zbC8EQ8wG6YSTLSe9Jm+0QrU2WqH9q5FBWbm4Igiu0XqfRfoT8KnWEb3++eo5CotTAaiIvnyIpVaQcJEiPdzHSEFF5KTN02kt88i2dcOI5p2F4WYX5NhB8Xb9KRlXHQpzy8Q95u4z5UrVIJ7Dg+gOEuSyGSa0ozVrQmdh7R1RlxT5YXrFlH83wjSDXGKQLkGPk+B9tAb38tC+8fOyjpudsZg4sU65m+it5hDLC2bvr/A0KcyureKwU/XsPoAE15DWIV7NBh0O0KKwTDTVJUlA+PDJbEIIft4ZvmCcPT/W0XruYwC0rugTG+cRn6Ke5/LVTHJF1IBSAeNTsvp5MGteIwfaGHmY4Mx6TAWWBvVrpqhcTfpgwVd6+n2bG0O1VzqGEKCyhta1Fa5wYzIRwt1aZBkBe3DVHa8huomQmZMatY1LZ+zmSY6nO7U5hhJZ1mHfrTNzy0tVQRCtcmthNgy8qECFRF4fB5VNlF2AzSTv/tKjsqADhbYrGRJgIYMpyQ1DBKXnSxMJ5h4Cr/C9zmt31Uxuth3GaCLzBcp7006yn6av4tQWkhPI6ySWyojGUzY5mzIzKJ0MsHnzzaVJs0EKZuUbHc4rY/ABqbzkzkM3gts+HNS9LWDKN4cw+Bt/J6VsWTs4y8TahVSbMYFOq0EK7MOg6NpUEA5N9dJgF8KDbSrrdGSpL+V4ZBsBIPLRtO8tWSrLgMwJ94W6fRXKSyhVv8AP9cC3+fzC8wRG7D85RFSLK9/aE55X/KFXWshf9dj+L4aZm9hgtxtUB1vYPFuqVAp8KAQAjD72Rzj3yIdM1n+9MccJmyXYk/qbGaEk69IH+C0bExHwzApW1lCZdRg9cuXYvUru6kA+b8IjY2tMXxcNXQGjULHJfnCImG3htW/IX4fMRpi9YtZGHZYPmcM8MU5ZCcW0ZleQTrFpMkmqXN6Bu2ZU8jfY098kUXjrgbzRY5sgcw1wudG2wyNDtpvNVH/zdB9vPF8V9vMnHuS5XIMb/T40TMDSOqxq9oiKZva89UkPeYn1tB9nYsRw55VanduHt1zM+i8tSLRiIGbqsinGYz0QLG8yoXPoZhd1e5q8H4q80YHnRmuwXqrOMWAn2a9VR2Af69AdqqF7hnCrE64tXOMfmEQm57agOoV5HnXQpEwe+9rYvxpzpFuJ4ngFJ56IMGGTVYJJfVW23b86Nk67vrXeQqQYPR+BvJVzMZ7ibGBFNkiLXySw6h6Fdm79FLKYGMKrjJJOTYXw5/eSGWXsPJkUz1XmRrEwC0VrcEqmxrYfsig9b22xk5lNwP3QyPaju48MsEYypU65bNrGe17k51VjH+NuUTgIx2ddGhNBn97Ecff3IFT73SxeZsJY8t7dn/Cy3zl7HsGX32xicnLCRG2lORBrD7J4Fqp090G9RsYuCkZaI3ZsUFGkdGbVJorYT5qBya08gzkRmxnQrdhhqOznKrT6tUwi0pTZmUWKgFsE633VRiUg0YTSEEqPu3g5hW6tdEE909uRqPO/JE77Y1TmdNI1pzYZvHsQzX88dMdjlGUXTH0GVIgs7OXEqAttQvdNsDsm8vMk4tnTvldq8XOLJlEMtxGhBrRxQTldH6jVW457BWl8kJrYR9LZc0vvtRAUjQfKGaVIArOhqrDBZ55YAK+SdarlCP+MJlRHvekukPfSTB7rBYsJZVhu07XyVDT9pr28J1Z3zj0mvqbNi3ClcWMDqpCH1kJgwgbuiw5vNB3W1HYGCvUnMR2knRsyELFe3xmVntmR0Ml9N7xlzfh3/+WcbDBhi6tPLu4e/dv+3I2L8YsWN9849gSOsuxK4oNj5YVYSxRdvKhrkec8dv13tqqpcNMSEoOJ7WHvvrH7OGl4xcd+BZxkFaeIwSDWXqiubwV93CUObmHeMnXzxaw3tAEDJL2sTTfwTNfGUGN5XARRx9hcp2tN8cWsdU0ejpTCu/j8ZPTd6tDrzDxliZmVV9MHrxR3ltctxuE13lmGsUxITdYozmgm+/CfuaFyT0IBaI9/yTIlick2pny95Ex4Nt/1cUP/nmCg9jQcSHCpsQ14kymHK+XjZQJg1VVSC0kQy8bsjNC4RuGYOWYMHpOJ0s2Ch8hadjWrs7vwhcmOti2KxyIlOdz60dS8fzgvPEib9zGbuvv7l7DoSfHGPmpWlmtKzjtwcWFXjXOs1x5nGTCJFvrKHlOsRbGgWH86OKcqZxkmHh0EkoQySXV4XG8yqHBvbta2Dkl4/Vg5DDm9+cdT6nk/SePckPOVL2bDz702Sa++fvDqG8YZeYLxz/heChbh1KRx+MkH87ElP6CYOKR0u4hiONJjonHqAFwMW5yJIMkDTeJB1nrf/OODmFDGOcXHGn1/Zzngf6jVR2J88Ht2y0OkVbv2EoWeG0CtZEG+wYXBDExEJM45NVhfpxOqLfitDlu6uMYMdBuPIlh4EoZUx0awOyJSXzj1gYh08UCx/gbNzmNof7J4XnHX/Ga+fzOjx2jAlMXnpP1/yTc9PSMxIfFJ+6p4Dfu7HByF4a2RbfQDkkSlMjv4sFGSEZGPSOFn01lEhEn1aTOrF3F6TfqePmpKv7rcYf50w6bthht3vvZr1+W/jPr+PmwuXvqpocZTPdpIpEpQZ+25an6uubkjhWDRTb+W5mdJy9OcNm1wJ4rqdxmh6Ex1u41E6bIjqc8rJvaqyzaWJedOcoDErakM0cSHPsf/n7SabW7YQJ6jlYeHQHnH7/2jz4vPKLl50fNH0zeeB3d+sPeGXHfue3Pw14/A8hkrLPKcrwjR0m836WKnCSOyKUMlya8SkapsdlJGUfVmnRiYVyox6fW9M4lfCwngggmkFyk6PKwvPxrgbC/v0i/vWfq5od5NnBfUSYqH2ecfVr/vL9x6FkG0q8k8ShUYJ7G46UQJjaumejELfxuje0dbocgNnF+2r9vaPZ1vBgR0Qf1rz84/fh+BRu3O0gMH05NEvNgSfP+PDf2w6o/qGTzXIdUIQY4CeIJYhZO2TW7h3O03Mv1XGdCPl5DzOQuFnQlZNEL9aKnzPofkrjDdVQPanzKfy8t/qT94Y17/43pm1Mse3UYnOK8aD8/Ftb/aOPCv4HQQ27JpElpufL0PemdQ2sxF1mqnFgrw8bEmJh+9oFO8kIO0Cz99QHbvvPg9BPrf+zR/7N/6uap3JmD9N2VXHKfFnsuTJh7R1F97/0e6o+Tdag5hY4QRJpUNCsnNhyb2pgvUAQFbYn7+Hyyvs40vfYdDoyffXD6iRf69/p/CbMWUUVYM2EAAAAASUVORK5CYII=" +}