diff --git a/agent/canvas.py b/agent/canvas.py index 15983024b9..65303ca9e9 100644 --- a/agent/canvas.py +++ b/agent/canvas.py @@ -29,6 +29,7 @@ from typing import Any, Union, Tuple from agent.component import component_class from agent.component.base import ComponentBase +from agent.dsl_migration import normalize_chunker_dsl from api.db.services.file_service import FileService from api.db.services.llm_service import LLMBundle from api.db.services.task_service import has_canceled @@ -84,7 +85,8 @@ class Graph: self.path = [] self.components = {} self.error = "" - self.dsl = json.loads(dsl) + # Accept legacy DSL on read, but keep the in-memory canvas in the latest schema. + self.dsl = normalize_chunker_dsl(json.loads(dsl)) self._tenant_id = tenant_id self.task_id = task_id if task_id else get_uuid() self.custom_header = custom_header diff --git a/agent/dsl_migration.py b/agent/dsl_migration.py new file mode 100644 index 0000000000..6fef629376 --- /dev/null +++ b/agent/dsl_migration.py @@ -0,0 +1,177 @@ +# +# Copyright 2026 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import copy +import re + + +# Keep all legacy chunker renames in one place so the migration rule stays readable. 
+COMPONENT_RENAMES = { + "Splitter": "TokenChunker", + "HierarchicalMerger": "TitleChunker", +} + +NODE_TYPE_RENAMES = { + "splitterNode": "chunkerNode", +} + +VARIABLE_REF_PATTERN = re.compile(r"(\{+\s*)([A-Za-z0-9:_-]+)(@[A-Za-z0-9_.-]+)(\s*\}+)") + + +def normalize_chunker_dsl(dsl: dict) -> dict: + """ + Rewrite legacy chunker component names and ids into the current DSL schema. + + This is intentionally a pure migration step: + - it does not change business params + - it only rewrites structural identifiers used by the canvas/runtime + - custom human-authored names are preserved unless they are still the exact + built-in legacy operator name + """ + if not isinstance(dsl, dict): + return dsl + + normalized = copy.deepcopy(dsl) + components = normalized.get("components") + if not isinstance(components, dict): + return normalized + + component_id_map: dict[str, str] = {} + for component_id in components.keys(): + new_component_id = component_id + for old_name, new_name in COMPONENT_RENAMES.items(): + prefix = f"{old_name}:" + if component_id.startswith(prefix): + new_component_id = f"{new_name}:{component_id[len(prefix):]}" + break + component_id_map[component_id] = new_component_id + + def rewrite_variable_refs(text: str) -> str: + if text in component_id_map: + return component_id_map[text] + + def repl(match: re.Match[str]) -> str: + component_id = match.group(2) + return ( + match.group(1) + + component_id_map.get(component_id, component_id) + + match.group(3) + + match.group(4) + ) + + return VARIABLE_REF_PATTERN.sub(repl, text) + + def rewrite_value(value): + if isinstance(value, str): + return rewrite_variable_refs(value) + if isinstance(value, list): + return [rewrite_value(item) for item in value] + if isinstance(value, dict): + return {key: rewrite_value(item) for key, item in value.items()} + return value + + rewritten_components = {} + for old_component_id, component in components.items(): + new_component_id = component_id_map[old_component_id] + 
new_component = rewrite_value(component) + + if isinstance(new_component, dict): + obj = new_component.get("obj") + if isinstance(obj, dict): + component_name = obj.get("component_name") + obj["component_name"] = COMPONENT_RENAMES.get(component_name, component_name) + + if isinstance(new_component.get("downstream"), list): + new_component["downstream"] = [ + component_id_map.get(component_id, component_id) + for component_id in new_component["downstream"] + ] + if isinstance(new_component.get("upstream"), list): + new_component["upstream"] = [ + component_id_map.get(component_id, component_id) + for component_id in new_component["upstream"] + ] + + parent_id = new_component.get("parent_id") + if isinstance(parent_id, str): + new_component["parent_id"] = component_id_map.get(parent_id, parent_id) + + rewritten_components[new_component_id] = new_component + + normalized["components"] = rewritten_components + + if isinstance(normalized.get("path"), list): + normalized["path"] = [ + component_id_map.get(component_id, component_id) + for component_id in normalized["path"] + ] + + graph = normalized.get("graph") + if isinstance(graph, dict): + nodes = graph.get("nodes") + if isinstance(nodes, list): + for node in nodes: + if not isinstance(node, dict): + continue + node_id = node.get("id") + if isinstance(node_id, str): + node["id"] = component_id_map.get(node_id, node_id) + + parent_id = node.get("parentId") + if isinstance(parent_id, str): + node["parentId"] = component_id_map.get(parent_id, parent_id) + + node_type = node.get("type") + if isinstance(node_type, str): + node["type"] = NODE_TYPE_RENAMES.get(node_type, node_type) + + data = node.get("data") + if not isinstance(data, dict): + continue + + label = data.get("label") + if isinstance(label, str): + data["label"] = COMPONENT_RENAMES.get(label, label) + + name = data.get("name") + if isinstance(name, str) and name in COMPONENT_RENAMES: + data["name"] = COMPONENT_RENAMES[name] + + if "form" in data: + 
data["form"] = rewrite_value(data["form"]) + + edges = graph.get("edges") + if isinstance(edges, list): + replacements = sorted(component_id_map.items(), key=lambda item: len(item[0]), reverse=True) + for edge in edges: + if not isinstance(edge, dict): + continue + for key in ("source", "target"): + value = edge.get(key) + if isinstance(value, str): + edge[key] = component_id_map.get(value, value) + + edge_id = edge.get("id") + if isinstance(edge_id, str): + for old_component_id, new_component_id in replacements: + edge_id = edge_id.replace(old_component_id, new_component_id) + edge["id"] = edge_id + + for key in ("history", "messages", "reference"): + if key in normalized: + normalized[key] = rewrite_value(normalized[key]) + + return normalized diff --git a/agent/templates/advanced_ingestion_pipeline.json b/agent/templates/advanced_ingestion_pipeline.json index 97a4c22105..08066f0b1c 100644 --- a/agent/templates/advanced_ingestion_pipeline.json +++ b/agent/templates/advanced_ingestion_pipeline.json @@ -115,15 +115,15 @@ } }, "downstream": [ - "Splitter:KindDingosJam" + "TokenChunker:KindDingosJam" ], "upstream": [ "File" ] }, - "Splitter:KindDingosJam": { + "TokenChunker:KindDingosJam": { "obj": { - "component_name": "Splitter", + "component_name": "TokenChunker", "params": { "chunk_token_size": 512, "delimiters": [ @@ -160,7 +160,7 @@ "presence_penalty": 0.4, "prompts": [ { - "content": "Text to Summarize:\n{Splitter:KindDingosJam@chunks}", + "content": "Text to Summarize:\n{TokenChunker:KindDingosJam@chunks}", "role": "user" } ], @@ -175,7 +175,7 @@ "Extractor:TastyPointsLay" ], "upstream": [ - "Splitter:KindDingosJam" + "TokenChunker:KindDingosJam" ] }, "Extractor:TastyPointsLay": { @@ -418,11 +418,11 @@ }, "overlapped_percent": 0.2 }, - "label": "Splitter", + "label": "TokenChunker", "name": "Token Chunker" }, "dragging": false, - "id": "Splitter:KindDingosJam", + "id": "TokenChunker:KindDingosJam", "measured": { "height": 80, "width": 200 @@ -434,7 +434,7 @@ 
"selected": false, "sourcePosition": "right", "targetPosition": "left", - "type": "splitterNode" + "type": "chunkerNode" }, { "data": { @@ -448,7 +448,7 @@ "outputs": {}, "presencePenaltyEnabled": false, "presence_penalty": 0.4, - "prompts": "Text to Summarize:\n{Splitter:KindDingosJam@chunks}", + "prompts": "Text to Summarize:\n{TokenChunker:KindDingosJam@chunks}", "sys_prompt": "Act as a precise summarizer. Your task is to create a summary of the provided content that is both concise and faithful to the original.\n\nKey Instructions:\n1. Accuracy: Strictly base the summary on the information given. Do not introduce any new facts, conclusions, or interpretations that are not explicitly stated.\n2. Language: Write the summary in the same language as the source text.\n3. Objectivity: Present the key points without bias, preserving the original intent and tone of the content. Do not editorialize.\n4. Conciseness: Focus on the most important ideas, omitting minor details and fluff.", "temperature": 0.1, "temperatureEnabled": false, @@ -654,8 +654,8 @@ "data": { "isHovered": false }, - "id": "xy-edge__Splitter:KindDingosJamstart-Extractor:NineTiesSinend", - "source": "Splitter:KindDingosJam", + "id": "xy-edge__TokenChunker:KindDingosJamstart-Extractor:NineTiesSinend", + "source": "TokenChunker:KindDingosJam", "sourceHandle": "start", "target": "Extractor:NineTiesSin", "targetHandle": "end" @@ -704,7 +704,7 @@ "data": { "isHovered": false }, - "id": "xy-edge__Parser:HipSignsRhymestart-Splitter:KindDingosJamend", + "id": "xy-edge__Parser:HipSignsRhymestart-TokenChunker:KindDingosJamend", "markerEnd": "logo", "source": "Parser:HipSignsRhyme", "sourceHandle": "start", @@ -712,7 +712,7 @@ "stroke": "rgba(91, 93, 106, 1)", "strokeWidth": 1 }, - "target": "Splitter:KindDingosJam", + "target": "TokenChunker:KindDingosJam", "targetHandle": "end", "type": "buttonEdge", "zIndex": 1001 @@ -725,4 +725,4 @@ "retrieval": [] }, "avatar": 
"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAADAAAAAwCAYAAABXAvmHAAAACXBIWXMAABYlAAAWJQFJUiTwAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAABpQSURBVHgBbXoJfFTluf5zzpw5syaZZLKRjZCEBEIgLAFBLYIW0apsUnFprUu1t3q1WqWl2Nvi36X6v7aIqNV760WttVotFVBZROAiEowIYQ1bQvY9mcxk9jlzzn2+E7Cx7eSX32xneb93ed7nfb6R8A+PVWvfm3ek4exih01ZAkjFsiShu7cf40uKcM9ty5DuSUEyqSMS03i0AYfNCsOA+fofH+LceEJDXNNhtylQZRmlbh1/6gHur3Mi3KgAHkCxAZqb1/PzvYXniasN6TCcwPIpftzvTdaHNLneZZEfnz8tvXn0PaSLL9as/Zvnk7qDv7Zb5YfSUlPMzwZ8fqS4nXjgrpsxc1oJurqDpkHCVJtVMc+OxTU+SVBVC3R9ZBESDReL1HQdikWGhf/FqoavYjpu2ZuKzrMKUjOA66/fi+BgLjbvKQdSdUg0XgrJ0LmWmpIgXqwII0WXYbVznW4rBjULWgLS82FH4vGl49KHvl7AvCUPeeJWdXd2tneqsC4cjWB4OIw7br4BixZeytcxRKJxGmTASmNketIw/m6seB1PJGFVaCy/i3KR4lkcN0bVEVcN3LLPir1nHYAziTG5Cn660MDjoWswxTseLetfRDeNT9Ka7JQoXpgSwFyngR4u5C/tKXir2YGOuASFp8u8nmzI9WEd83GnNGQu4OoHXlzrjvQ95EgOYyiiY/Ilc3D/HUu4FhlDgTCN12iMBKfd+rWXv/mQ+D3M45L0usOhIoXvc91JrGxQ8OIBJ6QgF58rIz+X6ZGWQCEXMS/Hj81bHDhySoVaoOGXU/24JU2HapWx/pQdvz3kRtZkA3fNiGKsR0ePy4FlvNFW3lEJRZ5/xO18WLJv8BVrcJ7XArxjMIiaTA1/+q4XhRjCSZ8ECz2sWkV6gLksPCsxCnw/KgIa00XnAVbFAoXflzo0vDkAPPhpKsLDMiR6zghrcOQr8DCvbTQy7Ad6j/KeKQZuqvbjiaI4OkMKitN1fHtzOhodFjywPI4qScNZw4pImhV3PPU31Gyvg89hx2sbfoLTuZ750sy1H21w2K13LJxbDU9RJnY06di0XcFUFtVHy8MwkqwFhk++mNs0VNNGvCQ8H9eSNJqv+V/m1HEiYWD5p3a09togpzMHBhTcXJlATZ4VH7bo2NXBnGa6JMIGqseG8PKEEJS4bKbIVSVcwB/S4Jss4en5UVgGk+gc48Llnx7Hktt/g2S/D4mqCTDOdSAlNIiz9928Tspoe+zwOE/hVCYqdqYvw2c+F1rtUTx9xInOLTL+dmsYc3M1tIe4CAlmwXIdCMXiJvA4bCqyrXxh0/H9Qyp2HHBAUnRY0mTYmW4nFvtQVOBBPCmDdY6T24GZ53X8flYfZtPLp8My0l0WXF4kYewrbrSWyXhkbgzpEQNX5Ngx+81dkO97BeHSPCAQgTKzDIna02bdWQP+ZnlI8k09G2xCn94Mb9dDOIZe9PpUrCECFN+rY+k7TmxsVpDnNEyPJ+hxUbAOVYWXuV7miuO5Lub766nYsceBBTURTBmTgOYz8Na1YYQkG+IGC5uRDEQNVC4EGs61IyvDik5DQVmOFVVZEu7fy6jZZMy+VMMsJYGITcKvB5KQHqbxHhcwFGTYElAWTIGcYodMoNBUe7H8Yfb9KLB60BruY157sCH5Borp1c6EgsecEaT90MA9G5046Seq6LwAkUbk+oQUHV9GdWS/n4r/3JUCKTWJVfcCp664CY7vPQIXDYi9k2QNKYi2EHyGgba6YWy4cRP0tx5DTmEG8jwKEqytYWbay/tUpN5g4AZZx511brz4Y8Lq1A5I6Q6inx3JuBVJxYXg6vdopw0J2KA7PLDsedS7JkDDch1p0JjK3
dIJxNvnIZLuQr7LwLdsCWz3WvHOByp+My+BFKbPkEXHPHr71d1OMOWRyfCXVQCdxMHFhJnI8bk41piFnlWbMGNgEh5bvxq3PrwA7d1NaNryEfKWXIn078xCIhBDPov6V1/ZcchtQaJUwqGfJHDL6q+w4FiUKepDjfskOqMWdIfjCDpt6A0m0D8UhjyvEtq0cbAEV1yyJk9OQ2NvADkJD7xyDiJtDeieXY03vHZIBVZUlmg40WRB0JfEH70qHnzbCR8RQ04H0hgJm1uG0yrBn4jj021lOHogB94ZSVzPIiyak45ZN3rQuG8Q3YeAaMUw7vx4PSw9cdGqkWFJ4L5tLoRvl7BwcRcWbT2HbiLg5wTLNuUI5iQTrD8N5U8th1xTita6VoQdDsTzvLCW5kDyfPRzwykxH7uCSEt1wGF3YOn8ctz46BwceCWKTUU2fLl2LKQZdMjrOlauOYxX/rsKId1GSAWsRCVnKptfREZ4iJ1DMfBQtQ/3ZybRTIiMhqMo92ZDdzONND+amwYxFPegcmwGAsGYCbuTd6ZiWkM3rvttJ7bImxHVh6HAhWtvm457ssKo3XQKfd1DKCr2oul0P/sM6QuRMBqLQrEft2DlNVcyZRJ4/WgtBvQQDv3gHLb+tQ7lSiUq20qx5EYDv24uhDbHgq7ibDzy4+N4/L1q6F0MO+F0gKhiGMTvmgB+VxRjsSpoVW0YbO6Bt0DFF0fPIcTWyazG9o1voLLIiyn//lOkZ6o4OSiZaPad33bjbcvrSEtmwIlUsO8jSb610e+AlcFqjBL9+D/gYFNk3QRZxIXeDEhF33/MsFmt8BY5cOpgHyZPz0Lw/xH/mefWpIsXiiLPUoopzho8Wz8BEx5owdiPP8CpFfPRevUUGGxYxUVhvFoVRB7RppmYnpvGhkXy9qvCKCY5PVjNJjkYjrHhGTh1thHDwTDSvUlseZpXrynHBl82ap5+DD6oGKFyIyRHMzTE+GxnylxT6kSsL8jGGMOgnkRVSQYOikabaVyxJnYuAa4BatgC92YZMWKeypgnmYsJXjZuWCDH+6H4y9BcxoLddwbNegqMRWPxuyl9eLKIeRogJ7JbUEyvOtIlHHghgjMth9HRH0ORJQvFC9jBeZPSwmx4Ur2oGJePk5t0dMlWHPBKyPnkA/rc8002yz9WGoo9Ko62RxCmA149+RBO1XVje/0QnIK5FmWlID/HiZtn1eDfLr8UA4NhhojECXb63ocgS4pNHg1oQkFtN8IZNvi5rJv+fwWOVvZjgqygWVNQUeBAVqqKYIT4HDfIThV4b2nHwgdd6Not6tUwO3g4rsPFBf7urtM49VYBblu3HfrsFMSduZDkf+ZZ4pMgOdbciR4U5KXiscV/Ri9Z8fRxbuTSFjl+mkStXcKZXV04Wd8Fm2I1Twyjj/4ooVfCXMZZpEte1LsPIYdN5Y3+H2BFiR1nhi1sSAoK2JSS7LqC6NlpeJzEbepPZfQ9uwgnD7ehYEM7GaRi8idFFD69/smeZuRW7EV1/A+oWDmMw2+tJHU+byLT6AedjPPDUaSRTlfl2ZGd60ax14bLJzOFWvyQU6fZ4Kl2oMfhR2NLDxuG/nUADTaLbFQjKg/DafSi64dzUbPTh0/PB9nQgLJcG/m+4EPfJHZiDpAJs98/SFq8qBoWLYS/7T0Kj0NBU6cPv9+wA6+fnI87Dl2OprW/QM0Xh1H5bgx733seDr0XVsZ99ELSGeW36zqwaEk5Vqd2476bxqHuBLODjECed1UZ8qalIPNKF/oOhtk55Qs1JJlRUNkXZuhX48Tcy2Fz5eLKQ61YXJMNN9v+cDhhelW+cLPRw06UYS/JzkRH5SmMzy9FiTsTqdIGwm8vfrBiATraEwgO6Ch44DtocJ3H8ne7seRXPtR+/CLOLfs2VKKDHf1cRwK8CSJkwm8f6IO1bheije147VSfuTApU3vEuNVyGU5UnUbfSTJA48LKJUHadLj1NhxfvBShe6/HXdftRt5VY
7BgVRHSqgmLMdIT5rUwXFBsk3aPGi8FSw0Sfc4NdbPGKhFiQ/JHIugNMJWUkftk5jqw9nsH8Mn7TaiUilCt52HjwjQcW5KDaFcL0vbVIauhEakDPvTFA7hh9mTsZjNzWKKIkD1IM431RtVfM/D58mNwy5zdiOdWI2SGsXN8FZrW3I7yWhWLX6wl9FWjgxeyEutXtZShuzlqzgEq8ymRTJp2jx4tTQOzHfj9bUew//QRyGQHcy6dittemIBYIGlGWbBKEcBrx76BchSQqqSC4wMuMUpQ6+lEYOpUtE60wUeAiitJJDkZJsmzxFEKGbJ0HVYbHYRLq+pgi07BUEkheucQ36uqUHxWxtWvNmBMzIZuwuaA0YMMHuctt+LOI2Mw3Jo0511hhMh/kU6JC/OBIH1iIYY9iUidDR8938aRUMbilUWwTYoh5E+aBTox34rjtRrWL2xDZtwN0e6OkxPzTKTQzBKpAL0Gew9yze/6yZbHgwiII3yeBCnnl1EjwanFzYxTWTsZrSFUfOVDdiuRg3KBx12ChqE2/PzDcTj+YQh2zor7/urDI+fGYqAt8U+wJ+pBzM6xWMKcn+NkiFGOlenVRCEW+9CxJGyTE5iSLuMo8eKe92KwPOXD4s4YfBYbmycHd/51op3mGkhnX7bT2yKmCvNC4V8cMbY8FSFmibL4yb0MnVjRUZRiItq4wgA7QEraRGJ6GAeHGvHS0SlYf3szrr03C80HImaOs3xN2eTiaGkaL1CINSGmNjdnhQEySG2QPaGIlDnA/sBDx5TyPH5/w5c27KrnkLR3APptaXCeSWDapmZ8yX4jC5VD0GVzCRYzGuI1k4emjzw0WiDmRHmIB+xj0KJcZx1D18nG1YsOdr0o2uJ+/PvaEjx73XmcqA/RbgkOj0gZySz2KCWVv6OQYaoRuIBCYlkC0VQy1bhFo9/Y/jnkvxxg09vqwd5aO6oK6QDmkdKroXaGG/VpjbAzClHT0ISpaoRptmSKJ9IFbJRGvafTwgwDfQM2aq7Hwj/FDBUbJ8aVOdCwPYTGthApggPj5zux+91BOEU3oreFWCUsDQpP0/MONjFxU7Eokf8p/N5iY1NLZbpoEopqXVj3uZNDvYxXfgH88DYV117lghZiLvUm0VGQj5ARwKKnZ2Lidfno1wPkQpwDpJhp2T8+xGirSJDwrx6m1jNInJ5iw8tPVCKFMshrS9vRTaqweEcuwj1itBxhoS6miyheIXJZhfe5KGISShw6WuwSLvncgTOH6ZhsGSXTOD/k0WHEcC+P6ajI5BjHmdSvY/ZcD258exlpdhKli/Ix9cdlOPzCOZzZ3wl/MMQ6tV2Igrg+0c/CPnA9NhkXFyFWJLKu1xjEPMcVGIqGGBkDk9noPjzUi6t/lo0Vz+TAoPGBoGYqExdFLhMSzTSicEWYczok3H6ESsRBzq9chKpoSC9QwKEKzowkjyPVOCBhiJmZVqnj0axuVNoVtPclUZ5vQ3GWii6fhpCdvYRjXx+7f6A5QqeyWTIY6cVOeCvckObjXUOsShhup/eyyzyoeqIQyivZOLmzWWQisq9zYMWfc0EGjH5OUjamhlDhkqPw3uzAdEBJmoEnTgNP1nNWjDHVqEaIhidIvI1YnpnD+aGfw0gfCSMHoQerfPiRN4oGvwrO/KguspOecHJLGOawYyqYrKVwVDNLN4MKhlPIG5ooa6bQt56dYobdzVV7Sh3IyLIhjfJf58c6jhVacPO2fEwe74CvLQYhRrpcKuJMlRhvoCojApcwfgKnr61UImZudNFTMmxMlxhh58qxUTwzmwoGi/n1Yxqe2MiDOfteM2MYz+SH2ZnJj+J25PN4j1OB0Iy1xIhjBC1TSKHzHVGcDFqwpd2NfZ2Umig4iGwZY+eIdKyT1FEgCQ9WWRE6T0gytweZv1lkfUY3wxhLmjThog5qNi1CYYzHlXHK72ESLieqHKdEaEljGtJbBnP84aoYnq1JwEh1E7Fgpk/b/xr4jCPjJ
G8YXUkVOakKHSYLh5p1JKIqHCPR++XuBN46o+CRfS4MM5UmjU9i0hgdXgpoYL/oZPpJx1oCxr9sRppmGijkc/F+ND0QofNaDAgR+646Ge+fcZAuizpiuognRkgllvU9GEDvgA1jGdWk8Cb5vtQq4fsfb8Uvr/42LGrMLPbR1xb3kpMaXKQNc9/zoJmOu3dxHDO9GvqpXCsJlZRHxnnlPB7ISGHHp5pwMWQXPSzQRKjLLtLf6IXXF3Oeh6OMUvjzbTJW/8WFhJ+5nMlruBj7OMycB8+z+UP4U4UfNx/Ox6dPtcOVA+x4fytecX6K6t9/GwX0YFsS/6Ryq9TWA5TUJ77sgftaA+unh6EFmZIh6kPsUHusn+GAvh9+TojP9boJENm6iUFi5QkiiMBzq3IBz/WRwhbfiw2N8S4dZMEYs9mFlW+loCpfxrXTOWnRcGmY1SZykp6WeUyYw/qfz32G8DoZ3/vNtzD3nkKo5D3xD/14UrsaLbGQyWJHlO2RexucdQ3ee8Z/pcK+Avjt5Ch6KaA7dRXjMo7gV/pqbA9vhT8axFrPXVTqXVA8Fhf6SVOFsaL9C4cYo3ZbNH5QSHoYYAQu+0zBoWa7qVDPuwm4ezoEY0fXm0B9gJ2XvCdJplE0No71i8JoMi6B9CMdO+dvw+ZX92HH9l14+Z3bYItnQFFls5uHo4kRR9lVjKeE7n2eMuL1wC/zoxhmPY7h+CkA44qu9XBxtr4urRLjLBl4L3QM2bZUyL3BYXqZ0MQLCCogeIyZ8zQ+lXlelpYkcaNq/GoKDp9yIsPLIbuGfYD6D/VXxHqBoy30JKVDGwXe564YwI5Jg7D20aD7ktjbcI4ClQPfeeYW7Gx/E2cpMyoWMlECBhVUjpcy8rLTkGON4e49doTzJCybGoeXmyJvDjnx6Os2vLKqj2GPI6xH0WMJ4LNEE/bHz6KHEpCcGFBGOhhG5llhuMa8r6TWudknI+PtFGxg+1c8pAY5TC86SKXuE2K3fJieX7WOZ2YpuGuWD4em92MyC7U1acfYigQO/9yJaYW5rCUdZ042o7ahH++8/gK6mo/CGiIz/UxFSUkaXn3rA+6TqXj7ICncVUBrixUPrpZQPO0MfnHfOUzs2wmpIw/VkQLsOd6GhrZeKM0uxBq5hWVIf0+XBEM5gchykJJ3wRYVPQHCouCGYsuMXTcRIUUmnW5vpcOJCMLrc+aE8dxYdghi8/mEHTkZXDR3Yg48qeLIuz5om3PxH8FUaBMNKhIJfLBlC8mTFfeWNuNSDuk/3fI7Ktyp+MvVN7ILMqKUhrxzmvBMj4H98nnsJa1umd6CiS2pONLUSXt0HPjuo7j5i//BALfBFJMCM8/zGLKYVceCOjv2HaBs6L3AOTJYkUNWEx416jq+RjJQyihjJibwwmV+yioGWmi4h1he6BQbCEQMbkXt23kEamEAp9qA5p1lyLmSewbkLiX5OZDZSGvWbcD/7KpD585z+PPuP2LpLjpyqYSfzehEb28r3pe/ouTfhstnVeHNX6zEmAdXIjs7w9xI+4/tW6B1C4mf7lWIueVMlzVNlEdeS8MXX9oxs5jNjNK5JISyAGUWAZGi0XBbSM2y4Kn5/dhbMQQHi3aQQ09JNpU9t2Jyd4EsQvMsySvGtHUBVJRmITZ0oX8YukkTztS24j/XrYOzK4ocdxzlkypxltPsQvaNjo79qMVmRPQgJ7IMHKfUs9S7DvPHjIdxkvXJ/9NHu9C+w4/gnjjkfWKHhHj+0ud2uFhA/02au/xOjn4kWGK7U3RoKT7CZ26dHkDdrE7MYkdtpkZflM1OShlRNCnNbEaGOQvEWTtX/UHBl1QX7HcdRtEy7o/R8jSStf1H+/GjSw/hBstP0HRiK55b/xKG+ukcyjNFGw/jS8s+uIx0XCSYGvlEZIhOWR3EtPQsyMcJAueJmA0k/sf4+u63U5qt+VJxxVReJCuBADfUCnji2SG6ilOULnB5SgTrCqgYk0G2ay54WRMuu
2ymSyJpjOLnI3tmoiGl6k7cevY8yjOrcYm6ET/7YxlSCmUTScIM5xVZK/DSsXvgirhxXecPIMXeQKO+By49/Rswbi5E9BaPBf0vhzDzllxMKivEno5TcFvVesVRkNzkyVV+IkhUJrvjs7U6Al9Q80lIyGJRra3oxTQCVSOrKyvFwp12i8lbBHbHEyO7loK7CHlFYyMa2Ue2jOhCtgJ8susM6mI3mtKiws9bOwewcg+lwstySBDjGFIG8Xn7u6QEb1B55qRHbDb0b1Ibk534NRa3E9GjGq65uwrbXjrGvTjUc5vVmOcwtN0qJcIhzhUxcjvVbeDn1PhvdUVNpuimipqVajEvlfwH3iI6t6AbKhudoNnJURERFEQcv7/+FL63aDKvP4wz56MoLM9GhPsGpnzL1v1vPXciev9rOLYsgJmrHkdQLoa5rzvqId65OE/k3ZiK3tNUClNYoBZlnBy9U9ozzAzpOUJ2SUK+rGYYX13Ri/lEpTbJhdx0i0mFdWO08SM7lYJWmx3cqTKb5Au/n8DX+SuaYkGREzvuDmKy9Q8oz3sHNksKIThi2id2Z8Ult036K6bU7Yc7twgNt9xBMa2ZbFb+xgLE2O1rj0LmODrru2Ox8EfV67a983DzyJ3WGp4pxbHd60p93B6W0UMmmUua66aOw5T+ht4jSF3swigpNvsukrCLNZC4MNgLddrQRwZ9uTEFDccHMGacE1nc8FDGJhALj6gXIv3yily4e9w2TGhOwTu/uQztjnbMeGQtWSmdKmVAzHsmxxELtmlYEJ5Z/xw2zIf0wdDIpLz98ei2VXe/2685HKpLnV1Itdlikb5OB7FKcSONixBeFosQ84HxL351oFgsI9HgABClgYl+FTEjgcwyG7jjijANj3Hj0MUFWq0jURO/kehv82M3N1iu3TmAzNY01K5dgfayfDhaW5Hq76RLIxQamK5aYl16S8qd9Uuf/fuPPUY/Gs77ijlGriHAVPPrqRc/F+giclrkemykii8M8MY3yk2jxDgi8BI1hknUQkIHtZh7ZyBNFlGh0g7VzlqjRxUzikRAwuVNZX/BGEqLU6XJXHwz9IKpqFuYi/PjYs26v2dT+rFzH7RvW7BntL3/B41Ezp+M4ooqAAAAAElFTkSuQmCC" -} \ No newline at end of file +} diff --git a/agent/templates/chunk_summary.json b/agent/templates/chunk_summary.json index c945dee2eb..c3c17ade44 100644 --- a/agent/templates/chunk_summary.json +++ b/agent/templates/chunk_summary.json @@ -115,15 +115,15 @@ } }, "downstream": [ - "Splitter:LateExpertsFeel" + "TokenChunker:LateExpertsFeel" ], "upstream": [ "File" ] }, - "Splitter:LateExpertsFeel": { + "TokenChunker:LateExpertsFeel": { "obj": { - "component_name": "Splitter", + "component_name": "TokenChunker", "params": { "chunk_token_size": 512, "delimiters": [ @@ -183,7 +183,7 @@ "presence_penalty": 0.4, "prompts": [ { - "content": "Text to Summarize:\n\n\n{Splitter:LateExpertsFeel@chunks}", + "content": "Text to Summarize:\n\n\n{TokenChunker:LateExpertsFeel@chunks}", "role": "user" } ], @@ -198,7 +198,7 @@ 
"Tokenizer:EightRocketsAppear" ], "upstream": [ - "Splitter:LateExpertsFeel" + "TokenChunker:LateExpertsFeel" ] } }, @@ -322,11 +322,11 @@ }, "overlapped_percent": 0 }, - "label": "Splitter", - "name": "Token Splitter" + "label": "TokenChunker", + "name": "Token Chunker" }, "dragging": false, - "id": "Splitter:LateExpertsFeel", + "id": "TokenChunker:LateExpertsFeel", "measured": { "height": 80, "width": 200 @@ -338,7 +338,7 @@ "selected": false, "sourcePosition": "right", "targetPosition": "left", - "type": "splitterNode" + "type": "chunkerNode" }, { "data": { @@ -386,7 +386,7 @@ }, "presencePenaltyEnabled": false, "presence_penalty": 0.4, - "prompts": "Text to Summarize:\n\n\n{Splitter:LateExpertsFeel@chunks}", + "prompts": "Text to Summarize:\n\n\n{TokenChunker:LateExpertsFeel@chunks}", "sys_prompt": "Act as a precise summarizer. Your task is to create a summary of the provided content that is both concise and faithful to the original.\n\nKey Instructions:\n1. Accuracy: Strictly base the summary on the information given. Do not introduce any new facts, conclusions, or interpretations that are not explicitly stated.\n2. Language: Write the summary in the same language as the source text.\n3. Objectivity: Present the key points without bias, preserving the original intent and tone of the content. Do not editorialize.\n4. 
Conciseness: Focus on the most important ideas, omitting minor details and fluff.", "temperature": 0.1, "temperatureEnabled": false, @@ -451,18 +451,18 @@ "data": { "isHovered": false }, - "id": "xy-edge__Parser:HipSignsRhymestart-Splitter:LateExpertsFeelend", + "id": "xy-edge__Parser:HipSignsRhymestart-TokenChunker:LateExpertsFeelend", "source": "Parser:HipSignsRhyme", "sourceHandle": "start", - "target": "Splitter:LateExpertsFeel", + "target": "TokenChunker:LateExpertsFeel", "targetHandle": "end" }, { "data": { "isHovered": false }, - "id": "xy-edge__Splitter:LateExpertsFeelstart-Extractor:YummyGhostsTypeend", - "source": "Splitter:LateExpertsFeel", + "id": "xy-edge__TokenChunker:LateExpertsFeelstart-Extractor:YummyGhostsTypeend", + "source": "TokenChunker:LateExpertsFeel", "sourceHandle": "start", "target": "Extractor:YummyGhostsType", "targetHandle": "end" @@ -492,4 +492,4 @@ "retrieval": [] }, "avatar": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAADAAAAAwCAYAAABXAvmHAAAACXBIWXMAABYlAAAWJQFJUiTwAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAA7ESURBVHgBvVpLrF1lFV7/v/c+j3t729tSClZCigXFF/JQE21CaKITHcAEica5UwcmDsW5A4mJTIwzB5iQiMTIwAhGQzQSrKJIomADWkpLuX3cx3ns/f9+31rrP4/b2xqNsJtzzzn7/I/1r/Wtbz12g+y6Nq7kB1KQB0OSh3KQY/IeXV0QyXn5Xox4JZEQ5BR+OhUr+fbBYTi9OCaUDxs5r8umfAvjvy7v8pX9lfRvuP5g/Fznpe/fjVMc5GC4KGU2hU9b8izWu1vepUsFVi1nCD4XOha5YPLFw+S88Bkfe9nWcKFPhVZO8hA2H5p/t4RvQ5YJ3qfYfdpl6Vz9lQsfY1aoVPhWQdJQXrgfgonMt2mYmwGf7k4NZOZvGzv5WGrlH/J/uqjlhB06mWOaugSOZ9qm0LxbJ8LILvqAyohBwXCv62Q4QU42M0L0uGAlDDtZp85O8r9cXIp7Usu0c2vAVqEqOuCS0PauQuOUHDNeBEV2APl8npFTGgg/4RedE0wZZU2Rh+r/FjozocWwnJLdzS4kNVfPhM66ZcX9k1mHcGqp2jAXRaG0hy9z6YkeIsjU77X8PhfmwXDhym7yuvbVFngk8yjOrKqwDA+auUiTjGlaP2hKjulg88QhwbkhOjtxUVi0wEatgVfNw7uV6B88FMfX1xO44Jnazp1tqJMqM3mRMwY6XFDOznhvk2mc83QRh0oVDehVsIl1DDNqad25GzEnnkB0Qo2bFAYjE9EiAV86P3x9TaFJd50oBExos9lM08BF5OBg9yn0GBNVyXl2MmcZ+175IdxlwEpirOTxgHPH2SxF8fqYMB4lxeYA0jcucKuWCdIrFpgJzYmFOajRmSMGv5cxwTWSDZedUmPRpMPJfUEVTh/wzx1kmXTcJ8FJo2qRXpI786cpBnBfKivAnBvjIOcuiawNsxwAfvfvMxK
gEinrlMp4Ez7Qdtmpzk0splmFqdj37EFzUhwyzSNqED9oMCwbv5vGqWGu35JFqCAeBHDsctZgRX+qYKVhL8qwMcEmrej4BlZeGwR55WySmw9EaXF/fdXSCyqP64Uzl7N6ZMGllM2doAn9tgjtjut0rRYhPConlMoRxbHTZBrvDFMQymiQkyk89xjA/isQsAWwt6dZrVpR+zkBMkEPOMXmB/cF+dvZVo6s1Wq1tZ6xhir+LVhAFU8BHD6dzCGVHB5pJrQf0IWecb36gEVbHjLhz1QZK+piWcwCBODKIOohtzCAAtKzAjRGeDYYvn8l43BRxnCsGPieZf8wyN/fyrK+EmTYFxlWJqPSaHQtU/DWBciO9Rk8qmWoRJMZZnXtQmsJuMZffI8KxZD8EHj1ENkq3NuBSSdtmlmRizbQCK3Zg1aGyNzWIewTL07lCx9vIHSS22+MOjbjkJvbQf3q0JpRaz0N5ojZo2NwKFWqbjhtMFyQ3wMpjfQILE5oHTofnZH01kXzF2iMTjrGepzbRIvQV8aGfVULxkQIUymMIiABrQNKPz7Vyi0Hgvz0pSSPP9LI9389kc1JlHtviXJxBMvhoJex9irGXt6GVWCNOHWHyTErLGoKDS1gXekVGqSFIPgOKG0TPDfiAfDaxOm326xQ6/C+Bc+6MqElgzrwCL9f3AFUxjZeU2PgftAkObwSsb7IX8508qMXkzz6zES+fF+Uc5tBHr6vkm8+PZGvneiZM6s8QXbAIO87FOXtzaQ+SxoO/7oECMU5vosvkFrbzpyxhTdmwiOb8MRe7TzO74wZg8ZwrTRJ1oG26T4NhcaC+oJW1gdZXt/I8uQfsjxyT5ZfvRp1j2+cFPnOLzusGeTkHUHuOhrV/146k+SeW4OMp2J0jc37sNiFTdDrwJ24MshajpONPaaOU0bD7Za0lzRokBm2W7NI7XzPg6gfuNNTIfztMCjvZVDg8SMRB8ny8psQ/I+dfOneCrSZ5Zk/Z/nM8SB3Hgny+9eT3H+8lgNDUQ1fHInccRj+0mlKKk2TFYK09hBauQQk7Ou5EzPkt0p55OVkQQpCjTtSpGl/GxQz4UIVgw1wPqUf2EGUofBnMAAt4pSn32nl+OFavvdcJ1/5dCUHEYieBExuWs9y4rZKD0EK/eJHKnl7K8udNxnUpmQprH3DEEFsC76B9wGEpUWraIqhr049dqyCTsMbF3NOHr5png4LVDgQnW6nNUgQf8UPxq79XmW/E8cNPlMbv3illRMfrORpCPj5D1WyD3T3879iLfjO7UdQ+W1GOXulk4fvqbBmVM4/OEhyaWRr0KEHdVKokhTqaL5I4Sk0uWjcJSWKXmMyRI2oyQISMbqGTQErOKOnD/i3A23TpJt4TaeEW5bHf9NCU9A4TYv5F7aYo1Tywmut3H97lB/8tlPtnTgGwXeyvH9/LZ+7MwD3lfpQhMnWeknxf8MKIiys11fhbde+sxSVtQViGIGNyHyHVk14yaZcdWLNF8k+oKERoPLaBct+Jh5NYwkIGHMzaO4izP4aHPED6wFMZEU3zUtn/uHvRD55q6gVD8IHju4PeIE5trMqatgPEDwr1GgB8nIuuVYMnucUMoBFIOwQSqIfMq4wcvdqMpwF3VppUgwipLoLW3AQaGzQD1ra0bRqKqw8BPU8+rOJfPVTNWjQNEPqHSUmeVExedfRTj57rDIYtBauzm1mzfNvQDI2AButNpaY55IAegozcXqO0QIX/Y1WzErLIIhoySJnU9kMospCvLU98fQWwrz6TlDWWEVE7DrDPVOEd7aSvHIOGoQAq4Dah4HrTXDzPqQGhNwAsIg134N2GTYn5hs0eZ9FCBSxqgLNk0diewxNEwfRqznmpoR0v7YYwFgwmVoKvor1eI/s2JZoTmpidOWiByD0QfAr04pLgMpl4h4njZhNS3wUjPH86SQfOwptwoOfeDHIC/9kbeAlJg46gWM1cDQ0oaSPoNWIpQn7Gk31VPxRSyrke+U
pylSjfQU/GEIh+wCVqjZmZPQnjRrsklVtXguoBUYTOhPZB45TWyHx/GnRJGp9aFYZtRY5iatDQ4T2cVJTHx6SsbI6OrVNndRaxFgE56H7zDobsySxramEBp7O0xbs2US1UMhWsDBDoJxkoaaxd0tWli87ALxmMuUilqjxhOe3wefnLUkb9ixl6HtlRV9IweJBla0HUkdbmokZWyHcsIZCmrrS36ggK947nUu/6yGvYDwoiSHxT1ailAo7T2lK8a81BKuyMK9F+Fk5OcXCBpaFruOX1T5xbKXeIEaLgtgwQaDR2MpKmrUO3udhskc/8Hq5Bga0WM8UxKqgCG5cqRvVNq9xZ8lhciZb7RncyIrZy0zN6MmGwQorjQmV5VvhzJWUmTWOJ97PsXRDg08PePzT2VqmgBgZJCEyM73vBbeA5u8mfPQqnywxxMGowdJXUCjBEk1txQ+zA8KzbS0BIy0OqG2vu5OIt2vs8FY2m2IZGzguutK0IgteZe8A1wxmXOfyyALUDjR05lJE8gTsr4imsBq6xTJVa51YK3CtD+1WSe+TRusqanSNtWGbTDL29gq1TMg2XvSr0F4IKaSy5UC+lflCtFqZjFV6p15SWnlIjTC7JMan2WBCLW/AGm9dtiKc3EvaLO3BFZicGSb5nRilX/Qb1rPWfbDE0DoWTAnUqRurLcjtTL1zMkuVJpl17uxgjacq4jnZ7i5WeGPD0ungNe32xIJOD0dmXLCSLyg1nt+MyiRUEdMIpgIkT0KiD4ekcApBKZHUDqB5TmN1MAOQp10e5oMmaFrGeuOrroOTQNY0IKUou6/sVgpvMJVIppHoPLU1TnoAb/nA9Eyu6EmtLsy4q/UtoNFnzq+sJNpHmpQEkGZvsrHMQt+wYDvxA6yl1SBhUxmZKJWWdku+Sm47fLBUhaShNMpNK2cF1q1TBJkdUKvi14v3WZhXdgE9At8hF/orbROvuGp7N3osjQHzC7p7l60c5Td17Eo08nMf7cZdJTTGBzMt02iSju6t2aqnrKygqsoaVnSuDhuxqGFgI9zryoOJrh8N297vYUrATLLBAGK2OGSbC6azYpQsxnqD1Nproq9nvSIVPO+CSJo/xWnckSmAte/5YzQLaCLXWYJFrq1cCaOJZaRNLF0E6zp0zF10QUuqepW3Bl1wyd7ZyFb0U4NkLiqjV3vwCdZI2I2S4guWG9EXSztx7/G1ndYgwbqTuFYqIz8jNYidWad04gLpr2f4Y2HdLbXZDc9aP8/KSz9kP+jx1RKq7WWotLlQpzk6g6MGrmQN3flTgeVrlo3ytJ22+mTeTpH584Au2XMsNpo0Z59p23GarPXVOpOQMpva1o3XwrZr2xBmlKkw0edoMiturnUZU5XFskdJb8gWK+SFwdl0aB3l4Bsl8a5bVGyTAntaY2T1YuX0XXZvHV6aHmDTQaFfZsX5aussabz0YLOlF8vtdYb8yHTYqKfNngGGmbx6aeuwk9nTRgrL4EUGC154e2zaU9sxWN6kTJVZhwdL4iRcnW7u0jgX0v2jDa8h3Gnse0w3ScbZtQcjzd7zvOViEHUiFndiL/ipDuaYeUHogjCmJNnX7lU2j1rpsiy05pevsr+25sXK2tYF96Yh752KiDVPLU7M2R5vUpuWf2Q1m2nfn3kh8Rr0LM+3p4lBhV+8Zm31TtTZ2W1mJquMkgqMrhbas2lVYs8F1w65a1wfkiQpD0lOhfNX8gNoKz6b93AyfUiXwsyRzdt9s3S11mba9uc6zBybELz+Ddd9Mk/6pHC1zIkj+XMz1/bV8rVyW7xxLTxXpfBYWTcsPFBOnqN0IdtiIc8ebuzWXOnoaS2rPZ6gwrcOE5uyNxPxGrjw3Id9Yr4rhNLewuPWY3hSf1pX3NjI67mRZ6f+yLUcIl+HDWxzOxApsCpPZoLxf8rlicJe8+ZPcbxKVaFdMBU4XMeZcbJTcbrwXw34ATX1SYj72NK4kJcssiS4czSDDjVeOzW2M9qM19r
cfMsFTQvCc6uC72td1HwR3pdbvt7cyceqLj2Knz6Br2qRRUvMtBesn6S4l3xd7i6CK6MkrVPcYa3bEfN/mJrlNPTyFIb9hJBf/O3fQ3B6D7564aoAAAAASUVORK5CYII=" -} \ No newline at end of file +} diff --git a/agent/templates/title_chunker.json b/agent/templates/title_chunker.json index db7f731144..42f3c1a581 100644 --- a/agent/templates/title_chunker.json +++ b/agent/templates/title_chunker.json @@ -74,7 +74,7 @@ } }, "downstream": [ - "HierarchicalMerger:BusyPoetsSearch" + "TitleChunker:BusyPoetsSearch" ], "upstream": [ "File" @@ -95,12 +95,12 @@ }, "downstream": [], "upstream": [ - "HierarchicalMerger:BusyPoetsSearch" + "TitleChunker:BusyPoetsSearch" ] }, - "HierarchicalMerger:BusyPoetsSearch": { + "TitleChunker:BusyPoetsSearch": { "obj": { - "component_name": "HierarchicalMerger", + "component_name": "TitleChunker", "params": { "hierarchy": 3, "levels": [ @@ -279,11 +279,11 @@ } } }, - "label": "HierarchicalMerger", + "label": "TitleChunker", "name": "Title Chunker" }, "dragging": false, - "id": "HierarchicalMerger:BusyPoetsSearch", + "id": "TitleChunker:BusyPoetsSearch", "measured": { "height": 80, "width": 200 @@ -295,7 +295,7 @@ "selected": false, "sourcePosition": "right", "targetPosition": "left", - "type": "splitterNode" + "type": "chunkerNode" }, { "data": { @@ -334,10 +334,10 @@ "targetHandle": "end" }, { - "id": "xy-edge__Parser:HipSignsRhymestart-HierarchicalMerger:BusyPoetsSearchend", + "id": "xy-edge__Parser:HipSignsRhymestart-TitleChunker:BusyPoetsSearchend", "source": "Parser:HipSignsRhyme", "sourceHandle": "start", - "target": "HierarchicalMerger:BusyPoetsSearch", + "target": "TitleChunker:BusyPoetsSearch", "targetHandle": "end", "data": { "isHovered": false @@ -347,9 +347,9 @@ "data": { "isHovered": false }, - "id": "xy-edge__HierarchicalMerger:BusyPoetsSearchstart-Tokenizer:NeatRadiosEndend", + "id": "xy-edge__TitleChunker:BusyPoetsSearchstart-Tokenizer:NeatRadiosEndend", "markerEnd": "logo", - "source": "HierarchicalMerger:BusyPoetsSearch", + "source": 
"TitleChunker:BusyPoetsSearch", "sourceHandle": "start", "style": { "stroke": "rgba(91, 93, 106, 1)", @@ -368,4 +368,4 @@ "retrieval": [] }, "avatar": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAADAAAAAwCAYAAABXAvmHAAAACXBIWXMAABYlAAAWJQFJUiTwAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAABMaSURBVHgBbVprjF3Vdf72Puc+5s7Lnhm/x/ZgCHWAFLdVKEqogJY2ikIJEUSBhiZFIaVRKZj+KFHa1G7apBS1KlGpkh99EEUgykMiKlFpFCUIyS1pSTFCQLADHmNjjz3jec99nXP27rfW3ufOtZOxru/cc8/Zez2+9a3HHoMLfv7skjuuy/Li4wbmZn6c8t7rdWMMnHOw1kKuyWf56f9dfuSe8rO8y/fyjFyXn/7fy+/llSRJeNbyWW5pZQ2+jOzvcbiW1A67wv3FwelHp/vl7e18cOr3NrTQOcCb9/cLVQpcblIUxfrDZl0B2UffRSlVQp5z5ylXGqEnbFSg/935gsIn4bmoPC8i4T+qw//xMJ8URRZ7CqjwvvVDY5J9pcVE0J4l5Tbjoxg+COc8wtdiUW6a2AuEkZco7HrrqTAXeKp8uai0tWKsILzjq5Kk+rvhtcRENbw/nAPXixK6Yst1DvChfaUAsplYqTSzRxGFFeFsUEldbPW65b1BIYuAOLm5vGZU6XK9fkiWgpfXre69rlgqcKMXfbhB1ynkszH7qkgP6DpfnLptiiIc+xlXRryLRZwzUSgfrBE3UGGCsYKgIkwvNoJFXQ+O8nLhnd7xbh3Fcm+aprqn/B7ihEpH+NJMVM6qBxROGhv8nNrr6R97oMR4vwLBwtS4KGXyKoKJiyo+uai8yz9ZmDsHTIoBihA/Vr9yEXrBa/3wRLAvCnlAl0j5lqvysk4JOlnDYj3WqCJcbm5Oucw+9LmzDFoIbHxF9lPLCkxUEB/CIeDZB6bgPy6G7rJHa1UepZUq0BvTmkc3p7I0XrfNbQuiV7DM640hoFI3PYsHAnJBaBNgozKlia7psG7kQgnGfTylCL3ADewQAtCaCq3iVFi57n0ZxxSKgeW4YLaWYHm+wMQu4PIbEuy5po6pq6sY2lxFbUiYhJtXJOC76sW800a3VaC1lGH+WAevfx+YfsngnVcdOh1gYrMoI1okEQnB43mWhz0FlqqZwInXvZ9KSyT6qK08ldBcOYOpYgOG1SCBj8kKFcyeLDC2HfitL6b41Ts3YnCsyuclaMUTEqxpQIawTyYWr/P5nJasoz7kUB8BxnYWeN+vZ7ylzRubOPGywXNfK/Dysx5jVKQ2ZHpxkiZJdEjwgigpwSxBzyD+Hd+fUKzC2vQeEOsrZBKDueMeOy73+My/jGPylwaVZfIOnxNLwcYArfHBNCakAM3CdSKfFxSgUHgqMn0WmAti2Sbh1ETW9njij7p44Z8Mtl7MjQuD3NEDgkFCz0aCSAQRjBfzJ7tu8/38LIuLNW0P38TumlGP7P+Pcey+aphYLoKwIqCuWGbeCoUSxqgqjeqLMIKwmI15hEIrNjTKhXWohFKjC5jPm6gNLmPpdII//UAbI42UMPS9xBYBQcNW4POuGGZd+BC8gS4Fe0nF4MxJh6tur+Dvz+3C9l+s00KF4tBqLhAz2kiGRtlIc7FxCGGWR5bxwUO6vHjIqjVBkoCv8zKvoaoGSNJBenUbGmMpHjlXR23MIWtCvR1ShlcP5HmHy9DQJfbLNG5iGqd3cOJoji+9MIpbH96C9mre5yWjTlcLK3UmgszoCeFNocGcG7XVo/QhBchDHhBgm1TjAvSUsXUuMUAv1PhkVVJeVHgcnZUBHHi1hsYWmqMLJQUJYMn6gXByiYP1pJQkAcc2yXH6bY+HT2zC7g+O0OqZJI
31hCWL9FQxIT0pZYQrmhlc4H7hdNqL75n+rt4RZeQVadkrxGp8Mg0wdMryvDSK9nJQIpfMzF0phvyvDAlFZn+1KfydOJx6exUPvTNAOqxrginpUwJTaBa+QAB/zF8avEmkuBjLNigCTUBiUXmGQUxKVY+YbogPUUrjqIjJi+ukkn8SVdRWGXOrA/jq0RSnT9gASGI/JUTSpBo8UP6kaQ0nj5/CAz8YwshWWi4vgj1NSaX0DuPTVEJG1LRvArVJUJbcDR/hHmsmX8a5QFUh5VRgR3ZSr/iuKufFMLoW73Oh9lRMpKM0bB33fzfFwntSQlQDCRRFTHpRifbaKq65NcHe62twmVjcoxfWQia1HN3/ZoJ7ja6uZqqQjym+lFxVtSYKG12EsqI1amWtVK1Rz3ifa5wInuV3Lx6PPtPiORQQvD6GvR9hsvwVq/obk4ZCslf703ULi0v43GMJceeDlWghNZaEaIMlwWND8Ic3onP8MnS+OwE7UESRgyAmVnaa9MI+uk6onxxClZwHWq1KVoSSRcgLwWCI0AtwsiGe4nveGcPvfstg4WQwqhBOWtJnc6GN2/+S6bnDJCG1h9zilkk0tUCRtEz97Xk8Pd3G8OvL+MgHh9Tlap+kiAFMjs9imS0QkDRQkYDshAAVtDDJoc79/nqNXmzAjo7x+Qz1O5cgpKV0K0nLB8MZG+GictYwfkkVu65sY+1cTPbhC2BluYXr/jBRo5sB1i4NWm6IbFOZ0x5AIr+5N8dH3RFcu4MiTS2jLIWz13IUR8gUL5HTc6+ZVZKUm8/R+ocM2ROb0f7HDXAzRulVY0NMVyO77RyDHaoBfQSgHtLE63WdkrYlfopsENfur6C1EAwfPZDgihtaqDQqrCgrWLyXbDTWgFmqo3LTGmrXnIVrT6DxKSaZD+/hQh0kO7h4h0ueoIBP8769m+iwFtIhMsQVThMP2ty8VdUaCBsp1/wqksk8JLm61yDUGHL1YEUtF6R65B0nqMQa8wLvSd/Hmollp2ed5fIBXPnJJTx2l0ddiEcy8dKcwyfvJKF1+fACrfpTJpbLKrDDQzDkYVTP8Muz3HAUlUlCB51o5Rqtx70Z3KgSTiMENStU1yT9DZP5zwSaLLot3joA985GZPMrsFe1lNkMS2tTSbRg1PpLvHaOBnl8mEmaMBwkfCfG0P7PWaQ3riK9lFDM2IvVath2aYHOWgzi1lqGi65OlGmKFaJqwCvmpW5xJ+vIn98KP78RrecXsPild9H+9jA6j04qBTYfYj0yxCTVIqZrxPvRceSPjCP7cYLOE23kSRN+cQEFGc4WTFJHqfFZ0uM4PdIlVbeoXB7YSj43H+Q64/x+iJ9ZelvmjfTGK+AObaMsbHZMoeX2jl9mvHZDjkGNTcXoFqkMCix9jiuPtAiZDvKVReQnV5AdGoQ7shHFu6TQt7jpm6S9I07rFzfIjZjn3fwZZEvzsFwLW1kWMJvaBkOSybForwlHI1+ch1tqo5ghxCq55kLDfZyGboLuczTGFqv5xxF+tVt4f5fB/cpRXk+QvUg4VmU/i0t+zbC8EQ8wG6YSTLSe9Jm+0QrU2WqH9q5FBWbm4Igiu0XqfRfoT8KnWEb3++eo5CotTAaiIvnyIpVaQcJEiPdzHSEFF5KTN02kt88i2dcOI5p2F4WYX5NhB8Xb9KRlXHQpzy8Q95u4z5UrVIJ7Dg+gOEuSyGSa0ozVrQmdh7R1RlxT5YXrFlH83wjSDXGKQLkGPk+B9tAb38tC+8fOyjpudsZg4sU65m+it5hDLC2bvr/A0KcyureKwU/XsPoAE15DWIV7NBh0O0KKwTDTVJUlA+PDJbEIIft4ZvmCcPT/W0XruYwC0rugTG+cRn6Ke5/LVTHJF1IBSAeNTsvp5MGteIwfaGHmY4Mx6TAWWBvVrpqhcTfpgwVd6+n2bG0O1VzqGEKCyhta1Fa5wYzIRwt1aZBkBe3DVHa8huomQmZMatY1LZ+zmSY6nO7U5hhJZ1
mHfrTNzy0tVQRCtcmthNgy8qECFRF4fB5VNlF2AzSTv/tKjsqADhbYrGRJgIYMpyQ1DBKXnSxMJ5h4Cr/C9zmt31Uxuth3GaCLzBcp7006yn6av4tQWkhPI6ySWyojGUzY5mzIzKJ0MsHnzzaVJs0EKZuUbHc4rY/ABqbzkzkM3gts+HNS9LWDKN4cw+Bt/J6VsWTs4y8TahVSbMYFOq0EK7MOg6NpUEA5N9dJgF8KDbSrrdGSpL+V4ZBsBIPLRtO8tWSrLgMwJ94W6fRXKSyhVv8AP9cC3+fzC8wRG7D85RFSLK9/aE55X/KFXWshf9dj+L4aZm9hgtxtUB1vYPFuqVAp8KAQAjD72Rzj3yIdM1n+9MccJmyXYk/qbGaEk69IH+C0bExHwzApW1lCZdRg9cuXYvUru6kA+b8IjY2tMXxcNXQGjULHJfnCImG3htW/IX4fMRpi9YtZGHZYPmcM8MU5ZCcW0ZleQTrFpMkmqXN6Bu2ZU8jfY098kUXjrgbzRY5sgcw1wudG2wyNDtpvNVH/zdB9vPF8V9vMnHuS5XIMb/T40TMDSOqxq9oiKZva89UkPeYn1tB9nYsRw55VanduHt1zM+i8tSLRiIGbqsinGYz0QLG8yoXPoZhd1e5q8H4q80YHnRmuwXqrOMWAn2a9VR2Af69AdqqF7hnCrE64tXOMfmEQm57agOoV5HnXQpEwe+9rYvxpzpFuJ4ngFJ56IMGGTVYJJfVW23b86Nk67vrXeQqQYPR+BvJVzMZ7ibGBFNkiLXySw6h6Fdm79FLKYGMKrjJJOTYXw5/eSGWXsPJkUz1XmRrEwC0VrcEqmxrYfsig9b22xk5lNwP3QyPaju48MsEYypU65bNrGe17k51VjH+NuUTgIx2ddGhNBn97Ecff3IFT73SxeZsJY8t7dn/Cy3zl7HsGX32xicnLCRG2lORBrD7J4Fqp090G9RsYuCkZaI3ZsUFGkdGbVJorYT5qBya08gzkRmxnQrdhhqOznKrT6tUwi0pTZmUWKgFsE633VRiUg0YTSEEqPu3g5hW6tdEE909uRqPO/JE77Y1TmdNI1pzYZvHsQzX88dMdjlGUXTH0GVIgs7OXEqAttQvdNsDsm8vMk4tnTvldq8XOLJlEMtxGhBrRxQTldH6jVW457BWl8kJrYR9LZc0vvtRAUjQfKGaVIArOhqrDBZ55YAK+SdarlCP+MJlRHvekukPfSTB7rBYsJZVhu07XyVDT9pr28J1Z3zj0mvqbNi3ClcWMDqpCH1kJgwgbuiw5vNB3W1HYGCvUnMR2knRsyELFe3xmVntmR0Ml9N7xlzfh3/+WcbDBhi6tPLu4e/dv+3I2L8YsWN9849gSOsuxK4oNj5YVYSxRdvKhrkec8dv13tqqpcNMSEoOJ7WHvvrH7OGl4xcd+BZxkFaeIwSDWXqiubwV93CUObmHeMnXzxaw3tAEDJL2sTTfwTNfGUGN5XARRx9hcp2tN8cWsdU0ejpTCu/j8ZPTd6tDrzDxliZmVV9MHrxR3ltctxuE13lmGsUxITdYozmgm+/CfuaFyT0IBaI9/yTIlick2pny95Ex4Nt/1cUP/nmCg9jQcSHCpsQ14kymHK+XjZQJg1VVSC0kQy8bsjNC4RuGYOWYMHpOJ0s2Ch8hadjWrs7vwhcmOti2KxyIlOdz60dS8fzgvPEib9zGbuvv7l7DoSfHGPmpWlmtKzjtwcWFXjXOs1x5nGTCJFvrKHlOsRbGgWH86OKcqZxkmHh0EkoQySXV4XG8yqHBvbta2Dkl4/Vg5DDm9+cdT6nk/SePckPOVL2bDz702Sa++fvDqG8YZeYLxz/heChbh1KRx+MkH87ElP6CYOKR0u4hiONJjonHqAFwMW5yJIMkDTeJB1nrf/OODmFDGOcXHGn1/Zzngf6jVR2J88Ht2y0OkVbv2EoWeG0CtZEG+wYXBDExEJM45NVhfpxOqLfitDlu6uMYMdBuPIlh4EoZUx0awO
yJSXzj1gYh08UCx/gbNzmNof7J4XnHX/Ga+fzOjx2jAlMXnpP1/yTc9PSMxIfFJ+6p4Dfu7HByF4a2RbfQDkkSlMjv4sFGSEZGPSOFn01lEhEn1aTOrF3F6TfqePmpKv7rcYf50w6bthht3vvZr1+W/jPr+PmwuXvqpocZTPdpIpEpQZ+25an6uubkjhWDRTb+W5mdJy9OcNm1wJ4rqdxmh6Ex1u41E6bIjqc8rJvaqyzaWJedOcoDErakM0cSHPsf/n7SabW7YQJ6jlYeHQHnH7/2jz4vPKLl50fNH0zeeB3d+sPeGXHfue3Pw14/A8hkrLPKcrwjR0m836WKnCSOyKUMlya8SkapsdlJGUfVmnRiYVyox6fW9M4lfCwngggmkFyk6PKwvPxrgbC/v0i/vWfq5od5NnBfUSYqH2ecfVr/vL9x6FkG0q8k8ShUYJ7G46UQJjaumejELfxuje0dbocgNnF+2r9vaPZ1vBgR0Qf1rz84/fh+BRu3O0gMH05NEvNgSfP+PDf2w6o/qGTzXIdUIQY4CeIJYhZO2TW7h3O03Mv1XGdCPl5DzOQuFnQlZNEL9aKnzPofkrjDdVQPanzKfy8t/qT94Y17/43pm1Mse3UYnOK8aD8/Ftb/aOPCv4HQQ27JpElpufL0PemdQ2sxF1mqnFgrw8bEmJh+9oFO8kIO0Cz99QHbvvPg9BPrf+zR/7N/6uap3JmD9N2VXHKfFnsuTJh7R1F97/0e6o+Tdag5hY4QRJpUNCsnNhyb2pgvUAQFbYn7+Hyyvs40vfYdDoyffXD6iRf69/p/CbMWUUVYM2EAAAAASUVORK5CYII=" -} \ No newline at end of file +} diff --git a/api/apps/canvas_app.py b/api/apps/canvas_app.py index 3585f81a21..8c896e36ad 100644 --- a/api/apps/canvas_app.py +++ b/api/apps/canvas_app.py @@ -39,6 +39,7 @@ from api.utils.api_utils import ( get_request_json, ) from agent.canvas import Canvas +from agent.dsl_migration import normalize_chunker_dsl from peewee import MySQLDatabase, PostgresqlDatabase from api.db.db_models import APIToken, Task @@ -148,10 +149,12 @@ def get(canvas_id): # Add last_publish_time to response data if isinstance(c, dict): + c["dsl"] = normalize_chunker_dsl(c.get("dsl", {})) c["last_publish_time"] = last_publish_time else: # If c is a model object, convert to dict first c = c.to_dict() + c["dsl"] = normalize_chunker_dsl(c.get("dsl", {})) c["last_publish_time"] = last_publish_time # For pipeline type, get associated datasets diff --git a/api/apps/services/canvas_replica_service.py b/api/apps/services/canvas_replica_service.py index 557579f38b..a2aa56b6f9 100644 --- a/api/apps/services/canvas_replica_service.py +++ b/api/apps/services/canvas_replica_service.py @@ -20,6 +20,7 @@ import random import time from api.db 
import CanvasCategory +from agent.dsl_migration import normalize_chunker_dsl from rag.utils.redis_conn import REDIS_CONN, RedisDistributedLock @@ -56,7 +57,7 @@ class CanvasReplicaService: raise ValueError("DSL must be a JSON object.") try: - return json.loads(json.dumps(normalized, ensure_ascii=False)) + return json.loads(json.dumps(normalize_chunker_dsl(normalized), ensure_ascii=False)) except Exception as e: raise ValueError("DSL is not JSON-serializable.") from e diff --git a/api/db/services/user_canvas_version.py b/api/db/services/user_canvas_version.py index 292b18811d..4224dd6a1f 100644 --- a/api/db/services/user_canvas_version.py +++ b/api/db/services/user_canvas_version.py @@ -2,6 +2,7 @@ import json import logging import time +from agent.dsl_migration import normalize_chunker_dsl from api.db.db_models import UserCanvasVersion, DB from api.db.services.common_service import CommonService from peewee import DoesNotExist @@ -30,7 +31,7 @@ class UserCanvasVersionService(CommonService): raise ValueError("DSL must be a JSON object.") try: - return json.loads(json.dumps(normalized, ensure_ascii=False)) + return json.loads(json.dumps(normalize_chunker_dsl(normalized), ensure_ascii=False)) except Exception as e: raise ValueError("DSL is not JSON-serializable.") from e diff --git a/common/doc_store/infinity_conn_base.py b/common/doc_store/infinity_conn_base.py index 6a95a634e8..c5d9eb48e8 100644 --- a/common/doc_store/infinity_conn_base.py +++ b/common/doc_store/infinity_conn_base.py @@ -225,6 +225,8 @@ class InfinityConnectionBase(DocStoreConnection): schema.append("SCORE") elif field_name == "similarity()": # Workaround: fix schema is changed to similarity() schema.append("SIMILARITY") + elif field_name == "row_id()": # Workaround: fix schema - Infinity returns "row_id" not "row_id()" + schema.append("row_id") else: schema.append(field_name) return pd.DataFrame(columns=schema) diff --git a/conf/infinity_mapping.json b/conf/infinity_mapping.json index 
64356ee8d5..77d26dd960 100644 --- a/conf/infinity_mapping.json +++ b/conf/infinity_mapping.json @@ -20,6 +20,7 @@ "position_int": {"type": "varchar", "default": ""}, "weight_int": {"type": "integer", "default": 0}, "weight_flt": {"type": "float", "default": 0.0}, + "chunk_order_int": {"type": "integer", "default": 0}, "rank_int": {"type": "integer", "default": 0}, "rank_flt": {"type": "float", "default": 0}, "available_int": {"type": "integer", "default": 1, "index_type": {"type": "secondary", "cardinality": "low"}}, diff --git a/deepdoc/parser/docling_parser.py b/deepdoc/parser/docling_parser.py index ccd45bab12..a2ebc40025 100644 --- a/deepdoc/parser/docling_parser.py +++ b/deepdoc/parser/docling_parser.py @@ -41,6 +41,8 @@ except Exception: class RAGFlowPdfParser: pass +from deepdoc.parser.utils import extract_pdf_outlines + class DoclingContentType(str, Enum): IMAGE = "image" @@ -242,7 +244,7 @@ class DoclingParser(RAGFlowPdfParser): continue tag = self._make_line_tag(bbox) if isinstance(bbox,_BBox) else "" - if parse_method == "manual": + if parse_method in {"manual", "pipeline"}: sections.append((section, typ, tag)) elif parse_method == "paper": sections.append((section + tag, typ)) @@ -311,7 +313,7 @@ class DoclingParser(RAGFlowPdfParser): txt = (text or "").strip() if not txt: return [] - if parse_method == "manual": + if parse_method in {"manual", "pipeline"}: return [(txt, DoclingContentType.TEXT.value, "")] if parse_method == "paper": return [(txt, DoclingContentType.TEXT.value)] @@ -455,6 +457,7 @@ class DoclingParser(RAGFlowPdfParser): docling_server_url: Optional[str] = None, request_timeout: Optional[int] = None, ): + self.outlines = extract_pdf_outlines(binary if binary is not None else filepath) if not self.check_installation(docling_server_url=docling_server_url): raise RuntimeError("Docling not available, please install `docling`") diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index 8475087c84..25a0627ff4 100644 
--- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -35,6 +35,7 @@ from PIL import Image from strenum import StrEnum from deepdoc.parser.pdf_parser import RAGFlowPdfParser +from deepdoc.parser.utils import extract_pdf_outlines LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber" if LOCK_KEY_pdfplumber not in sys.modules: @@ -576,7 +577,7 @@ class MinerUParser(RAGFlowPdfParser): case MinerUContentType.DISCARDED: continue # Skip discarded blocks entirely - if section and parse_method == "manual": + if section and parse_method in {"manual", "pipeline"}: sections.append((section, output["type"], self._line_tag(output))) elif section and parse_method == "paper": sections.append((section + self._line_tag(output), output["type"])) @@ -602,6 +603,7 @@ class MinerUParser(RAGFlowPdfParser): ) -> tuple: import shutil + self.outlines = extract_pdf_outlines(binary if binary is not None else filepath) temp_pdf = None created_tmp_dir = False diff --git a/deepdoc/parser/paddleocr_parser.py b/deepdoc/parser/paddleocr_parser.py index c2ee805d45..a23852e89c 100644 --- a/deepdoc/parser/paddleocr_parser.py +++ b/deepdoc/parser/paddleocr_parser.py @@ -36,6 +36,8 @@ except Exception: class RAGFlowPdfParser: pass +from deepdoc.parser.utils import extract_pdf_outlines + AlgorithmType = Literal["PaddleOCR-VL"] SectionTuple = tuple[str, ...] 
@@ -253,6 +255,7 @@ class PaddleOCRParser(RAGFlowPdfParser): **kwargs: Any, ) -> ParseResult: """Parse PDF document using PaddleOCR API.""" + self.outlines = extract_pdf_outlines(binary if binary is not None else filepath) # Create configuration - pass all kwargs to capture VL config parameters config_dict = { "api_url": api_url if api_url is not None else self.api_url, @@ -409,7 +412,7 @@ class PaddleOCRParser(RAGFlowPdfParser): tag = f"@@{page_idx + 1}\t{left // self._ZOOMIN}\t{right // self._ZOOMIN}\t{top // self._ZOOMIN}\t{bottom // self._ZOOMIN}##" - if parse_method == "manual": + if parse_method in {"manual", "pipeline"}: sections.append((block_content, label, tag)) elif parse_method == "paper": sections.append((block_content + tag, label)) diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index 5e8f9694a0..6717a887ae 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -42,6 +42,7 @@ from common.misc_utils import pip_install_torch from deepdoc.vision import OCR, AscendLayoutRecognizer, LayoutRecognizer, Recognizer, TableStructureRecognizer from rag.nlp import rag_tokenizer from rag.prompts.generator import vision_llm_describe_prompt +from deepdoc.parser.utils import extract_pdf_outlines from common import settings @@ -1582,28 +1583,6 @@ class RAGFlowPdfParser: logging.exception(f"RAGFlowPdfParser __images__, exception: {e}") logging.info(f"__images__ dedupe_chars cost {timer() - start}s") - self.outlines = [] - try: - with pdf2_read(fnm if isinstance(fnm, str) else BytesIO(fnm)) as pdf: - self.pdf = pdf - - outlines = self.pdf.outline - - def dfs(arr, depth): - for a in arr: - if isinstance(a, dict): - self.outlines.append((a["/Title"], depth)) - continue - dfs(a, depth + 1) - - dfs(outlines, 0) - - except Exception as e: - logging.warning(f"Outlines exception: {e}") - - if not self.outlines: - logging.warning("Miss outlines") - logging.debug("Images converted.") self.is_english = [ re.search(r"[ 
a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) @@ -1711,6 +1690,7 @@ class RAGFlowPdfParser: if auto_rotate_tables is None: auto_rotate_tables = os.getenv("TABLE_AUTO_ROTATE", "true").lower() in ("true", "1", "yes") + self.outlines = extract_pdf_outlines(fnm) self.__images__(fnm, zoomin) self._layouts_rec(zoomin) self._table_transformer_job(zoomin, auto_rotate=auto_rotate_tables) @@ -1722,6 +1702,7 @@ class RAGFlowPdfParser: def parse_into_bboxes(self, fnm, callback=None, zoomin=3): start = timer() + self.outlines = extract_pdf_outlines(fnm) self.__images__(fnm, zoomin, callback=callback) if callback: callback(0.40, "OCR finished ({:.2f}s)".format(timer() - start)) @@ -1969,27 +1950,14 @@ class RAGFlowPdfParser: class PlainParser: def __call__(self, filename, from_page=0, to_page=100000, **kwargs): - self.outlines = [] lines = [] try: self.pdf = pdf2_read(filename if isinstance(filename, str) else BytesIO(filename)) for page in self.pdf.pages[from_page:to_page]: lines.extend([t for t in page.extract_text().split("\n")]) - - outlines = self.pdf.outline - - def dfs(arr, depth): - for a in arr: - if isinstance(a, dict): - self.outlines.append((a["/Title"], depth)) - continue - dfs(a, depth + 1) - - dfs(outlines, 0) except Exception: logging.exception("Outlines exception") - if not self.outlines: - logging.warning("Miss outlines") + self.outlines = extract_pdf_outlines(filename) return [(line, "") for line in lines], [] diff --git a/deepdoc/parser/tcadp_parser.py b/deepdoc/parser/tcadp_parser.py index af1c903489..6a37f0befd 100644 --- a/deepdoc/parser/tcadp_parser.py +++ b/deepdoc/parser/tcadp_parser.py @@ -39,6 +39,7 @@ from tencentcloud.lkeap.v20240522 import lkeap_client, models from common.config_utils import get_base_config from deepdoc.parser.pdf_parser import RAGFlowPdfParser +from deepdoc.parser.utils import extract_pdf_outlines class TencentCloudAPIClient: 
@@ -392,6 +393,7 @@ class TCADPParser(RAGFlowPdfParser): ) -> tuple: """Parse PDF document""" + self.outlines = extract_pdf_outlines(binary if binary else filepath) temp_file = None created_tmp_dir = False diff --git a/deepdoc/parser/utils.py b/deepdoc/parser/utils.py index 528e21faa9..b36af08fa5 100644 --- a/deepdoc/parser/utils.py +++ b/deepdoc/parser/utils.py @@ -14,6 +14,10 @@ # limitations under the License. # +from io import BytesIO + +from pypdf import PdfReader as pdf2_read + from rag.nlp import find_codec @@ -30,3 +34,21 @@ def get_text(fnm: str, binary=None) -> str: break txt += line return txt + + +def extract_pdf_outlines(source): + try: + with pdf2_read(source if isinstance(source, str) else BytesIO(source)) as pdf: + outlines = [] + + def dfs(nodes, depth): + for node in nodes: + if isinstance(node, list): + dfs(node, depth + 1) + else: + outlines.append((node["/Title"], depth, pdf.get_destination_page_number(node) + 1)) + + dfs(pdf.outline, 0) + return outlines + except Exception: + return [] diff --git a/docker/.env b/docker/.env index 6665c08b98..f00fb43af4 100644 --- a/docker/.env +++ b/docker/.env @@ -118,7 +118,7 @@ MYSQL_DBNAME=rag_flow MYSQL_PORT=3306 # The port used to expose the MySQL service to the host machine, # allowing EXTERNAL access to the MySQL database running inside the Docker container. 
-EXPOSE_MYSQL_PORT=5455 +EXPOSE_MYSQL_PORT=3306 # The maximum size of communication packets sent to the MySQL server MYSQL_MAX_PACKET=1073741824 diff --git a/rag/app/manual.py b/rag/app/manual.py index e2af0706f2..7e6eaf2d7e 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -20,6 +20,7 @@ import re from common.constants import ParserType from io import BytesIO +from deepdoc.parser.utils import extract_pdf_outlines from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context, concat_img from common.token_utils import num_tokens_from_string from deepdoc.parser import PdfParser, DocxParser @@ -201,13 +202,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca parser_config["chunk_token_num"] = 0 callback(0.8, "Finish parsing.") + outlines = extract_pdf_outlines(binary if binary is not None else filename) - if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03: - max_lvl = max([lvl for _, lvl in pdf_parser.outlines]) + if len(sections) > 0 and len(outlines) / len(sections) > 0.03: + max_lvl = max([lvl for _, lvl, _ in outlines]) most_level = max(0, max_lvl - 1) levels = [] for txt, _, _ in sections: - for t, lvl in pdf_parser.outlines: + for t, lvl, _ in outlines: tks = set([t[i] + t[i + 1] for i in range(len(t) - 1)]) tks_ = set([txt[i] + txt[i + 1] for i in range(min(len(t), len(txt) - 1))]) if len(set(tks & tks_)) / max([len(tks), len(tks_), 1]) > 0.8: diff --git a/rag/app/paper.py b/rag/app/paper.py index 3f9932eecd..818338d9a5 100644 --- a/rag/app/paper.py +++ b/rag/app/paper.py @@ -257,6 +257,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0)) if table_ctx or image_ctx: attach_media_context(res, table_ctx, image_ctx) + return res diff --git a/rag/flow/hierarchical_merger/__init__.py b/rag/flow/chunker/__init__.py similarity 
index 83% rename from rag/flow/hierarchical_merger/__init__.py rename to rag/flow/chunker/__init__.py index b4663378e8..1a080087ba 100644 --- a/rag/flow/hierarchical_merger/__init__.py +++ b/rag/flow/chunker/__init__.py @@ -13,3 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. +from rag.flow.chunker.token_chunker import TokenChunker, TokenChunkerParam + +__all__ = ["TokenChunker", "TokenChunkerParam"] diff --git a/rag/flow/splitter/schema.py b/rag/flow/chunker/schema.py similarity index 97% rename from rag/flow/splitter/schema.py rename to rag/flow/chunker/schema.py index 9875d652ca..223eaf671d 100644 --- a/rag/flow/splitter/schema.py +++ b/rag/flow/chunker/schema.py @@ -17,7 +17,7 @@ from typing import Any, Literal from pydantic import BaseModel, ConfigDict, Field -class SplitterFromUpstream(BaseModel): +class TokenChunkerFromUpstream(BaseModel): created_time: float | None = Field(default=None, alias="_created_time") elapsed_time: float | None = Field(default=None, alias="_elapsed_time") diff --git a/rag/flow/splitter/__init__.py b/rag/flow/chunker/title_chunker/__init__.py similarity index 76% rename from rag/flow/splitter/__init__.py rename to rag/flow/chunker/title_chunker/__init__.py index b4663378e8..989a6f1dca 100644 --- a/rag/flow/splitter/__init__.py +++ b/rag/flow/chunker/title_chunker/__init__.py @@ -13,3 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from rag.flow.chunker.title_chunker.common import TitleChunkerParam +from rag.flow.chunker.title_chunker.title_chunker import TitleChunker + +__all__ = ["TitleChunker", "TitleChunkerParam"] diff --git a/rag/flow/chunker/title_chunker/common.py b/rag/flow/chunker/title_chunker/common.py new file mode 100644 index 0000000000..e3f3a2642a --- /dev/null +++ b/rag/flow/chunker/title_chunker/common.py @@ -0,0 +1,300 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import random +import re +import sys +from abc import ABC, abstractmethod +from collections import Counter +from copy import deepcopy + +from deepdoc.parser.pdf_parser import RAGFlowPdfParser +from deepdoc.parser.utils import extract_pdf_outlines +from rag.flow.base import ProcessBase, ProcessParamBase +from rag.flow.parser.pdf_chunk_metadata import ( + PDF_POSITIONS_KEY, + extract_pdf_positions, + finalize_pdf_chunk, + merge_pdf_positions, + restore_pdf_text_previews, +) +from rag.nlp import not_bullet, not_title + +BODY_LEVEL = sys.maxsize - 1 + + +class TitleChunkerParam(ProcessParamBase): + def __init__(self): + super().__init__() + self.levels = [] + self.hierarchy = None + self.include_heading_content = False + + def check(self): + if self.method in {"hierarchy", "group"}: + self.check_empty(self.levels, "Hierarchical setups.") + if self.method == "hierarchy": + self.check_empty(self.hierarchy, "Hierarchy number.") + + def get_input_form(self) -> dict[str, dict]: + return {} + + +class BaseTitleChunker(ABC): + start_message = "Start to chunk by title." 
+ + def __init__(self, process: ProcessBase, from_upstream): + self.process = process + self.param = process._param + self.from_upstream = from_upstream + + + async def invoke(self): + self.process.set_output("output_format", "chunks") + self.process.callback(random.randint(1, 5) / 100.0, self.start_message) + line_records = self.extract_line_records() + resolved = self.resolve_levels(line_records) + chunks = self.build_chunks(line_records, resolved) + await self.set_chunks(chunks) + self.process.callback(1, "Done.") + + + def extract_line_records(self): + # Normalize all upstream payloads into an ordered record stream. + # Level resolution and chunk construction operate on this stream only, + # so strategy code does not depend on source-specific output layouts. + if self.from_upstream.output_format == "markdown": + payload = self.from_upstream.markdown_result or "" + return [{"text": line, "doc_type_kwd": "text", "img_id": None, "layout": "", PDF_POSITIONS_KEY: []} for line in payload.split("\n") if line] + + if self.from_upstream.output_format == "text": + payload = self.from_upstream.text_result or "" + return [{"text": line, "doc_type_kwd": "text", "img_id": None, "layout": "", PDF_POSITIONS_KEY: []} for line in payload.split("\n") if line] + + if self.from_upstream.output_format == "html": + payload = self.from_upstream.html_result or "" + return [{"text": line, "doc_type_kwd": "text", "img_id": None, "layout": "", PDF_POSITIONS_KEY: []} for line in payload.split("\n") if line] + + items = self.from_upstream.chunks if self.from_upstream.output_format == "chunks" else self.from_upstream.json_result + return [ + { + "text": str(item.get("text") or ""), + "doc_type_kwd": str(item.get("doc_type_kwd") or "text"), + "img_id": item.get("img_id"), + "layout": "{} {}".format(item.get("layout_type", ""), item.get("layoutno", "")).strip(), + PDF_POSITIONS_KEY: extract_pdf_positions(item), + } + for item in items or [] + ] + + + def extract_outlines(self): + file = 
self.from_upstream.file or {} + source = ( + file.get("blob") + or file.get("binary") + or file.get("path") + or file.get("name") + ) + if not source: + return [] + return extract_pdf_outlines(source) + + + @staticmethod + def match_regex_level(text, level_group): + stripped = text.strip() + for level, pattern in enumerate(level_group, start=1): + if re.match(pattern, stripped) and not not_bullet(stripped): + return level + return None + + + @staticmethod + def select_level_group(lines, raw_levels): + if not raw_levels: + return [] + + # Select one regex family before assigning numeric levels. Mixing + # patterns across families would make the level numbers ambiguous and + # break downstream comparisons. + hits = [0] * len(raw_levels) + for i, group in enumerate(raw_levels): + for sec in lines: + sec = sec.strip() + if not sec: + continue + for pattern in group: + if re.match(pattern, sec) and not not_bullet(sec): + hits[i] += 1 + break + + maximum = 0 + selected = -1 + for i, hit in enumerate(hits): + if hit <= maximum: + continue + selected = i + maximum = hit + + if selected < 0: + return [] + return [pattern for pattern in raw_levels[selected] if pattern] + + + @staticmethod + def match_layout_level(text, layout, fallback_level): + if re.search(r"(section|title|head)", layout, re.I) and not not_title(text.split("@")[0].strip()): + return fallback_level + return BODY_LEVEL + + + @staticmethod + def _outline_similarity(left, right): + left_pairs = {left[i] + left[i + 1] for i in range(len(left) - 1)} + right_pairs = {right[i] + right[i + 1] for i in range(min(len(left), len(right) - 1))} + return len(left_pairs & right_pairs) / max(len(left_pairs), len(right_pairs), 1) + + + def resolve_outline_levels(self, line_records): + outlines = self.extract_outlines() + if not line_records or len(outlines) / len(line_records) <= 0.03: + return None + + max_level = max(level for _, level, _ in outlines) + 1 + levels = [] + for record in line_records: + if 
record["doc_type_kwd"] != "text": + levels.append(BODY_LEVEL) + continue + text = record["text"] + for outline_text, level, _ in outlines: + if self._outline_similarity(outline_text, text) > 0.8: + levels.append(level + 1) + break + else: + levels.append(BODY_LEVEL) + + return { + "levels": levels, + "most_level": max(1, max_level - 1), + "source": "outline", + } + + + def resolve_frequency_levels(self, line_records): + level_group = self.select_level_group( + [record["text"] for record in line_records], + self.param.levels, + ) + fallback_level = len(level_group) + 1 + levels = [] + for record in line_records: + if record["doc_type_kwd"] != "text": + levels.append(BODY_LEVEL) + continue + level = self.match_regex_level(record["text"], level_group) + if level is not None: + levels.append(level) + continue + levels.append( + self.match_layout_level( + record["text"], + record["layout"], + fallback_level, + ) + ) + + most_level = None + for level, _ in Counter(levels).most_common(): + if level < BODY_LEVEL: + most_level = level + break + + return { + "levels": levels, + "most_level": most_level, + "source": "frequency", + } + + + def resolve_title_levels(self, line_records): + return self.resolve_outline_levels(line_records) or self.resolve_frequency_levels(line_records) + + + def resolve_manual_levels(self, line_records): + return self.resolve_title_levels(line_records)["levels"] + + + def build_chunks_from_record_groups(self, record_groups): + # Strategy code decides record grouping. This method materializes each + # group into the output chunk representation. For PDF-like inputs, the + # chunk box is defined by merged source positions and the text payload + # is normalized by removing parser tags. 
+ if self.from_upstream.output_format in ["markdown", "text", "html"]: + return [ + {"text": "".join(record["text"] + "\n" for record in records)} + for records in record_groups + if records + ] + + return [ + ( + { + "text": RAGFlowPdfParser.remove_tag("".join(record["text"] + "\n" for record in records)), + "doc_type_kwd": "text", + PDF_POSITIONS_KEY: merge_pdf_positions(records), + } + if records[0]["doc_type_kwd"] == "text" + else { + "text": records[0]["text"], + "doc_type_kwd": records[0]["doc_type_kwd"], + "img_id": records[0]["img_id"], + PDF_POSITIONS_KEY: records[0][PDF_POSITIONS_KEY], + } + ) + for records in record_groups + if records + ] + + + async def set_chunks(self, chunks): + if self.from_upstream.output_format in ["markdown", "text", "html"]: + self.process.set_output("chunks", chunks) + return + + # Text grouping runs before visual enrichment. Preview text and final + # box metadata are derived here from the merged PDF positions. + await restore_pdf_text_previews(chunks, self.from_upstream, self.process._canvas) + self.process.set_output("chunks", [finalize_pdf_chunk(deepcopy(chunk)) for chunk in chunks]) + + + @abstractmethod + def resolve_levels(self, line_records): + raise NotImplementedError() + + + @abstractmethod + def build_chunks(self, line_records, resolved): + raise NotImplementedError() + + +def resolve_target_level(levels, hierarchy): + title_levels = sorted({level for level in levels if 0 < level < BODY_LEVEL}) + if not title_levels: + return None + + hierarchy_num = max(int(hierarchy), 1) + return title_levels[min(hierarchy_num, len(title_levels)) - 1] diff --git a/rag/flow/chunker/title_chunker/group_chunker.py b/rag/flow/chunker/title_chunker/group_chunker.py new file mode 100644 index 0000000000..ca43a2d0be --- /dev/null +++ b/rag/flow/chunker/title_chunker/group_chunker.py @@ -0,0 +1,94 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from common.token_utils import num_tokens_from_string +from rag.flow.chunker.title_chunker.common import ( + BaseTitleChunker, + resolve_target_level, +) + +MIN_GROUP_TOKENS = 32 +MAX_GROUP_TOKENS = 1024 + + +def _build_section_ids(levels, target_level): + sec_ids = [] + sid = 0 + for i, level in enumerate(levels): + if target_level is not None and level <= target_level and i > 0: + sid += 1 + sec_ids.append(sid) + return sec_ids + + +def _resolve_group_target_level(levels, hierarchy, most_level): + if hierarchy and int(hierarchy) > 0: + return resolve_target_level(levels, hierarchy) + return most_level + + +class GroupTitleChunker(BaseTitleChunker): + start_message = "Start to group by title levels." + + def resolve_levels(self, line_records): + return self.resolve_title_levels(line_records) + + + def build_chunks(self, line_records, resolved): + target_level = _resolve_group_target_level( + resolved["levels"], + self.param.hierarchy, + resolved["most_level"], + ) + sec_ids = _build_section_ids(resolved["levels"], target_level) + record_groups = [] + tk_cnt = 0 + last_sid = -2 + + # The merge state is driven by (current section id, current token size). + # A chunk stays open while records remain in the same logical section, + # except that very small chunks are allowed to absorb the next record + # regardless of section change. 
+ for record, sec_id in zip(line_records, sec_ids): + if record["doc_type_kwd"] != "text": + record_groups.append([record]) + tk_cnt = 0 + last_sid = -2 + continue + + text = record["text"] + if not text.strip(): + continue + + token_count = num_tokens_from_string(text) + should_merge = ( + record_groups + and record_groups[-1][0]["doc_type_kwd"] == "text" + and ( + tk_cnt < MIN_GROUP_TOKENS + or (tk_cnt < MAX_GROUP_TOKENS and sec_id == last_sid) + ) + ) + + if should_merge: + record_groups[-1].append(record) + tk_cnt += token_count + else: + record_groups.append([record]) + tk_cnt = token_count + + last_sid = sec_id + + return self.build_chunks_from_record_groups(record_groups) diff --git a/rag/flow/chunker/title_chunker/hierarchy_chunker.py b/rag/flow/chunker/title_chunker/hierarchy_chunker.py new file mode 100644 index 0000000000..430bd2240f --- /dev/null +++ b/rag/flow/chunker/title_chunker/hierarchy_chunker.py @@ -0,0 +1,129 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from rag.flow.chunker.title_chunker.common import ( + BaseTitleChunker, + resolve_target_level, +) + + +class _ChunkNode: + def __init__(self, level, title_indexes=None, body_indexes=None): + self.level = level + self.title_indexes = title_indexes or [] + self.body_indexes = body_indexes or [] + self.children = [] + + + def add_child(self, child): + self.children.append(child) + + + def add_body_index(self, index): + self.body_indexes.append(index) + + + def build_tree(self, indexed_lines, depth): + stack = [self] + for level, index in indexed_lines: + if level > depth: + stack[-1].add_body_index(index) + continue + + while len(stack) > 1 and level <= stack[-1].level: + stack.pop() + + node = _ChunkNode(level, title_indexes=[index]) + stack[-1].add_child(node) + stack.append(node) + + return self + + + def get_paths(self, depth, include_heading_content): + chunk_paths = [] + self._dfs(chunk_paths, [], depth, include_heading_content) + return chunk_paths + + + def _dfs(self, chunk_paths, titles, depth, include_heading_content): + if self.level == 0 and self.body_indexes: + chunk_paths.append(titles + self.body_indexes) + + if include_heading_content: + path_titles = titles + self.title_indexes if 1 <= self.level <= depth else titles + + if self.body_indexes and 1 <= self.level <= depth: + chunk_paths.append(path_titles + self.body_indexes) + elif not self.children and 1 <= self.level <= depth: + chunk_paths.append(path_titles) + else: + path_titles = ( + titles + self.title_indexes + self.body_indexes + if 1 <= self.level <= depth + else titles + ) + + if not self.children and 1 <= self.level <= depth: + chunk_paths.append(path_titles) + + for child in self.children: + child._dfs(chunk_paths, path_titles, depth, include_heading_content) + + +class HierarchyTitleChunker(BaseTitleChunker): + start_message = "Start to merge hierarchically." 
class HierarchyTitleChunker(BaseTitleChunker):
    """Title chunker that groups consecutive text records by heading hierarchy."""

    start_message = "Start to merge hierarchically."

    def resolve_levels(self, line_records):
        """Return the result of ``resolve_title_levels`` for ``line_records``."""
        return self.resolve_title_levels(line_records)

    def build_chunks(self, line_records, resolved):
        """Group ``line_records`` and hand the groups to ``build_chunks_from_record_groups``.

        Consecutive records whose ``doc_type_kwd`` is ``"text"`` are buffered
        as one run. Every other record ends the current run and becomes a
        single-record group of its own. ``resolved["levels"]`` is consumed in
        lockstep with ``line_records``.
        """
        groups = []
        pending_records = []
        pending_levels = []

        def drain_pending():
            # Turn the buffered text run into one or more groups, then reset it.
            if not pending_records:
                return

            cutoff = resolve_target_level(pending_levels, self.param.hierarchy)
            if cutoff is not None:
                indexed = [(level, pos) for pos, level in enumerate(pending_levels)]
                tree = _ChunkNode(0).build_tree(indexed, cutoff)
                for path in tree.get_paths(cutoff, self.param.include_heading_content):
                    if path:
                        groups.append([pending_records[pos] for pos in path])
            else:
                # No target level could be resolved: keep the run as one group.
                groups.append(list(pending_records))

            pending_records.clear()
            pending_levels.clear()

        for record, level in zip(line_records, resolved["levels"]):
            if record["doc_type_kwd"] != "text":
                drain_pending()
                groups.append([record])
            else:
                pending_records.append(record)
                pending_levels.append(level)

        drain_pending()
        return self.build_chunks_from_record_groups(groups)
class TitleChunker(ProcessBase):
    """Flow component that delegates to the title-chunking strategy named by ``method``."""

    component_name = "TitleChunker"

    async def _invoke(self, **kwargs):
        """Validate the upstream payload and run the configured strategy.

        Sets the ``_ERROR`` output, instead of raising, when the payload fails
        validation or when ``self._param.method`` is neither ``"hierarchy"``
        nor ``"group"``.
        """
        try:
            upstream = TitleChunkerFromUpstream.model_validate(kwargs)
        except Exception as exc:
            self.set_output("_ERROR", f"Input error: {str(exc)}")
            return

        method = self._param.method
        if method == "hierarchy":
            strategy = HierarchyTitleChunker
        elif method == "group":
            strategy = GroupTitleChunker
        else:
            self.set_output("_ERROR", f"Unsupported TitleChunker method: {method}")
            return

        await strategy(self, upstream).invoke()
class TokenChunkerParam(ProcessParamBase):
    """Parameters of the token chunker component."""

    def __init__(self):
        super().__init__()
        self.chunk_token_size = 512
        self.delimiters = ["\n"]
        self.overlapped_percent = 0
        self.children_delimiters = []
        self.table_context_size = 0
        self.image_context_size = 0

    def check(self):
        """Normalize both delimiter lists, then validate the numeric settings.

        ``None`` becomes an empty list, a bare string becomes a one-item list,
        and any other iterable is filtered down to its non-empty ``str``
        items. The numeric checks are delegated to helpers inherited from
        ``ProcessParamBase``.
        """

        def as_clean_list(raw):
            if raw is None:
                candidates = []
            elif isinstance(raw, str):
                candidates = [raw]
            else:
                candidates = [item for item in raw if isinstance(item, str)]
            return [item for item in candidates if item]

        self.delimiters = as_clean_list(self.delimiters)
        self.children_delimiters = as_clean_list(self.children_delimiters)

        self.check_positive_integer(self.chunk_token_size, "Chunk token size.")
        self.check_decimal_float(self.overlapped_percent, "Overlapped percentage: [0, 1)")
        self.check_nonnegative_number(self.table_context_size, "Table context size.")
        self.check_nonnegative_number(self.image_context_size, "Image context size.")

    def get_input_form(self) -> dict[str, dict]:
        """Return an empty mapping: this component declares no input form."""
        return {}
def _compile_delimiter_pattern(delimiters):
    """Build the primary delimiter regex from backtick-wrapped delimiters.

    The non-empty entries of ``delimiters`` are concatenated, and every
    ``...`` span wrapped in backticks in the result is treated as one literal
    delimiter. Returns an alternation of the escaped, de-duplicated literals
    with longer ones first, or ``""`` when no such span exists.
    """
    joined = "".join(filter(None, delimiters or []))
    quoted = re.findall(r"`([^`]+)`", joined)
    if not quoted:
        return ""
    longest_first = sorted(set(quoted), key=len, reverse=True)
    return "|".join(map(re.escape, longest_first))


def _split_text_by_pattern(text, pattern):
    """Split ``text`` on ``pattern``, keeping each delimiter with the text before it.

    With an empty ``pattern`` the text comes back as a single-item list
    (``None`` becomes ``""``). Otherwise empty segments are skipped together
    with the delimiter that follows them, and segments that are blank after
    stripping are dropped.
    """
    if not pattern:
        return [text or ""]

    # The capturing group makes re.split interleave text and delimiters:
    # even positions hold text, odd positions hold the delimiter that follows.
    pieces = re.split(r"(%s)" % pattern, text or "", flags=re.DOTALL)
    total = len(pieces)
    segments = []
    for pos, body in enumerate(pieces):
        if pos % 2 or not body:
            continue
        segment = body + pieces[pos + 1] if pos + 1 < total else body
        if segment.strip():
            segments.append(segment)
    return segments
def _build_json_chunks(json_result, delimiter_pattern):
    """Convert upstream JSON items into the internal working chunks.

    ``doc_type_kwd`` is normalized to ``"table"``, ``"image"`` or ``"text"``.
    The text is read from ``text`` and falls back to ``content_with_weight``
    and then ``""`` whenever the value is not a string. Text items are split
    by ``delimiter_pattern`` when one is active, and blank segments are
    skipped. Table and image items yield exactly one chunk with empty
    ``context_above`` / ``context_below`` placeholders.
    """
    working = []
    for item in json_result:
        kind = str(item.get("doc_type_kwd") or "").strip().lower()
        if kind not in ("table", "image"):
            kind = "text"

        body = item.get("text")
        if not isinstance(body, str):
            fallback = item.get("content_with_weight")
            body = fallback if isinstance(fallback, str) else ""

        # PDF coordinates travel as an internal preview field until the final
        # output is assembled, so only one public coordinate format is
        # exposed downstream.
        positions = extract_pdf_positions(item)
        img_id = item.get("img_id")

        if kind != "text":
            working.append(
                {
                    "text": body or "",
                    "doc_type_kwd": kind,
                    "ck_type": kind,
                    "img_id": img_id,
                    PDF_POSITIONS_KEY: deepcopy(positions),
                    "tk_nums": num_tokens_from_string(body or ""),
                    "context_above": "",
                    "context_below": "",
                }
            )
            continue

        segments = _split_text_by_pattern(body, delimiter_pattern) if delimiter_pattern else [body]
        for segment in segments:
            if segment and segment.strip():
                working.append(
                    {
                        "text": segment,
                        "doc_type_kwd": "text",
                        "ck_type": "text",
                        PDF_POSITIONS_KEY: deepcopy(positions),
                        "tk_nums": num_tokens_from_string(segment),
                    }
                )

    return working


def _take_sentences(text, need_tokens, from_end=False):
    """Collect whole sentences from one side of ``text`` up to a token budget.

    Sentences are accumulated from the start, or from the end when
    ``from_end`` is true, until ``num_tokens_from_string`` of the accumulated
    text reaches ``need_tokens``. The sentence that crosses the budget is
    included.
    """
    parts = re.split(r"([。!??;!\n]|\. )", text or "", flags=re.DOTALL)
    count = len(parts)
    # Re-attach each terminator (odd positions) to the sentence before it.
    sentences = [
        parts[pos] + (parts[pos + 1] if pos + 1 < count else "")
        for pos in range(0, count, 2)
    ]
    if from_end:
        sentences.reverse()

    taken = ""
    for sentence in sentences:
        if from_end:
            taken = sentence + taken
        else:
            taken = taken + sentence
        if num_tokens_from_string(taken) >= need_tokens:
            break
    return taken
def _attach_context_to_media_chunks(chunks, table_context_size, image_context_size):
    """Fill ``context_above`` / ``context_below`` on table and image chunks in place.

    For each media chunk whose context size is positive, neighbouring text
    chunks are collected outwards in both directions until the token budget
    (counted with each chunk's ``tk_nums``) is used up. Non-text neighbours
    are stepped over. The text chunk that would exceed the remaining budget
    is trimmed with ``_take_sentences`` and ends the scan.
    """

    def gather(indexes, budget, from_end):
        # Walk the neighbours in visiting order and return the joined context.
        pieces = []
        for idx in indexes:
            if budget <= 0:
                break
            neighbor = chunks[idx]
            if neighbor["ck_type"] != "text":
                continue
            if neighbor["tk_nums"] >= budget:
                pieces.append(_take_sentences(neighbor["text"], budget, from_end=from_end))
                break
            pieces.append(neighbor["text"])
            budget -= neighbor["tk_nums"]
        if from_end:
            # Neighbours above were visited nearest-first; restore reading order.
            pieces.reverse()
        return "".join(pieces)

    for position, chunk in enumerate(chunks):
        kind = chunk["ck_type"]
        if kind == "image":
            window = image_context_size
        elif kind == "table":
            window = table_context_size
        else:
            continue
        if window <= 0:
            continue

        above = gather(range(position - 1, -1, -1), window, True)
        below = gather(range(position + 1, len(chunks)), window, False)
        chunk["context_above"] = above
        chunk["context_below"] = below
def _merge_text_chunks_by_token_size(chunks, chunk_token_size, overlapped_percent):
    """Merge adjacent text chunks when delimiter-based splitting is not active.

    A text chunk is appended to the currently open text chunk while that
    chunk's ``tk_nums`` does not exceed
    ``chunk_token_size * (100 - overlapped_percent) / 100``. Otherwise a new
    chunk is started, prefixed with the trailing ``overlapped_percent`` share
    of the previous chunk's characters when overlap is enabled. Non-text
    chunks are copied through unchanged and close the open text chunk, so
    text is never merged across a table or image.

    Args:
        chunks: Working chunks with ``ck_type``, ``text`` and ``tk_nums`` keys.
        chunk_token_size: Target token size of a merged chunk.
        overlapped_percent: Overlap as a percentage; the arithmetic below
            treats it as a 0-100 value.

    Returns:
        A new list of deep-copied chunks; the input chunks are not modified.
    """
    merged = []
    tail = None  # text chunk in `merged` that can still absorb more text
    threshold = chunk_token_size * (100 - overlapped_percent) / 100.0

    for chunk in chunks:
        if chunk["ck_type"] != "text":
            merged.append(deepcopy(chunk))
            tail = None
            continue

        current = deepcopy(chunk)
        if tail is None or tail["tk_nums"] > threshold:
            if tail is not None and overlapped_percent > 0 and tail["text"]:
                previous_text = tail["text"]
                cut = int(len(previous_text) * (100 - overlapped_percent) / 100.0)
                current["text"] = previous_text[cut:] + current["text"]
                current["tk_nums"] = num_tokens_from_string(current["text"])
            merged.append(current)
            tail = current
            continue

        joiner = "\n" if tail["text"] and current["text"] else ""
        tail["text"] += joiner + current["text"]

        # The incoming chunk is already tolerated without positions; give the
        # merge target the same tolerance instead of calling .extend on None.
        positions = tail.get(PDF_POSITIONS_KEY)
        if positions is None:
            positions = tail[PDF_POSITIONS_KEY] = []
        positions.extend(current.get(PDF_POSITIONS_KEY) or [])
        tail["tk_nums"] += current["tk_nums"]

    return merged


def _finalize_json_chunks(chunks):
    """Convert internal chunks into the final token chunker output format.

    The output text is ``context_above + text + context_below``; chunks whose
    combined text is blank are dropped. Positions, ``mom`` and ``img_id`` are
    carried over only when truthy, and every document is passed through
    ``finalize_pdf_chunk`` before being returned.
    """
    finalized = []
    for chunk in chunks:
        above = chunk.get("context_above") or ""
        body = chunk.get("text") or ""
        below = chunk.get("context_below") or ""
        full_text = above + body + below
        if not full_text.strip():
            continue

        # The internal preview coordinates are converted exactly once into the
        # indexed fields consumed downstream.
        doc = {
            "text": full_text,
            "doc_type_kwd": chunk.get("doc_type_kwd", "text"),
        }
        positions = chunk.get(PDF_POSITIONS_KEY)
        if positions:
            doc[PDF_POSITIONS_KEY] = deepcopy(positions)
        for optional_key in ("mom", "img_id"):
            value = chunk.get(optional_key)
            if value:
                doc[optional_key] = value
        finalized.append(finalize_pdf_chunk(doc))

    return finalized
def _split_chunk_docs_by_children(chunks, pattern):
    """Apply the secondary ``children_delimiters`` split to text chunks only.

    With an empty ``pattern`` the input list is returned as is. Otherwise
    every chunk whose ``doc_type_kwd`` is ``"text"`` (the default when the
    key is missing) is replaced by one deep copy per non-blank segment, each
    carrying the unsplit text under ``mom``. Other chunks are passed through
    by reference.
    """
    if not pattern:
        return chunks

    expanded = []
    for chunk in chunks:
        if chunk.get("doc_type_kwd", "text") == "text":
            pieces = _split_text_by_pattern(chunk.get("text", ""), pattern)
            parent_text = chunk.get("text", "")
            for piece in pieces:
                if piece.strip():
                    child = deepcopy(chunk)
                    child.update({"mom": parent_text, "text": piece})
                    expanded.append(child)
        else:
            expanded.append(chunk)

    return expanded
class TokenChunker(ProcessBase):
    """Flow component that splits upstream parser output into chunks."""

    component_name = "TokenChunker"

    async def _invoke(self, **kwargs):
        """Chunk the upstream payload and publish it on the ``chunks`` output.

        Sets the ``_ERROR`` output, instead of raising, when the upstream
        payload fails validation. ``markdown`` / ``text`` / ``html`` payloads
        are chunked as plain strings; every other output format is handled as
        a list of JSON items read from ``json_result``.
        """
        try:
            upstream = TokenChunkerFromUpstream.model_validate(kwargs)
        except Exception as exc:
            self.set_output("_ERROR", f"Input error: {str(exc)}")
            return

        # Primary delimiter regex. When no backtick-wrapped delimiter is
        # active, chunking falls back to token-size based merging.
        delimiter_pattern = _compile_delimiter_pattern(self._param.delimiters)
        unique_children = sorted(set(self._param.children_delimiters), key=len, reverse=True)
        children_pattern = "|".join(re.escape(token) for token in unique_children)

        self.set_output("output_format", "chunks")
        self.callback(random.randint(1, 5) / 100.0, "Start to split into chunks.")
        overlapped_percent = normalize_overlapped_percent(self._param.overlapped_percent)

        if upstream.output_format not in ["markdown", "text", "html"]:
            # Structured JSON input is normalized first, then optionally
            # enriched with media context, and merged only when delimiter
            # splitting is inactive.
            items = upstream.json_result or []
            chunks = _build_json_chunks(items, delimiter_pattern)
            _attach_context_to_media_chunks(chunks, self._param.table_context_size, self._param.image_context_size)
            if not delimiter_pattern:
                chunks = _merge_text_chunks_by_token_size(chunks, self._param.chunk_token_size, overlapped_percent)
            if children_pattern:
                chunks = _split_chunk_docs_by_children(chunks, children_pattern)

            await restore_pdf_text_previews(chunks, upstream, self._canvas)
            self.set_output("chunks", _finalize_json_chunks(chunks))
            self.callback(1, "Done.")
            return

        payload = getattr(upstream, f"{upstream.output_format}_result") or ""
        if delimiter_pattern:
            pieces = _split_text_by_pattern(payload, delimiter_pattern)
        else:
            pieces = naive_merge(
                payload,
                self._param.chunk_token_size,
                "",
                overlapped_percent,
            )

        if children_pattern:
            # Child chunks keep the unsplit parent text under "mom".
            docs = [
                {"text": piece, "mom": parent}
                for parent in pieces
                if parent.strip()
                for piece in _split_text_by_pattern(parent, children_pattern)
                if piece.strip()
            ]
        else:
            docs = [{"text": piece.strip()} for piece in pieces if piece.strip()]

        self.set_output("chunks", docs)
        self.callback(1, "Done.")
- -import asyncio -import logging -import random -import re -from copy import deepcopy -from functools import partial - -from common.misc_utils import get_uuid -from rag.utils.base64_image import id2image, image2id -from deepdoc.parser.pdf_parser import RAGFlowPdfParser -from rag.flow.base import ProcessBase, ProcessParamBase -from rag.flow.hierarchical_merger.schema import HierarchicalMergerFromUpstream -from rag.nlp import concat_img -from common import settings - - -class HierarchicalMergerParam(ProcessParamBase): - def __init__(self): - super().__init__() - self.levels = [] - self.hierarchy = None - - def check(self): - self.check_empty(self.levels, "Hierarchical setups.") - self.check_empty(self.hierarchy, "Hierarchy number.") - - def get_input_form(self) -> dict[str, dict]: - return {} - - -class HierarchicalMerger(ProcessBase): - component_name = "HierarchicalMerger" - - async def _invoke(self, **kwargs): - try: - from_upstream = HierarchicalMergerFromUpstream.model_validate(kwargs) - except Exception as e: - self.set_output("_ERROR", f"Input error: {str(e)}") - return - - self.set_output("output_format", "chunks") - self.callback(random.randint(1, 5) / 100.0, "Start to merge hierarchically.") - if from_upstream.output_format in ["markdown", "text", "html"]: - if from_upstream.output_format == "markdown": - payload = from_upstream.markdown_result - elif from_upstream.output_format == "text": - payload = from_upstream.text_result - else: # == "html" - payload = from_upstream.html_result - - if not payload: - payload = "" - - lines = [ln for ln in payload.split("\n") if ln] - else: - arr = from_upstream.chunks if from_upstream.output_format == "chunks" else from_upstream.json_result - arr = arr or [] - sections, section_images = [], [] - lines = [] - for o in arr: - if isinstance(o, dict): - raw_text = o.get("text") - position_tag = o.get("position_tag", "") - img_id = o.get("img_id") - else: - raw_text = o - position_tag = "" - img_id = None - - txt = 
raw_text if isinstance(raw_text, str) else ("" if raw_text is None else str(raw_text)) - lines.append(txt) - sections.append((txt, position_tag)) - section_images.append(img_id) - - matches = [] - for txt in lines: - good = False - for lvl, regs in enumerate(self._param.levels): - for reg in regs: - if re.search(reg, txt): - matches.append(lvl) - good = True - break - if good: - break - if not good: - matches.append(len(self._param.levels)) - assert len(matches) == len(lines), f"{len(matches)} vs. {len(lines)}" - - root = { - "level": -1, - "index": -1, - "texts": [], - "children": [] - } - for i, m in enumerate(matches): - if m == 0: - root["children"].append({ - "level": m, - "index": i, - "texts": [], - "children": [] - }) - elif m == len(self._param.levels): - def dfs(b): - if not b["children"]: - b["texts"].append(i) - else: - dfs(b["children"][-1]) - dfs(root) - else: - def dfs(b): - nonlocal m, i - if not b["children"] or m == b["level"] + 1: - b["children"].append({ - "level": m, - "index": i, - "texts": [], - "children": [] - }) - return - dfs(b["children"][-1]) - - dfs(root) - - all_pathes = [] - def dfs(n, path, depth): - nonlocal all_pathes - if not n["children"] and path: - all_pathes.append(path) - - for nn in n["children"]: - if depth < self._param.hierarchy: - _path = deepcopy(path) - else: - _path = path - _path.extend([nn["index"], *nn["texts"]]) - dfs(nn, _path, depth+1) - - if depth == self._param.hierarchy: - all_pathes.append(_path) - - dfs(root, [], 0) - - if root["texts"]: - all_pathes.insert(0, root["texts"]) - if from_upstream.output_format in ["markdown", "text", "html"]: - cks = [] - for path in all_pathes: - txt = "" - for i in path: - txt += lines[i] + "\n" - cks.append(txt) - - self.set_output("chunks", [{"text": c} for c in cks if c]) - else: - cks = [] - images = [] - for path in all_pathes: - txt = "" - img = None - for i in path: - txt += lines[i] + "\n" - concat_img(img, id2image(section_images[i], 
partial(settings.STORAGE_IMPL.get, tenant_id=self._canvas._tenant_id))) - cks.append(txt) - images.append(img) - - cks = [ - { - "text": RAGFlowPdfParser.remove_tag(c), - "image": img, - "positions": RAGFlowPdfParser.extract_positions(c), - } - for c, img in zip(cks, images) - ] - tasks = [] - for d in cks: - tasks.append(asyncio.create_task(image2id(d, partial(settings.STORAGE_IMPL.put, tenant_id=self._canvas._tenant_id), get_uuid()))) - try: - await asyncio.gather(*tasks, return_exceptions=False) - except Exception as e: - logging.error(f"Error in image2id: {e}") - for t in tasks: - t.cancel() - await asyncio.gather(*tasks, return_exceptions=True) - raise - - self.set_output("chunks", cks) - - self.callback(1, "Done.") diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py index 0803ddef70..bbce185c39 100644 --- a/rag/flow/parser/parser.py +++ b/rag/flow/parser/parser.py @@ -30,22 +30,29 @@ from api.db.services.llm_service import LLMBundle from api.db.joint_services.tenant_model_service import get_model_config_by_type_and_name, get_tenant_default_model_by_type from common import settings from common.constants import LLMType -from common.misc_utils import get_uuid -from deepdoc.parser import ExcelParser +from common.misc_utils import get_uuid, thread_pool_exec +from deepdoc.parser import ExcelParser, HtmlParser, TxtParser from deepdoc.parser.docling_parser import DoclingParser from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser from deepdoc.parser.tcadp_parser import TCADPParser from rag.app.naive import Docx from rag.flow.base import ProcessBase, ProcessParamBase +from rag.flow.parser.pdf_chunk_metadata import ( + normalize_pdf_items_metadata, + reorder_multi_column_bboxes, +) from rag.flow.parser.schema import ParserFromUpstream +from rag.flow.parser.utils import ( + enhance_media_sections_with_vision, + extract_word_outlines, + remove_toc, + remove_toc_pdf, + remove_toc_word, +) from rag.llm.cv_model import Base as VLM 
-from rag.nlp import BULLET_PATTERN, bullets_category, docx_question_level, not_bullet from rag.utils.base64_image import image2id -from common.misc_utils import thread_pool_exec - - class ParserParam(ProcessParamBase): def __init__(self): super().__init__() @@ -77,6 +84,14 @@ class ParserParam(ProcessParamBase): "text", "json", ], + "code": [ + "text", + "json", + ], + "html": [ + "text", + "json", + ], "audio": [ "json", ], @@ -91,6 +106,7 @@ class ParserParam(ProcessParamBase): "pdf": { "parse_method": "deepdoc", # deepdoc/plain_text/tcadp_parser/vlm "lang": "Chinese", + "remove_toc": False, "suffix": [ "pdf", ], @@ -106,6 +122,7 @@ class ParserParam(ProcessParamBase): ], }, "word": { + "remove_toc": False, "suffix": [ "doc", "docx", @@ -114,8 +131,32 @@ class ParserParam(ProcessParamBase): }, "text&markdown": { "suffix": ["md", "markdown", "mdx", "txt"], + "remove_toc": False, "output_format": "json", }, + "code": { + "suffix": [ + "py", + "js", + "java", + "c", + "cpp", + "h", + "php", + "go", + "ts", + "sh", + "cs", + "kt", + "sql", + ], + "output_format": "text", + }, + "html": { + "suffix": ["htm", "html"], + "remove_toc": "false", + "output_format": "text", + }, "slides": { "parse_method": "deepdoc", # deepdoc/tcadp_parser "suffix": [ @@ -215,6 +256,16 @@ class ParserParam(ProcessParamBase): text_output_format = text_config.get("output_format", "") self.check_valid_value(text_output_format, "Text output format abnormal.", self.allowed_output_format["text&markdown"]) + code_config = self.setups.get("code", "") + if code_config: + code_output_format = code_config.get("output_format", "") + self.check_valid_value(code_output_format, "Code output format abnormal.", self.allowed_output_format["code"]) + + html_config = self.setups.get("html", "") + if html_config: + html_output_format = html_config.get("output_format", "") + self.check_valid_value(html_output_format, "HTML output format abnormal.", self.allowed_output_format["html"]) + audio_config = 
self.setups.get("audio", "") if audio_config: self.check_empty(audio_config.get("llm_id"), "Audio VLM") @@ -240,91 +291,18 @@ class ParserParam(ProcessParamBase): class Parser(ProcessBase): component_name = "Parser" - @staticmethod - def _extract_word_title_lines(doc, to_page=100000): - lines = [] - if not doc or not getattr(doc, "paragraphs", None): - return lines - - pn = 0 - bull = bullets_category([p.text for p in doc.paragraphs]) - for p in doc.paragraphs: - if pn > to_page: - break - question_level, p_text = docx_question_level(p, bull) - lines.append((question_level, p_text)) - for run in p.runs: - if "lastRenderedPageBreak" in run._element.xml: - pn += 1 - continue - if "w:br" in run._element.xml and 'type="page"' in run._element.xml: - pn += 1 - return lines - - @staticmethod - def _extract_markdown_title_lines(sections): - lines = [] - if not sections: - return lines - - section_texts = [] - for section in sections: - text = section[0] if isinstance(section, tuple) else section - if not isinstance(text, str): - continue - text = text.strip() - if text: - section_texts.append(text) - - if not section_texts: - return lines - - bull = bullets_category(section_texts) - if bull < 0: - return lines - - bullet_patterns = BULLET_PATTERN[bull] - default_level = len(bullet_patterns) + 1 - for text in section_texts: - level = default_level - for idx, pattern in enumerate(bullet_patterns, start=1): - if re.match(pattern, text) and not not_bullet(text): - level = idx - break - lines.append((level, text)) - return lines - - @staticmethod - def _extract_title_texts(lines): - normalized_lines = [] - level_set = set() - for level, txt in lines or []: - if not isinstance(txt, str): - continue - txt = txt.strip() - if not txt: - continue - normalized_lines.append((level, txt)) - level_set.add(level) - - if not normalized_lines or not level_set: - return set() - - sorted_levels = sorted(level_set) - h2_level = sorted_levels[1] if len(sorted_levels) > 1 else 1 - h2_level = 
sorted_levels[-2] if h2_level == sorted_levels[-1] and len(sorted_levels) > 2 else h2_level - - return {txt for level, txt in normalized_lines if level <= h2_level} - def _pdf(self, name, blob, **kwargs): + """Parse PDF files into structured boxes or markdown/json output.""" self.callback(random.randint(1, 5) / 100.0, "Start to work on a PDF.") conf = self._param.setups["pdf"] self.set_output("output_format", conf["output_format"]) + pdf_parser = None - abstract_enabled = "abstract" in self._param.setups["pdf"].get("preprocess", []) - author_enabled = "author" in self._param.setups["pdf"].get("preprocess", []) - title_enabled = "title" in self._param.setups["pdf"].get("preprocess", []) + # Optional PDF post-processing flags applied after parsing. + abstract_enabled = "abstract" in conf.get("preprocess", []) + author_enabled = "author" in conf.get("preprocess", []) + # Normalize parser selection and optional provider-specific model name. raw_parse_method = conf.get("parse_method", "") parser_model_name = None parse_method = raw_parse_method @@ -338,11 +316,21 @@ class Parser(ProcessBase): parser_model_name = raw_parse_method.rsplit("@", 1)[0] parse_method = "PaddleOCR" + # DeepDOC returns structured page boxes directly. if parse_method.lower() == "deepdoc": - bboxes = RAGFlowPdfParser().parse_into_bboxes(blob, callback=self.callback) + pdf_parser = RAGFlowPdfParser() + bboxes = pdf_parser.parse_into_bboxes(blob, callback=self.callback) + if conf.get("enable_multi_column"): + bboxes = reorder_multi_column_bboxes(pdf_parser, bboxes) + + # Plain text only keeps extracted text lines. elif parse_method.lower() == "plain_text": - lines, _ = PlainParser()(blob) - bboxes = [{"text": t} for t, _ in lines] + pdf_parser = PlainParser() + lines, _ = pdf_parser(blob) + bboxes = [{"text": t, "layout_type": "text"} for t, _ in lines] + + # MinerU/PaddleOCR/Docling/TCADP all return line-like sections that need + # to be converted into the shared bbox-like structure used below. 
elif parse_method.lower() == "mineru": def resolve_mineru_llm_name(): @@ -375,47 +363,63 @@ class Parser(ProcessBase): filepath=name, binary=blob, callback=self.callback, - parse_method=conf.get("mineru_parse_method", "raw"), + parse_method="pipeline", lang=conf.get("lang", "Chinese"), ) bboxes = [] - for t, poss in lines: + for line in lines or []: + if not isinstance(line, tuple) or len(line) < 3: + continue + + t, layout_type, poss = line[0], line[1], line[2] box = { - "image": pdf_parser.crop(poss, 1), - "positions": [[pos[0][-1], *pos[1:]] for pos in pdf_parser.extract_positions(poss)], "text": t, + "layout_type": layout_type or "text", } + positions = [[pos[0][-1] + 1, *pos[1:]] for pos in pdf_parser.extract_positions(poss)] + if positions: + box["positions"] = positions + image = pdf_parser.crop(poss, 1) + if image is not None: + box["image"] = image bboxes.append(box) + elif parse_method.lower() == "docling": pdf_parser = DoclingParser(docling_server_url=os.environ.get("DOCLING_SERVER_URL", "")) lines, _ = pdf_parser.parse_pdf( filepath=name, binary=blob, callback=self.callback, - parse_method=conf.get("docling_parse_method", "raw"), + parse_method="pipeline", docling_server_url=os.environ.get("DOCLING_SERVER_URL", ""), ) bboxes = [] - for item in lines: - if not isinstance(item, tuple) or not item: + for item in lines or []: + if not isinstance(item, tuple) or len(item) < 3: continue - text = item[0] - poss = item[-1] if len(item) >= 2 else "" + text, layout_type, poss = item[0], item[1], item[2] box = { "text": text, - "image": pdf_parser.crop(poss, 1) if isinstance(poss, str) and poss else None, - "positions": [[pos[0][-1], *pos[1:]] for pos in pdf_parser.extract_positions(poss)] if isinstance(poss, str) and poss else [], + "layout_type": layout_type or "text", } + if isinstance(poss, str) and poss: + positions = [[pos[0][-1] + 1, *pos[1:]] for pos in pdf_parser.extract_positions(poss)] + if positions: + box["positions"] = positions + image = 
pdf_parser.crop(poss, 1) + if image is not None: + box["image"] = image bboxes.append(box) + elif parse_method.lower() == "tcadp parser": # ADP is a document parsing tool using Tencent Cloud API table_result_type = conf.get("table_result_type", "1") markdown_image_response_type = conf.get("markdown_image_response_type", "1") - tcadp_parser = TCADPParser( + pdf_parser = TCADPParser( table_result_type=table_result_type, markdown_image_response_type=markdown_image_response_type, ) - sections, _ = tcadp_parser.parse_pdf( + sections, _ = pdf_parser.parse_pdf( filepath=name, binary=blob, callback=self.callback, @@ -426,26 +430,25 @@ class Parser(ProcessBase): bboxes = [] for section, position_tag in sections: if position_tag: - # Extract position information from TCADP's position tag - # Format: @@{page_number}\t{x0}\t{x1}\t{top}\t{bottom}## match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag) if match: pn, x0, x1, top, bott = match.groups() bboxes.append( { - "page_number": int(pn.split("-")[0]), # Take the first page number + "page_number": int(pn.split("-")[0]), "x0": float(x0), "x1": float(x1), "top": float(top), "bottom": float(bott), "text": section, + "layout_type": "text", } ) else: - # If no position info, add as text without position - bboxes.append({"text": section}) + bboxes.append({"text": section, "layout_type": "text"}) else: - bboxes.append({"text": section}) + bboxes.append({"text": section, "layout_type": "text"}) + elif parse_method.lower() == "paddleocr": def resolve_paddleocr_llm_name(): @@ -478,54 +481,91 @@ class Parser(ProcessBase): filepath=name, binary=blob, callback=self.callback, - parse_method=conf.get("paddleocr_parse_method", "raw"), + parse_method="pipeline", ) bboxes = [] - for t, poss in lines: - # Get cropped image and positions - cropped_image, positions = pdf_parser.crop(poss, need_position=True) + for line in lines or []: + if not isinstance(line, tuple) or len(line) < 3: + continue + t, 
layout_type, poss = line[0], line[1], line[2] box = { "text": t, - "image": cropped_image, - "positions": positions, + "layout_type": layout_type or "text", } + positions = [[pos[0][-1] + 1, *pos[1:]] for pos in pdf_parser.extract_positions(poss)] + if positions: + box["positions"] = positions + image = pdf_parser.crop(poss) + if image is not None: + box["image"] = image bboxes.append(box) + + # Vision parser treats each page as a large image block. else: if conf.get("parse_method"): vision_model_config = get_model_config_by_type_and_name(self._canvas._tenant_id, LLMType.IMAGE2TEXT, conf["parse_method"]) else: vision_model_config = get_tenant_default_model_by_type(self._canvas._tenant_id, LLMType.IMAGE2TEXT) vision_model = LLMBundle(self._canvas._tenant_id, vision_model_config, lang=self._param.setups["pdf"].get("lang")) - lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback) + pdf_parser = VisionParser(vision_model=vision_model) + lines, _ = pdf_parser(blob, callback=self.callback) bboxes = [] for t, poss in lines: for pn, x0, x1, top, bott in RAGFlowPdfParser.extract_positions(poss): bboxes.append( { - "page_number": int(pn[0]), + "page_number": int(pn[0]) + 1, "x0": float(x0), "x1": float(x1), "top": float(top), "bottom": float(bott), "text": t, + "layout_type": "text", } ) + # Persist outlines and optionally remove TOC before normalizing metadata. 
+ self.set_output("file", {**kwargs.get("file", {}), "outlines": pdf_parser.outlines}) + if conf.get("remove_toc"): + if not pdf_parser.outlines: + bboxes, _ = remove_toc(bboxes) + elif pdf_parser.outlines[0][2] == 1: + bboxes = remove_toc_pdf(bboxes, pdf_parser.outlines) + else: + first_outline_page = pdf_parser.outlines[0][2] + split_at = len(bboxes) + for i, item in enumerate(bboxes): + if item["page_number"] >= first_outline_page: + split_at = i + break + toc_bboxes, _ = remove_toc(bboxes[:split_at]) + bboxes = toc_bboxes + bboxes[split_at:] + + # Normalize shared bbox fields for downstream consumers. + layout_counters = {} for b in bboxes: - text_val = b.get("text", "") - has_text = isinstance(text_val, str) and text_val.strip() - layout = b.get("layout_type") - if layout == "figure" or (b.get("image") and not has_text): - b["doc_type_kwd"] = "image" - elif layout == "table": + raw_layout = str(b.get("layout_type") or "").strip() + has_layout = bool(raw_layout) + layout = re.sub(r"\s+", " ", raw_layout) if has_layout else "text" + b["layout_type"] = layout + + if not b.get("layoutno"): + seq = layout_counters.get(layout, 0) + layout_counters[layout] = seq + 1 + b["layoutno"] = f"{layout}-{seq}" + + if layout == "table": b["doc_type_kwd"] = "table" - if title_enabled and "title" in str(b.get("layout_type", "").lower()): - b["title"] = True + elif layout == "figure": + b["doc_type_kwd"] = "image" + elif not has_layout and b.get("image") is not None: + b["doc_type_kwd"] = "image" + else: + b["doc_type_kwd"] = "text" - # Get authors + # Mark likely author blocks near the title when enabled. if author_enabled: - def _begin(txt): if not isinstance(txt, str): return False @@ -560,7 +600,7 @@ class Parser(ProcessBase): bboxes[next_idx]["author"] = True break - # Get abstract + # Mark the abstract block when enabled. 
if abstract_enabled: i = 0 abstract_idx = None @@ -585,7 +625,19 @@ class Parser(ProcessBase): if abstract_idx is not None: bboxes[abstract_idx]["abstract"] = True + print(conf.get("vlm")) + + if conf.get("vlm"): + enhance_media_sections_with_vision( + bboxes, + self._canvas._tenant_id, + conf["vlm"], + callback=self.callback, + ) + + # Emit the requested final PDF output format. if conf.get("output_format") == "json": + normalize_pdf_items_metadata(bboxes) self.set_output("json", bboxes) if conf.get("output_format") == "markdown": mkdn = "" @@ -599,6 +651,7 @@ class Parser(ProcessBase): self.set_output("markdown", mkdn) def _spreadsheet(self, name, blob, **kwargs): + """Parse spreadsheet files and normalize them into html/json/markdown output.""" self.callback(random.randint(1, 5) / 100.0, "Start to work on a Spreadsheet.") conf = self._param.setups["spreadsheet"] self.set_output("output_format", conf["output_format"]) @@ -653,7 +706,7 @@ class Parser(ProcessBase): # Add sections as text for section, position_tag in sections: if section: - result.append({"text": section}) + result.append({"text": section, "doc_type_kwd": "text"}) # Add tables as text for table in tables: if table: @@ -679,38 +732,63 @@ class Parser(ProcessBase): htmls = spreadsheet_parser.html(blob, 1000000000) self.set_output("html", htmls[0]) elif conf.get("output_format") == "json": - self.set_output("json", [{"text": txt} for txt in spreadsheet_parser(blob) if txt]) + self.set_output("json", [{"text": txt, "doc_type_kwd": "text"} for txt in spreadsheet_parser(blob) if txt]) elif conf.get("output_format") == "markdown": self.set_output("markdown", spreadsheet_parser.markdown(blob)) def _word(self, name, blob, **kwargs): + """Parse doc/docx files and optionally remove table-of-contents content.""" self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document") conf = self._param.setups["word"] self.set_output("output_format", conf["output_format"]) docx_parser = Docx() 
+ # Extract heading-based outlines for metadata and TOC removal. + outlines = extract_word_outlines(name, blob) + self.set_output("file", {**kwargs.get("file", {}), "outlines": outlines}) + + # JSON output keeps text/image blocks and appends table HTML as table items. if conf.get("output_format") == "json": main_sections = docx_parser(name, binary=blob) - title_lines = self._extract_word_title_lines(getattr(docx_parser, "doc", None)) - title_texts = self._extract_title_texts(title_lines) + if conf.get("remove_toc"): + main_sections = remove_toc_word(main_sections, outlines) sections = [] - tbls = [] for text, image, html in main_sections: - section = {"text": text, "image": image} - text_key = text.strip() if isinstance(text, str) else "" - if text_key and text_key in title_texts and "title" in self._param.setups["word"].get("preprocess", []): - section["title"] = True - sections.append(section) - tbls.append(((None, html), "")) - - sections.extend([{"text": tb, "image": None, "doc_type_kwd": "table"} for ((_, tb), _) in tbls]) + sections.append( + { + "text": text, + "image": image, + "doc_type_kwd": "image" if image is not None else "text", + } + ) + if html: + sections.append( + { + "text": html, + "image": None, + "doc_type_kwd": "table", + } + ) + if conf.get("vlm"): + enhance_media_sections_with_vision( + sections, + self._canvas._tenant_id, + conf["vlm"], + callback=self.callback, + ) self.set_output("json", sections) + + # Markdown output removes TOC on plain markdown lines before writing back. 
elif conf.get("output_format") == "markdown": markdown_text = docx_parser.to_markdown(name, binary=blob) + if conf.get("remove_toc"): + markdown_text = "\n".join(remove_toc_word(markdown_text.split("\n"), outlines)) + self.set_output("markdown", markdown_text) def _slides(self, name, blob, **kwargs): + """Parse presentation files into json sections.""" self.callback(random.randint(1, 5) / 100.0, "Start to work on a PowerPoint Document") conf = self._param.setups["slides"] @@ -754,7 +832,7 @@ class Parser(ProcessBase): # Add sections as text for section, position_tag in sections: if section: - result.append({"text": section}) + result.append({"text": section, "doc_type_kwd": "text"}) # Add tables as text for table in tables: if table: @@ -768,7 +846,7 @@ class Parser(ProcessBase): ppt_parser = ppt_parser() txts = ppt_parser(blob, 0, 100000, None) - sections = [{"text": section} for section in txts if section.strip()] + sections = [{"text": section, "doc_type_kwd": "text"} for section in txts if section.strip()] # json assert conf.get("output_format") == "json", "have to be json for ppt" @@ -776,6 +854,7 @@ class Parser(ProcessBase): self.set_output("json", sections) def _markdown(self, name, blob, **kwargs): + """Parse markdown and txt files into text/json sections.""" from functools import reduce from rag.app.naive import Markdown as naive_markdown_parser @@ -793,19 +872,18 @@ class Parser(ProcessBase): delimiter=conf.get("delimiter"), return_section_images=True, ) + if name.lower().endswith(".txt") and conf.get("remove_toc") == "true": + sections, kept_indices = remove_toc(sections) + if section_images: + section_images = [section_images[i] for i in kept_indices if i < len(section_images)] if conf.get("output_format") == "json": json_results = [] - title_lines = self._extract_markdown_title_lines(sections) - title_texts = self._extract_title_texts(title_lines) for idx, (section_text, _) in enumerate(sections): json_result = { "text": section_text, } - text_key = 
section_text.strip() if isinstance(section_text, str) else "" - if text_key and text_key in title_texts and "title" in self._param.setups["text&markdown"].get("preprocess", []): - json_result["title"] = True images = [] if section_images and len(section_images) > idx and section_images[idx] is not None: @@ -814,14 +892,55 @@ class Parser(ProcessBase): # If multiple images found, combine them using concat_img combined_image = reduce(concat_img, images) if len(images) > 1 else images[0] json_result["image"] = combined_image - + json_result["doc_type_kwd"] = "image" if json_result.get("image") is not None else "text" json_results.append(json_result) + if conf.get("vlm"): + enhance_media_sections_with_vision( + json_results, + self._canvas._tenant_id, + conf["vlm"], + callback=self.callback, + ) self.set_output("json", json_results) else: self.set_output("text", "\n".join([section_text for section_text, _ in sections])) + def _code(self, name, blob, **kwargs): + """Parse source code files as plain text chunks.""" + self.callback(random.randint(1, 5) / 100.0, "Start to work on a code or plain text file.") + conf = self._param.setups["code"] + self.set_output("output_format", conf["output_format"]) + + sections = TxtParser()( + name, + blob, + conf.get("chunk_token_num", 128), + conf.get("delimiter", "\n!?;。;!?"), + ) + if conf.get("output_format") == "json": + self.set_output("json", [{"text": section[0], "doc_type_kwd": "text"} for section in sections if section[0]]) + return + + self.set_output("text", "\n".join([section[0] for section in sections if section[0]])) + + def _html(self, name, blob, **kwargs): + """Parse HTML files into text/json sections.""" + self.callback(random.randint(1, 5) / 100.0, "Start to work on an HTML document.") + conf = self._param.setups["html"] + self.set_output("output_format", conf["output_format"]) + + sections = HtmlParser()(name, blob, int(conf.get("chunk_token_num", 512))) + if conf.get("remove_toc") == "true": + sections, _ = 
remove_toc(sections) + if conf.get("output_format") == "json": + self.set_output("json", [{"text": section, "doc_type_kwd": "text"} for section in sections if section]) + return + + self.set_output("text", "\n".join([section for section in sections if section])) + def _image(self, name, blob, **kwargs): + """Parse images with OCR or image-to-text models.""" from deepdoc.vision import OCR self.callback(random.randint(1, 5) / 100.0, "Start to work on an image.") @@ -860,6 +979,7 @@ class Parser(ProcessBase): self.set_output("json", json_result) def _audio(self, name, blob, **kwargs): + """Parse audio files with speech-to-text models.""" import os import tempfile @@ -879,6 +999,7 @@ class Parser(ProcessBase): self.set_output("text", txt) def _video(self, name, blob, **kwargs): + """Parse video files with image-to-text models.""" self.callback(random.randint(1, 5) / 100.0, "Start to work on an video.") conf = self._param.setups["video"] @@ -891,6 +1012,7 @@ class Parser(ProcessBase): self.set_output("text", txt) def _email(self, name, blob, **kwargs): + """Parse eml/msg files into structured email content.""" self.callback(random.randint(1, 5) / 100.0, "Start to work on an email.") email_content = {} @@ -970,7 +1092,6 @@ class Parser(ProcessBase): # handle msg file import extract_msg - print("handle a msg file.") msg = extract_msg.Message(blob) # handle header info basic_content = { @@ -1005,6 +1126,7 @@ class Parser(ProcessBase): email_content["attachments"] = attachments if conf["output_format"] == "json": + email_content["doc_type_kwd"] = "text" self.set_output("json", [email_content]) else: content_txt = "" @@ -1027,6 +1149,7 @@ class Parser(ProcessBase): self.set_output("text", content_txt) def _epub(self, name, blob, **kwargs): + """Parse EPUB files into text/json sections.""" from deepdoc.parser import EpubParser self.callback(random.randint(1, 5) / 100.0, "Start to work on an EPUB.") @@ -1037,15 +1160,18 @@ class Parser(ProcessBase): sections = 
epub_parser(name, binary=blob) if conf.get("output_format") == "json": - json_results = [{"text": s} for s in sections if s] + json_results = [{"text": s, "doc_type_kwd": "text"} for s in sections if s] self.set_output("json", json_results) else: self.set_output("text", "\n".join(s for s in sections if s)) async def _invoke(self, **kwargs): + """Dispatch the current file to the matching parser branch by suffix.""" function_map = { "pdf": self._pdf, "text&markdown": self._markdown, + "code": self._code, + "html": self._html, "spreadsheet": self._spreadsheet, "slides": self._slides, "word": self._word, diff --git a/rag/flow/parser/pdf_chunk_metadata.py b/rag/flow/parser/pdf_chunk_metadata.py new file mode 100644 index 0000000000..175ac3772e --- /dev/null +++ b/rag/flow/parser/pdf_chunk_metadata.py @@ -0,0 +1,348 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import io +import logging +import sys +from copy import deepcopy +from functools import partial + +import numpy as np +import pdfplumber +from PIL import Image + +from api.db.services.file2document_service import File2DocumentService +from api.db.services.file_service import FileService +from common import settings +from common.misc_utils import get_uuid +from deepdoc.parser.pdf_parser import LOCK_KEY_pdfplumber, RAGFlowPdfParser +from rag.utils.base64_image import image2id + +PDF_PREVIEW_GAP = 6 +PDF_PREVIEW_CONTEXT = 120 +PDF_PREVIEW_ZOOM = 3 +PDF_POSITIONS_KEY = "_pdf_positions" +PDF_MULTI_COLUMN_ZOOM = 3 + + +def _extract_raw_positions(item): + positions = item.get(PDF_POSITIONS_KEY) + if isinstance(positions, list): + return deepcopy(positions) + + positions = item.get("positions") + if isinstance(positions, list): + return deepcopy(positions) + + position_tag = item.get("position_tag") + if isinstance(position_tag, str) and position_tag: + return [[pos[0][-1], *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(position_tag)] + + position_int = item.get("position_int") + if isinstance(position_int, list): + return [ + list(pos) + for pos in position_int + if isinstance(pos, (list, tuple)) and len(pos) >= 5 + ] + + if item.get("page_number") is not None and all( + item.get(key) is not None for key in ["x0", "x1", "top", "bottom"] + ): + return [[item["page_number"], item["x0"], item["x1"], item["top"], item["bottom"]]] + + return [] + + +def extract_pdf_positions(item): + # Parser-owned canonical PDF coordinate shape: + # [[page_number, left, right, top, bottom], ...] 
+ if not isinstance(item, dict): + return [] + + positions = _extract_raw_positions(item) + ref_page_number = item.get("page_number") + ref_page_number = int(ref_page_number) if isinstance(ref_page_number, (int, float)) else None + if ref_page_number is not None and ref_page_number <= 0: + ref_page_number += 1 + + normalized_positions = [] + for pos in positions: + if not isinstance(pos, (list, tuple)) or len(pos) < 5: + continue + + page_number = pos[0][-1] if isinstance(pos[0], list) else pos[0] + try: + page_number = int(page_number) + if ref_page_number is not None and page_number == ref_page_number - 1: + page_number = ref_page_number + elif page_number <= 0: + page_number += 1 + + normalized_positions.append( + [page_number, float(pos[1]), float(pos[2]), float(pos[3]), float(pos[4])] + ) + except (TypeError, ValueError): + continue + + return normalized_positions + + +def normalize_pdf_item_metadata(item): + if not isinstance(item, dict): + return item + + positions = extract_pdf_positions(item) + if positions: + item[PDF_POSITIONS_KEY] = positions + else: + item.pop(PDF_POSITIONS_KEY, None) + return item + + +def normalize_pdf_items_metadata(items): + if not isinstance(items, list): + return items + for item in items: + normalize_pdf_item_metadata(item) + return items + + +def reorder_multi_column_bboxes(pdf_parser, bboxes, zoom=PDF_MULTI_COLUMN_ZOOM): + text_boxes = [ + box + for box in bboxes + if box.get("layout_type") == "text" + and all(box.get(key) is not None for key in ["x0", "x1", "page_number"]) + ] + if not text_boxes or not pdf_parser.page_images: + return bboxes + + column_width = np.median([box["x1"] - box["x0"] for box in text_boxes]) + page_width = pdf_parser.page_images[0].size[0] / zoom + if column_width >= page_width / 2: + return bboxes + + return pdf_parser.sort_X_by_page(bboxes, column_width / 2) + + +def merge_pdf_positions(sources): + merged = [] + seen = set() + for source in sources or []: + if isinstance(source, dict): + positions 
= extract_pdf_positions(source) + elif isinstance(source, list): + positions = source + else: + positions = [] + + for pos in positions: + if not isinstance(pos, (list, tuple)) or len(pos) < 5: + continue + key = tuple(pos[:5]) + if key in seen: + continue + seen.add(key) + merged.append(list(pos[:5])) + + merged.sort(key=lambda item: (item[0], item[3], item[1])) + return merged + + +def build_pdf_position_fields(positions): + position_int = [] + page_num_int = [] + top_int = [] + for pos in positions or []: + if not isinstance(pos, (list, tuple)) or len(pos) < 5: + continue + try: + page_no = int(pos[0]) + left = int(pos[1]) + right = int(pos[2]) + top = int(pos[3]) + bottom = int(pos[4]) + except (TypeError, ValueError): + continue + + position_int.append((page_no, left, right, top, bottom)) + page_num_int.append(page_no) + top_int.append(top) + + return { + "position_int": deepcopy(position_int), + "page_num_int": deepcopy(page_num_int), + "top_int": deepcopy(top_int), + } + + +def finalize_pdf_chunk(chunk): + if not isinstance(chunk, dict): + return chunk + + positions = extract_pdf_positions(chunk) + if positions: + chunk.update(build_pdf_position_fields(positions)) + chunk.pop(PDF_POSITIONS_KEY, None) + return chunk + + +def _fetch_source_blob(from_upstream, canvas): + if canvas._doc_id: + bucket, name = File2DocumentService.get_storage_address(doc_id=canvas._doc_id) + return settings.STORAGE_IMPL.get(bucket, name) + if from_upstream.file: + return FileService.get_blob(from_upstream.file["created_by"], from_upstream.file["id"]) + return None + + +def _load_pdf_page_images(blob, zoom=PDF_PREVIEW_ZOOM): + with sys.modules[LOCK_KEY_pdfplumber]: + with pdfplumber.open(io.BytesIO(blob)) as pdf: + return [ + page.to_image(resolution=72 * zoom, antialias=True).annotated + for page in pdf.pages + ] + + +def _crop_pdf_preview(page_images, positions, zoom=PDF_PREVIEW_ZOOM): + if not page_images or not positions: + return None + + normalized_positions = [] + for pos in 
sorted(positions, key=lambda item: (item[0], item[3], item[1])): + if len(pos) < 5: + continue + + page_idx = int(pos[0]) - 1 + if not (0 <= page_idx < len(page_images)): + continue + + left, right, top, bottom = map(float, pos[1:5]) + if right <= left or bottom <= top: + continue + normalized_positions.append((page_idx, left, right, top, bottom)) + + if not normalized_positions: + return None + + max_width = max(right - left for _, left, right, _, _ in normalized_positions) + first_page, first_left, _, first_top, _ = normalized_positions[0] + last_page, last_left, _, _, last_bottom = normalized_positions[-1] + def page_height(idx): + return page_images[idx].size[1] / zoom + + crop_positions = [ + ( + [first_page], + first_left, + first_left + max_width, + max(0, first_top - PDF_PREVIEW_CONTEXT), + max(first_top - PDF_PREVIEW_GAP, 0), + ) + ] + crop_positions.extend( + [ + ([page_idx], left, right, top, bottom) + for page_idx, left, right, top, bottom in normalized_positions + ] + ) + crop_positions.append( + ( + [last_page], + last_left, + last_left + max_width, + min(page_height(last_page), last_bottom + PDF_PREVIEW_GAP), + min(page_height(last_page), last_bottom + PDF_PREVIEW_CONTEXT), + ) + ) + + imgs = [] + for idx, (pages, left, right, top, bottom) in enumerate(crop_positions): + page_idx = pages[0] + effective_right = ( + left + max_width if idx in {0, len(crop_positions) - 1} else max(left + 10, right) + ) + imgs.append( + page_images[page_idx].crop( + ( + left * zoom, + top * zoom, + effective_right * zoom, + min(bottom * zoom, page_images[page_idx].size[1]), + ) + ) + ) + + canvas_height = int(sum(img.size[1] for img in imgs) + PDF_PREVIEW_GAP * len(imgs)) + canvas_width = int(max(img.size[0] for img in imgs)) + preview = Image.new("RGB", (canvas_width, canvas_height), (245, 245, 245)) + + height = 0 + for idx, img in enumerate(imgs): + if idx in {0, len(imgs) - 1}: + # Dim the extra context so the highlighted body stays visually distinct. 
+ img = img.convert("RGBA") + overlay = Image.new("RGBA", img.size, (0, 0, 0, 0)) + overlay.putalpha(128) + img = Image.alpha_composite(img, overlay).convert("RGB") + + preview.paste(img, (0, height)) + height += img.size[1] + PDF_PREVIEW_GAP + + return preview + + +async def restore_pdf_text_previews(chunks, from_upstream, canvas): + if not chunks or not str(from_upstream.name).lower().endswith(".pdf"): + return + + text_chunks = [ + chunk + for chunk in chunks + if chunk.get("doc_type_kwd", "text") == "text" and extract_pdf_positions(chunk) + ] + if not text_chunks: + return + + blob = _fetch_source_blob(from_upstream, canvas) + if not blob: + return + + try: + page_images = _load_pdf_page_images(blob) + except Exception as e: + logging.warning(f"Failed to load PDF page images for chunk preview restore: {e}") + return + + preview_cache = {} + storage_put = partial(settings.STORAGE_IMPL.put, tenant_id=canvas._tenant_id) + for chunk in text_chunks: + preview_positions = extract_pdf_positions(chunk) + positions_key = tuple(tuple(pos[:5]) for pos in preview_positions) + if not positions_key: + continue + if positions_key in preview_cache: + chunk["img_id"] = preview_cache[positions_key] + continue + + preview = _crop_pdf_preview(page_images, preview_positions) + if not preview: + continue + + chunk["image"] = preview + await image2id(chunk, storage_put, get_uuid()) + if chunk.get("img_id"): + preview_cache[positions_key] = chunk["img_id"] diff --git a/rag/flow/parser/utils.py b/rag/flow/parser/utils.py new file mode 100644 index 0000000000..b897ab1704 --- /dev/null +++ b/rag/flow/parser/utils.py @@ -0,0 +1,173 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import re +from io import BytesIO + +from docx import Document +from api.db.services.llm_service import LLMBundle +from api.db.joint_services.tenant_model_service import ( + get_model_config_by_type_and_name, + get_tenant_default_model_by_type, +) +from common.constants import LLMType +from deepdoc.parser.figure_parser import VisionFigureParser +from rag.nlp import is_english, random_choices, remove_contents_table + + +def remove_toc(items): + indexed = [(_item_text(item), i) for i, item in enumerate(items)] + remove_contents_table(indexed, eng=_is_english(indexed)) + kept_indices = [i for _, i in indexed] + return [items[i] for i in kept_indices], kept_indices + + +def extract_word_outlines(filename, binary=None): + doc = Document(filename) if binary is None else Document(BytesIO(binary)) + outlines = [] + for paragraph in doc.paragraphs: + text = paragraph.text.strip() + if not text: + continue + style_name = paragraph.style.name if paragraph.style else "" + match = re.search(r"Heading\s*(\d+)", style_name, re.I) + if not match: + continue + outlines.append((text, int(match.group(1)) - 1, None)) + return outlines + + +def remove_toc_pdf(items, outlines): + if not outlines: + return items + + toc_start_page = None + content_start_page = None + for i, (title, level, page_no) in enumerate(outlines): + if re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", title.split("@@")[0].strip().lower()): + toc_start_page = page_no + for next_title, next_level, next_page_no in outlines[i + 1:]: + if next_level != level: + continue + if 
re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", next_title.split("@@")[0].strip().lower()): + continue + content_start_page = next_page_no + break + break + + if content_start_page: + return [item for item in items if not (toc_start_page <= item["page_number"] < content_start_page)] + return items + + +def remove_toc_word(items, outlines): + if not outlines: + filtered_items, _ = remove_toc(items) + return filtered_items + outline_titles = [title.split("@@")[0].strip().lower() for title, _, _ in outlines if title] + if outline_titles: + indexed = [(_item_text(item), i) for i, item in enumerate(items)] + i = 0 + while i < len(indexed): + if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", indexed[i][0].split("@@")[0].strip().lower()): + i += 1 + continue + indexed.pop(i) + while i < len(indexed): + text = indexed[i][0] + normalized = text.split("@@")[0].strip().lower() + if not normalized: + indexed.pop(i) + continue + if any(normalized.startswith(title) or title.startswith(normalized) for title in outline_titles): + indexed.pop(i) + continue + if re.search(r"(\.{2,}|…{2,}|·{2,}|[ ]{2,})\s*\d+\s*$", text): + indexed.pop(i) + continue + break + break + items = [items[i] for _, i in indexed] + filtered_items, _ = remove_toc(items) + return filtered_items + + +def _item_text(item): + if isinstance(item, str): + return item + if isinstance(item, dict): + return item["text"] + return item[0] + + +def _is_english(indexed): + texts = [text for text, _ in indexed if text] + if not texts: + return False + return is_english(random_choices(texts, k=200)) + + +def enhance_media_sections_with_vision( + sections, + tenant_id, + vlm_conf=None, + callback=None, +): + if not sections or not tenant_id or not vlm_conf: + return sections + + try: + try: + vision_model_config = get_model_config_by_type_and_name( + tenant_id, LLMType.IMAGE2TEXT, vlm_conf["llm_id"] + ) + except Exception: + vision_model_config = get_tenant_default_model_by_type( + 
tenant_id, LLMType.IMAGE2TEXT + ) + vision_model = LLMBundle(tenant_id, vision_model_config) + except Exception: + return sections + + for item in sections: + if item.get("doc_type_kwd") not in {"image", "table"}: + continue + if item.get("image") is None: + continue + + text = item.get("text") or "" + try: + parsed = VisionFigureParser( + vision_model=vision_model, + figures_data=[((item["image"], [""]), [(0, 0, 0, 0, 0)])], + context_size=0, + )(callback=callback) + except Exception: + continue + + if not parsed: + continue + + # VisionFigureParser returns [((image, text_or_text_list), positions), ...]. + first_result = parsed[0] + # first_result[0] is the (image, parsed_text) tuple. + image_and_text = first_result[0] + # image_and_text[1] is the parsed text content. + parsed_text = str(image_and_text[1] or "").strip() + + if parsed_text: + item["text"] = f"{text}\n{parsed_text}" if text else parsed_text + + return sections diff --git a/rag/flow/splitter/splitter.py b/rag/flow/splitter/splitter.py deleted file mode 100644 index 3099681174..0000000000 --- a/rag/flow/splitter/splitter.py +++ /dev/null @@ -1,173 +0,0 @@ -# -# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import asyncio -import logging -import random -import re -from copy import deepcopy -from functools import partial -from common.misc_utils import get_uuid -from rag.utils.base64_image import id2image, image2id -from deepdoc.parser.pdf_parser import RAGFlowPdfParser -from rag.flow.base import ProcessBase, ProcessParamBase -from rag.flow.splitter.schema import SplitterFromUpstream -from common.float_utils import normalize_overlapped_percent -from rag.nlp import attach_media_context, naive_merge, naive_merge_with_images -from common import settings - - -class SplitterParam(ProcessParamBase): - def __init__(self): - super().__init__() - self.chunk_token_size = 512 - self.delimiters = ["\n"] - self.overlapped_percent = 0 - self.children_delimiters = [] - self.table_context_size = 0 - self.image_context_size = 0 - - def check(self): - self.check_empty(self.delimiters, "Delimiters.") - self.check_positive_integer(self.chunk_token_size, "Chunk token size.") - self.check_decimal_float(self.overlapped_percent, "Overlapped percentage: [0, 1)") - self.check_nonnegative_number(self.table_context_size, "Table context size.") - self.check_nonnegative_number(self.image_context_size, "Image context size.") - - def get_input_form(self) -> dict[str, dict]: - return {} - - -class Splitter(ProcessBase): - component_name = "Splitter" - - async def _invoke(self, **kwargs): - try: - from_upstream = SplitterFromUpstream.model_validate(kwargs) - except Exception as e: - self.set_output("_ERROR", f"Input error: {str(e)}") - return - - deli = "" - for d in self._param.delimiters: - if len(d) > 1: - deli += f"`{d}`" - else: - deli += d - custom_pattern = "|".join(re.escape(t) for t in sorted(set(self._param.children_delimiters), key=len, reverse=True)) - - self.set_output("output_format", "chunks") - self.callback(random.randint(1, 5) / 100.0, "Start to split into chunks.") - overlapped_percent = normalize_overlapped_percent(self._param.overlapped_percent) - if from_upstream.output_format in 
["markdown", "text", "html"]: - if from_upstream.output_format == "markdown": - payload = from_upstream.markdown_result - elif from_upstream.output_format == "text": - payload = from_upstream.text_result - else: # == "html" - payload = from_upstream.html_result - - if not payload: - payload = "" - - cks = naive_merge( - payload, - self._param.chunk_token_size, - deli, - overlapped_percent, - ) - if custom_pattern: - docs = [] - for c in cks: - if not c.strip(): - continue - split_sec = re.split(r"(%s)" % custom_pattern, c, flags=re.DOTALL) - if split_sec: - for j in range(0, len(split_sec), 2): - if not split_sec[j].strip(): - continue - docs.append({ - "text": split_sec[j], - "mom": c - }) - else: - docs.append({"text": c}) - self.set_output("chunks", docs) - else: - self.set_output("chunks", [{"text": c.strip()} for c in cks if c.strip()]) - - self.callback(1, "Done.") - return - - # json - json_result = from_upstream.json_result or [] - if self._param.table_context_size or self._param.image_context_size: - for ck in json_result: - if "image" not in ck and ck.get("img_id") and not (isinstance(ck.get("text"), str) and ck.get("text").strip()): - ck["image"] = True - attach_media_context(json_result, self._param.table_context_size, self._param.image_context_size) - for ck in json_result: - if ck.get("image") is True: - del ck["image"] - - sections, section_images = [], [] - for o in json_result: - sections.append((o.get("text", ""), o.get("position_tag", ""))) - section_images.append(id2image(o.get("img_id"), partial(settings.STORAGE_IMPL.get, tenant_id=self._canvas._tenant_id))) - - chunks, images = naive_merge_with_images( - sections, - section_images, - self._param.chunk_token_size, - deli, - overlapped_percent, - ) - cks = [ - { - "text": RAGFlowPdfParser.remove_tag(c), - "image": img, - "positions": [[pos[0][-1], *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(c)] - } - for c, img in zip(chunks, images) if c.strip() - ] - tasks = [] - for d in cks: - 
tasks.append(asyncio.create_task(image2id(d, partial(settings.STORAGE_IMPL.put, tenant_id=self._canvas._tenant_id), get_uuid()))) - try: - await asyncio.gather(*tasks, return_exceptions=False) - except Exception as e: - logging.error(f"error when splitting: {e}") - for t in tasks: - t.cancel() - await asyncio.gather(*tasks, return_exceptions=True) - raise - - if custom_pattern: - docs = [] - for c in cks: - split_sec = re.split(r"(%s)" % custom_pattern, c["text"], flags=re.DOTALL) - if split_sec: - c["mom"] = c["text"] - for j in range(0, len(split_sec), 2): - if not split_sec[j].strip(): - continue - cc = deepcopy(c) - cc["text"] = split_sec[j] - docs.append(cc) - else: - docs.append(c) - self.set_output("chunks", docs) - else: - self.set_output("chunks", cks) - self.callback(1, "Done.") diff --git a/rag/flow/tests/dsl_examples/general_pdf_all.json b/rag/flow/tests/dsl_examples/general_pdf_all.json index 40f796af6b..2a05d3b5c5 100644 --- a/rag/flow/tests/dsl_examples/general_pdf_all.json +++ b/rag/flow/tests/dsl_examples/general_pdf_all.json @@ -109,12 +109,12 @@ } } }, - "downstream": ["Splitter:0"], + "downstream": ["TokenChunker:0"], "upstream": ["Begin"] }, - "Splitter:0": { + "TokenChunker:0": { "obj": { - "component_name": "Splitter", + "component_name": "TokenChunker", "params": { "chunk_token_size": 512, "delimiters": ["\n"], @@ -131,9 +131,8 @@ } }, "downstream": [], - "upstream": ["Chunker:0"] + "upstream": ["TokenChunker:0"] } }, "path": [] } - diff --git a/rag/flow/tests/dsl_examples/hierarchical_merger.json b/rag/flow/tests/dsl_examples/title_chunker.json similarity index 87% rename from rag/flow/tests/dsl_examples/hierarchical_merger.json rename to rag/flow/tests/dsl_examples/title_chunker.json index 98df8a937d..e5a3be9f86 100644 --- a/rag/flow/tests/dsl_examples/hierarchical_merger.json +++ b/rag/flow/tests/dsl_examples/title_chunker.json @@ -52,33 +52,32 @@ } } }, - "downstream": ["Splitter:0"], + "downstream": ["TokenChunker:0"], "upstream": 
["File"] }, - "Splitter:0": { + "TokenChunker:0": { "obj": { - "component_name": "Splitter", + "component_name": "TokenChunker", "params": { "chunk_token_size": 512, "delimiters": ["\r\n"], "overlapped_percent": 0 } }, - "downstream": ["HierarchicalMerger:0"], + "downstream": ["TitleChunker:0"], "upstream": ["Parser:0"] }, - "HierarchicalMerger:0": { + "TitleChunker:0": { "obj": { - "component_name": "HierarchicalMerger", + "component_name": "TitleChunker", "params": { "levels": [["^#[^#]"], ["^##[^#]"], ["^###[^#]"], ["^####[^#]"]], "hierarchy": 2 } }, "downstream": [], - "upstream": ["Splitter:0"] + "upstream": ["TokenChunker:0"] } }, "path": [] } - diff --git a/rag/flow/tokenizer/tokenizer.py b/rag/flow/tokenizer/tokenizer.py index 0d213c512e..ea2e59aec4 100644 --- a/rag/flow/tokenizer/tokenizer.py +++ b/rag/flow/tokenizer/tokenizer.py @@ -24,6 +24,7 @@ from api.db.services.llm_service import LLMBundle from api.db.joint_services.tenant_model_service import get_tenant_default_model_by_type, get_model_config_by_id, get_model_config_by_type_and_name from common.connection_utils import timeout from rag.flow.base import ProcessBase, ProcessParamBase +from rag.flow.parser.pdf_chunk_metadata import finalize_pdf_chunk from rag.flow.tokenizer.schema import TokenizerFromUpstream from rag.nlp import rag_tokenizer from common import settings @@ -123,6 +124,7 @@ class Tokenizer(ProcessBase): if from_upstream.chunks: chunks = from_upstream.chunks for i, ck in enumerate(chunks): + ck["chunk_order_int"] = i ck["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", from_upstream.name)) ck["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(ck["title_tks"]) if ck.get("questions"): @@ -183,4 +185,6 @@ class Tokenizer(ProcessBase): self.callback(1.0, "Finish embedding.") + chunks = [finalize_pdf_chunk(ck) for ck in chunks] + self.set_output("chunks", chunks) diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index f95cc8266b..b6a02f2713 100644 --- 
a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -1336,7 +1336,7 @@ def _build_cks(sections, delimiter): # ③ normal text content → accumulate seg += sub_sec else: - # no custom delimiter: emit the text as a single chunk + if text and text.strip(): t = text.strip() cks.append({ diff --git a/rag/nlp/search.py b/rag/nlp/search.py index bca8fb85ba..b68ca86e7e 100644 --- a/rag/nlp/search.py +++ b/rag/nlp/search.py @@ -90,7 +90,7 @@ class Dealer: src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int", - "doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd", + "doc_id", "chunk_order_int", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd", "question_kwd", "question_tks", "doc_type_kwd", "available_int", "content_with_weight", "mom_id", PAGERANK_FLD, TAG_FLD, "row_id()"]) kwds = set([]) @@ -99,6 +99,7 @@ class Dealer: q_vec = [] if not qst: if req.get("sort"): + orderBy.asc("chunk_order_int") orderBy.asc("page_num_int") orderBy.asc("top_int") orderBy.desc("create_timestamp_flt") diff --git a/test/testcases/test_http_api/test_session_management/test_session_sdk_routes_unit.py b/test/testcases/test_http_api/test_session_management/test_session_sdk_routes_unit.py index df28d68cbc..dcbe105e37 100644 --- a/test/testcases/test_http_api/test_session_management/test_session_sdk_routes_unit.py +++ b/test/testcases/test_http_api/test_session_management/test_session_sdk_routes_unit.py @@ -469,6 +469,7 @@ def _load_session_module(monkeypatch): agent_pkg = ModuleType("agent") agent_pkg.__path__ = [] agent_canvas_mod = ModuleType("agent.canvas") + agent_dsl_migration_mod = ModuleType("agent.dsl_migration") class _StubCanvas: def __init__(self, *_args, **_kwargs): @@ -489,10 +490,13 @@ def _load_session_module(monkeypatch): def __str__(self): return self._dsl + agent_dsl_migration_mod.normalize_chunker_dsl = lambda dsl: dsl agent_canvas_mod.Canvas = _StubCanvas 
agent_pkg.canvas = agent_canvas_mod + agent_pkg.dsl_migration = agent_dsl_migration_mod monkeypatch.setitem(sys.modules, "agent", agent_pkg) monkeypatch.setitem(sys.modules, "agent.canvas", agent_canvas_mod) + monkeypatch.setitem(sys.modules, "agent.dsl_migration", agent_dsl_migration_mod) module_path = repo_root / "api" / "apps" / "sdk" / "session.py" spec = importlib.util.spec_from_file_location("test_session_sdk_routes_unit_module", module_path) diff --git a/test/testcases/test_web_api/test_canvas_app/test_canvas_routes_unit.py b/test/testcases/test_web_api/test_canvas_app/test_canvas_routes_unit.py index a97fa4e8c0..de1fb91d37 100644 --- a/test/testcases/test_web_api/test_canvas_app/test_canvas_routes_unit.py +++ b/test/testcases/test_web_api/test_canvas_app/test_canvas_routes_unit.py @@ -442,7 +442,10 @@ def _load_canvas_module(monkeypatch): agent_pkg = ModuleType("agent") agent_pkg.__path__ = [] + agent_dsl_migration_mod = ModuleType("agent.dsl_migration") + agent_dsl_migration_mod.normalize_chunker_dsl = lambda dsl: dsl monkeypatch.setitem(sys.modules, "agent", agent_pkg) + monkeypatch.setitem(sys.modules, "agent.dsl_migration", agent_dsl_migration_mod) agent_component_mod = ModuleType("agent.component") @@ -450,6 +453,7 @@ def _load_canvas_module(monkeypatch): pass agent_component_mod.LLM = _StubLLM + agent_pkg.component = agent_component_mod monkeypatch.setitem(sys.modules, "agent.component", agent_component_mod) agent_canvas_mod = ModuleType("agent.canvas") @@ -479,6 +483,8 @@ def _load_canvas_module(monkeypatch): return "{}" agent_canvas_mod.Canvas = _StubCanvas + agent_pkg.canvas = agent_canvas_mod + agent_pkg.dsl_migration = agent_dsl_migration_mod monkeypatch.setitem(sys.modules, "agent.canvas", agent_canvas_mod) quart_mod = ModuleType("quart") diff --git a/web/src/components/llm-setting-items/llm-form-field.tsx b/web/src/components/llm-setting-items/llm-form-field.tsx index 3199b69d78..1846ef99aa 100644 --- 
a/web/src/components/llm-setting-items/llm-form-field.tsx +++ b/web/src/components/llm-setting-items/llm-form-field.tsx @@ -9,6 +9,7 @@ export type LLMFormFieldProps = { name?: string; testId?: string; optionTestIdPrefix?: string; + config?: any; }; export const useModelOptions = () => { @@ -26,6 +27,7 @@ export function LLMFormField({ name, testId, optionTestIdPrefix, + config, }: LLMFormFieldProps) { const { t } = useTranslation(); const { modelOptions } = useModelOptions(); @@ -36,6 +38,7 @@ export function LLMFormField({ options={options || modelOptions} testId={testId} optionTestIdPrefix={optionTestIdPrefix} + {...config} > ); diff --git a/web/src/constants/agent.tsx b/web/src/constants/agent.tsx index cdb1df9154..d5755e2e03 100644 --- a/web/src/constants/agent.tsx +++ b/web/src/constants/agent.tsx @@ -70,8 +70,8 @@ export enum DataflowOperator { Note = 'Note', Parser = 'Parser', Tokenizer = 'Tokenizer', - Splitter = 'Splitter', - HierarchicalMerger = 'HierarchicalMerger', + TokenChunker = 'TokenChunker', + TitleChunker = 'TitleChunker', Extractor = 'Extractor', } @@ -117,8 +117,8 @@ export enum Operator { File = 'File', // pipeline Parser = 'Parser', Tokenizer = 'Tokenizer', - Splitter = 'Splitter', - HierarchicalMerger = 'HierarchicalMerger', + TokenChunker = 'TokenChunker', + TitleChunker = 'TitleChunker', Extractor = 'Extractor', Loop = 'Loop', LoopStart = 'LoopItem', diff --git a/web/src/locales/ar.ts b/web/src/locales/ar.ts index 60e6767577..5ad61a7b82 100644 --- a/web/src/locales/ar.ts +++ b/web/src/locales/ar.ts @@ -2063,12 +2063,10 @@ export default { tokenizerRequired: 'الرجاء إضافة عقدة المفهرس أولاً', tokenizerDescription: 'يحول النص إلى بنية البيانات المطلوبة (على سبيل المثال، التضمينات المتجهة لبحث التضمين) اعتمادًا على طريقة البحث المختارة.', - splitter: 'رمز مميز', - splitterDescription: + tokenChunkerDescription: 'قم بتقسيم النص إلى أجزاء حسب طول الرمز المميز باستخدام محددات وتداخلات اختيارية.', - hierarchicalMergerDescription: + 
titleChunkerDescription: 'قم بتقسيم المستندات إلى أقسام حسب التسلسل الهرمي للعناوين باستخدام قواعد regex للتحكم بشكل أفضل.', - hierarchicalMerger: 'عنوان', extractor: 'محول', extractorDescription: 'استخدم LLM لاستخراج الرؤى المنظمة من أجزاء المستند، مثل الملخصات والتصنيفات وما إلى ذلك.', @@ -2080,6 +2078,8 @@ export default { image: 'صورة', email: 'بريد إلكتروني', 'text&markdown': 'النص والترميز', + code: 'Code', + html: 'HTML', word: 'كلمة', slides: 'PPTX', audio: 'صوتي', diff --git a/web/src/locales/bg.ts b/web/src/locales/bg.ts index 31ed2650f1..b976eeeb44 100644 --- a/web/src/locales/bg.ts +++ b/web/src/locales/bg.ts @@ -2118,12 +2118,10 @@ The above is the content you need to summarize.`, tokenizerRequired: 'Моля, първо добавете възел Индексатор', tokenizerDescription: 'Трансформира текст в необходимата структура от данни (напр. векторни вграждания за Embedding Search) в зависимост от избрания метод за търсене.', - splitter: 'Токен', - splitterDescription: + tokenChunkerDescription: 'Разделя текст на фрагменти по дължина на токени с незадължителни разделители и припокриване.', - hierarchicalMergerDescription: + titleChunkerDescription: 'Разделя документи на секции по йерархия на заглавия с regex правила за по-фин контрол.', - hierarchicalMerger: 'Заглавие', extractor: 'Трансформатор', extractorDescription: 'Използва LLM за извличане на структурирани прозрения от фрагменти на документи — като обобщения, класификации и др.', @@ -2135,6 +2133,8 @@ The above is the content you need to summarize.`, image: 'Изображение', email: 'Имейл', 'text&markdown': 'Текст и маркиране', + code: 'Code', + html: 'HTML', word: 'Word', slides: 'PPTX', audio: 'Аудио', diff --git a/web/src/locales/de.ts b/web/src/locales/de.ts index 746598e48e..3770f18629 100644 --- a/web/src/locales/de.ts +++ b/web/src/locales/de.ts @@ -2176,12 +2176,10 @@ Dieser Prozess aggregiert Variablen aus mehreren Zweigen in eine einzelne Variab tokenizerRequired: 'Bitte fügen Sie zuerst den Indexer-Knoten 
hinzu', tokenizerDescription: 'Transformiert Text in die erforderliche Datenstruktur (z.B. Vektoreinbettungen für die Embedding-Suche) abhängig von der gewählten Suchmethode.', - splitter: 'Token', - splitterDescription: + tokenChunkerDescription: 'Teilt Text in Chunks nach Token-Länge mit optionalen Trennzeichen und Überlappung.', - hierarchicalMergerDescription: + titleChunkerDescription: 'Teilt Dokumente in Abschnitte nach Titelhierarchie mit Regex-Regeln für feinere Kontrolle.', - hierarchicalMerger: 'Titel', extractor: 'Transformer', extractorDescription: 'Verwendet ein LLM, um strukturierte Erkenntnisse aus Dokument-Chunks zu extrahieren – wie Zusammenfassungen, Klassifizierungen usw.', @@ -2193,6 +2191,8 @@ Dieser Prozess aggregiert Variablen aus mehreren Zweigen in eine einzelne Variab image: 'Bild', email: 'E-Mail', 'text&markdown': 'Text & Markup', + code: 'Code', + html: 'HTML', word: 'Word', slides: 'PPTX', audio: 'Audio', diff --git a/web/src/locales/en.ts b/web/src/locales/en.ts index e2a342b85c..3a1f19f37d 100644 --- a/web/src/locales/en.ts +++ b/web/src/locales/en.ts @@ -2228,12 +2228,12 @@ This process aggregates variables from multiple branches into a single variable tokenizerRequired: 'Please add the Indexer node first', tokenizerDescription: 'Transforms text into the required data structure (e.g., vector embeddings for Embedding Search) depending on the chosen search method.', - splitter: 'Token', - splitterDescription: + tokenChunker: 'Token Chunker', + tokenChunkerDescription: 'Split text into chunks by token length with optional delimiters and overlap.', - hierarchicalMergerDescription: + titleChunkerDescription: 'Split documents into sections by title hierarchy with regex rules for finer control.', - hierarchicalMerger: 'Title', + titleChunker: 'Title Chunker', extractor: 'Transformer', extractorDescription: 'Use an LLM to extract structured insights from document chunks—such as summaries, classifications, etc.', @@ -2245,6 +2245,8 @@ This 
process aggregates variables from multiple branches into a single variable image: 'Image', email: 'Email', 'text&markdown': 'Text & Markup', + code: 'Code', + html: 'HTML', word: 'Word', slides: 'PPTX', audio: 'Audio', diff --git a/web/src/locales/tr.ts b/web/src/locales/tr.ts index 73890f2367..1bd067cafe 100644 --- a/web/src/locales/tr.ts +++ b/web/src/locales/tr.ts @@ -326,7 +326,7 @@ Prosedürel Bellek: Öğrenilen beceriler, alışkanlıklar ve otomatik prosedü action: 'İşlem', parsingStatus: 'Ayrıştırma durumu', parsingStatusTip: - 'Belge ayrıştırma süresi çeşitli faktörlere bağlıdır. Bilgi Grafiği, RAPTOR, Otomatik Soru Çıkarma veya Otomatik Anahtar Kelime Çıkarma gibi özelliklerin etkinleştirilmesi işlem süresini önemli ölçüde artıracaktır. İlerleme çubuğu durursa, lütfen şu SSS\'ye başvurun: https://ragflow.io/docs/dev/faq#why-does-my-document-parsing-stall-at-under-one-percent.', + "Belge ayrıştırma süresi çeşitli faktörlere bağlıdır. Bilgi Grafiği, RAPTOR, Otomatik Soru Çıkarma veya Otomatik Anahtar Kelime Çıkarma gibi özelliklerin etkinleştirilmesi işlem süresini önemli ölçüde artıracaktır. İlerleme çubuğu durursa, lütfen şu SSS'ye başvurun: https://ragflow.io/docs/dev/faq#why-does-my-document-parsing-stall-at-under-one-percent.", processBeginAt: 'Başlangıç zamanı', processDuration: 'Süre', progressMsg: 'İlerleme', @@ -370,7 +370,7 @@ Prosedürel Bellek: Öğrenilen beceriler, alışkanlıklar ve otomatik prosedü toMessage: 'Bitiş sayfa numarası eksik (hariç)', layoutRecognize: 'PDF ayrıştırıcı', layoutRecognizeTip: - 'PDF düzen analizi için görsel model kullanın. Belge başlıklarını, metin bloklarını, görüntüleri ve tabloları etkili bir şekilde konumlandırır. Naive seçeneği seçilirse yalnızca PDF\'deki düz metin alınır. Bu seçeneğin şu anda YALNIZCA PDF belgeleri için çalıştığını lütfen unutmayın.', + "PDF düzen analizi için görsel model kullanın. Belge başlıklarını, metin bloklarını, görüntüleri ve tabloları etkili bir şekilde konumlandırır. 
Naive seçeneği seçilirse yalnızca PDF'deki düz metin alınır. Bu seçeneğin şu anda YALNIZCA PDF belgeleri için çalıştığını lütfen unutmayın.", taskPageSize: 'Görev sayfa boyutu', taskPageSizeMessage: 'Lütfen görev sayfa boyutunu girin!', taskPageSizeTip: `Düzen tanıma sırasında bir PDF dosyası parçalara bölünür ve işleme hızını artırmak için paralel olarak işlenir. Bu parametre her parçanın boyutunu ayarlar. Daha büyük parça boyutu, sayfalar arasındaki sürekli metni bölme olasılığını azaltır.`, @@ -529,7 +529,8 @@ Prosedürel Bellek: Öğrenilen beceriler, alışkanlıklar ve otomatik prosedü chunkTokenNumberMessage: 'Metin için parça token sayısı gereklidir', embeddingModelTip: `Dataset tarafından kullanılan varsayılan embedding model. Dataset içinde parçalar varken embedding modelini değiştirirken, sistem uyumluluk kontrolü için rastgele birkaç parça örnekler, yeni embedding modeli ile yeniden gömer ve eski ile yeni vektörler arasındaki kosinüs benzerliğini hesaplar. Değiştirmeye ancak örneklerin ortalama benzerliği ≥ 0.9 olduğunda izin verilir. Aksi takdirde, değiştirmeden önce datasetteki tüm parçaları silmeniz gerekir.`, permissionsTip: `'Takım' olarak ayarlandığında, tüm takım üyeleri bu dataset'i yönetebilir.`, - chunkTokenNumberTip: 'Bir parça oluşturmak için token eşiğini belirler. Bu eşiğin altındaki tokenlı bir bölüm, token sayısı eşiği aşana kadar sonraki bölümlerle birleştirilir ve bu noktada bir parça oluşturulur. Eşik aşılsa bile bir sınırlayıcı ile karşılaşılmadıkça yeni parça oluşturulmaz.', + chunkTokenNumberTip: + 'Bir parça oluşturmak için token eşiğini belirler. Bu eşiğin altındaki tokenlı bir bölüm, token sayısı eşiği aşana kadar sonraki bölümlerle birleştirilir ve bu noktada bir parça oluşturulur. 
Eşik aşılsa bile bir sınırlayıcı ile karşılaşılmadıkça yeni parça oluşturulmaz.', chunkMethod: 'Parçalama yöntemi', chunkMethodTip: 'Sağdaki ipuçlarına bakın.', upload: 'Yükle', @@ -654,7 +655,7 @@ Prosedürel Bellek: Öğrenilen beceriler, alışkanlıklar ve otomatik prosedü `, useRaptor: 'RAPTOR', useRaptorTip: - 'RAPTOR çok adımlı soru-cevap görevleri için kullanılabilir. Dosyalar sayfasına gidin, Oluştur > RAPTOR\'a tıklayarak etkinleştirin. Ayrıntılar için bkz. https://ragflow.io/docs/dev/enable_raptor.', + "RAPTOR çok adımlı soru-cevap görevleri için kullanılabilir. Dosyalar sayfasına gidin, Oluştur > RAPTOR'a tıklayarak etkinleştirin. Ayrıntılar için bkz. https://ragflow.io/docs/dev/enable_raptor.", prompt: 'İstem', promptTip: 'Görevi tanımlamak, nasıl yanıt vermesi gerektiğini belirtmek ve diğer çeşitli gereksinimleri belirlemek için sistem istemini kullanın. Sistem istemi genellikle LLM için çeşitli veri girdileri olarak hizmet eden anahtarlarla (değişkenlerle) birlikte kullanılır. Kullanacağınız anahtarları göstermek için eğik çizgi `/` veya (x) düğmesini kullanın.', @@ -709,7 +710,7 @@ Bu otomatik etiketleme özelliği, mevcut datasete alanına özgü bilgi katman resolutionTip: `Varlık tekilleştirme anahtarı. Etkinleştirildiğinde LLM benzer varlıkları birleştirir - örneğin '2025' ve '2025 yılı' veya 'BT' ve 'Bilgi Teknolojisi' - daha doğru bir grafik oluşturmak için`, community: 'Topluluk raporları', communityTip: - 'Bir bilgi grafiğinde, topluluk ilişkilerle bağlı varlıkların bir kümesidir. LLM\'in her topluluk için bir özet oluşturmasını sağlayabilirsiniz, bu topluluk raporu olarak bilinir. Daha fazla bilgi için bkz: https://www.microsoft.com/en-us/research/blog/graphrag-improving-global-search-via-dynamic-community-selection/', + "Bir bilgi grafiğinde, topluluk ilişkilerle bağlı varlıkların bir kümesidir. LLM'in her topluluk için bir özet oluşturmasını sağlayabilirsiniz, bu topluluk raporu olarak bilinir. 
Daha fazla bilgi için bkz: https://www.microsoft.com/en-us/research/blog/graphrag-improving-global-search-via-dynamic-community-selection/", theDocumentBeingParsedCannotBeDeleted: 'Ayrıştırılan belge silinemez', lastWeek: 'geçen haftadan', }, @@ -907,7 +908,8 @@ Bu otomatik etiketleme özelliği, mevcut datasete alanına özgü bilgi katman 'Çok adımlı soru-cevap için alım sırasında belirtilen dataset(lerdeki) bilgi grafikleri kullanılsın mı. Etkinleştirildiğinde varlık, ilişki ve topluluk raporu parçaları üzerinde yinelemeli aramalar içererek alım süresini önemli ölçüde artırır.', keyword: 'Anahtar kelime analizi', keywordTip: `Kullanıcının sorularını analiz etmek, alaka düzeyi hesaplaması sırasında vurgulanacak anahtar kelimeleri çıkarmak için LLM kullanın. Uzun sorgularda iyi çalışır ancak yanıt süresini artırır.`, - languageTip: 'Belirtilen dille cümle yeniden yazmaya izin verir veya seçilmezse en son soruyu varsayılan olarak kullanır.', + languageTip: + 'Belirtilen dille cümle yeniden yazmaya izin verir veya seçilmezse en son soruyu varsayılan olarak kullanır.', avatarHidden: 'Avatarı gizle', locale: 'Yerel ayar', selectLanguage: 'Dil seçin', @@ -1019,7 +1021,8 @@ Bu otomatik etiketleme özelliği, mevcut datasete alanına özgü bilgi katman 'Taranacak Google Drive klasör bağlantıları (virgülle ayrılmış).', gmailPrimaryAdminTip: 'Gmail / Workspace erişimi olan birincil yönetici e-postası, alan kullanıcılarını listeleme ve varsayılan senkronizasyon hesabı olarak kullanılır.', - gmailTokenTip: "Google Console'dan oluşturulan OAuth JSON'ını yükleyin. Yalnızca istemci kimlik bilgilerini içeriyorsa, uzun ömürlü yenileme tokenları oluşturmak için tarayıcı tabanlı doğrulamayı bir kez çalıştırın.", + gmailTokenTip: + "Google Console'dan oluşturulan OAuth JSON'ını yükleyin. 
Yalnızca istemci kimlik bilgilerini içeriyorsa, uzun ömürlü yenileme tokenları oluşturmak için tarayıcı tabanlı doğrulamayı bir kez çalıştırın.", dropboxDescription: "Seçilen bir hesaptan dosya ve klasörleri senkronize etmek için Dropbox'ınızı bağlayın.", bitbucketDescription: @@ -1045,12 +1048,13 @@ Bu otomatik etiketleme özelliği, mevcut datasete alanına özgü bilgi katman imapDescription: 'Bilgi alımı için e-postaları senkronize etmek üzere IMAP posta kutunuza bağlanın.', dropboxAccessTokenTip: - 'Dropbox Uygulama Konsolunda files.metadata.read, files.content.read ve sharing.read kapsamlarıyla uzun ömürlü erişim token\'ı oluşturun.', + "Dropbox Uygulama Konsolunda files.metadata.read, files.content.read ve sharing.read kapsamlarıyla uzun ömürlü erişim token'ı oluşturun.", moodleDescription: "Ders içeriği, forumlar ve kaynakları senkronize etmek için Moodle LMS'nize bağlanın.", - moodleUrlTip: 'Moodle örneğinizin temel URL\'si (örn. https://moodle.university.edu). /webservice veya /login eklemeyin.', + moodleUrlTip: + "Moodle örneğinizin temel URL'si (örn. https://moodle.university.edu). /webservice veya /login eklemeyin.", moodleTokenTip: - 'Moodle\'da bir web servisi token\'ı oluşturun: Site yönetimi → Sunucu → Web servisleri → Tokenları yönet bölümüne gidin. Kullanıcı senkronize etmek istediğiniz kurslara kayıtlı olmalıdır.', + "Moodle'da bir web servisi token'ı oluşturun: Site yönetimi → Sunucu → Web servisleri → Tokenları yönet bölümüne gidin. 
Kullanıcı senkronize etmek istediğiniz kurslara kayıtlı olmalıdır.", seafileDescription: 'SeaFile sunucunuza bağlanarak kitaplıklarınızdaki dosya ve belgeleri senkronize edin.', seafileUrlTip: "Protokol dahil SeaFile sunucunuzun tam URL'si.", @@ -1069,10 +1073,11 @@ Bu otomatik etiketleme özelliği, mevcut datasete alanına özgü bilgi katman seafileValidationDirectoryPathRequired: 'Dizin Yolu gereklidir', seafileSyncScopeTip: 'Nelerin senkronize edileceğini kontrol eder: ' + - '(1) Tüm Hesap - Token\'larınızın erişebildiği tüm kitaplıkları senkronize eder. Hesap API Token\'ı gerektirir. ' + - '(2) Tek Kitaplık - Belirli bir kitaplıktaki tüm dosyaları senkronize eder. Kitaplık Kimliği ve Hesap API Token\'ı veya Kitaplık API Token\'ı gerektirir. ' + - '(3) Belirli Dizin - Bir kitaplık içindeki belirli bir klasördeki dosyaları senkronize eder. Kitaplık Kimliği, klasör yolu ve kimlik doğrulama token\'ı gerektirir.', - seafileTokenTip: 'Hesap düzeyinde SeaFile API token\'ınız. ' + + "(1) Tüm Hesap - Token'larınızın erişebildiği tüm kitaplıkları senkronize eder. Hesap API Token'ı gerektirir. " + + "(2) Tek Kitaplık - Belirli bir kitaplıktaki tüm dosyaları senkronize eder. Kitaplık Kimliği ve Hesap API Token'ı veya Kitaplık API Token'ı gerektirir. " + + "(3) Belirli Dizin - Bir kitaplık içindeki belirli bir klasördeki dosyaları senkronize eder. Kitaplık Kimliği, klasör yolu ve kimlik doğrulama token'ı gerektirir.", + seafileTokenTip: + "Hesap düzeyinde SeaFile API token'ınız. " + 'Hesabınıza görünür tüm kitaplıklara erişim sağlar. ' + 'Senkronizasyon kapsamı "Tüm Hesap" olduğunda gereklidir. 
' + '"Tek Kitaplık" veya "Belirli Dizin" için bu token\'ı veya Kitaplık API Token\'ını kullanabilirsiniz.', @@ -2174,12 +2179,10 @@ Bu otomatik etiketleme özelliği, mevcut datasete alanına özgü bilgi katman tokenizerRequired: 'Lütfen önce İndeksleyici düğümü ekleyin', tokenizerDescription: 'Metni seçilen arama yöntemine bağlı olarak gerekli veri yapısına dönüştürür.', - splitter: 'Token', - splitterDescription: + tokenChunkerDescription: 'Metni isteğe bağlı sınırlayıcılar ve örtüşme ile token uzunluğuna göre parçalara böler.', - hierarchicalMergerDescription: + titleChunkerDescription: 'Belgeleri daha ince kontrol için regex kurallarıyla başlık hiyerarşisine göre bölümlere böler.', - hierarchicalMerger: 'Başlık', extractor: 'Dönüştürücü', extractorDescription: 'Belge parçalarından yapılandırılmış içgörüler çıkarmak için LLM kullanır.', @@ -2191,6 +2194,8 @@ Bu otomatik etiketleme özelliği, mevcut datasete alanına özgü bilgi katman image: 'Görüntü', email: 'E-posta', 'text&markdown': 'Metin ve Biçimlendirme', + code: 'Code', + html: 'HTML', word: 'Word', slides: 'PPTX', audio: 'Ses', diff --git a/web/src/locales/zh.ts b/web/src/locales/zh.ts index 4f9893995c..17878fed74 100644 --- a/web/src/locales/zh.ts +++ b/web/src/locales/zh.ts @@ -1927,17 +1927,30 @@ General:实体和关系提取提示来自 GitHub - microsoft/graphrag:基于 tokenizerRequired: '请先添加Tokenizer节点', tokenizerDescription: '根据所选的搜索方法,将文本转换为所需的数据结构(例如,用于嵌入搜索的向量嵌入)。', - splitter: '按字符分割', - splitterDescription: + tokenChunker: '按 Token 分块', + tokenChunkerDescription: '根据分词器长度将文本拆分成块,并带有可选的分隔符和重叠。', - hierarchicalMergerDescription: + titleChunkerDescription: '使用正则表达式规则按标题层次结构将文档拆分成多个部分,以实现更精细的控制。', - hierarchicalMerger: '按标题分割', + titleChunker: '按标题分块', extractor: '提取器', extractorDescription: '使用 LLM 从文档块(例如摘要、分类等)中提取结构化见解。', outputFormat: '输出格式', fileFormats: '文件类型', + fileFormatOptions: { + pdf: 'PDF', + spreadsheet: '表格', + image: '图片', + email: '邮件', + 'text&markdown': '文本与标记', + code: '代码', + html: 'HTML', + word: 'Word', 
+ slides: 'PPTX', + audio: '音频', + video: '视频', + }, fields: '字段', addParser: '增加解析器', hierarchy: '层次结构', diff --git a/web/src/pages/agent/canvas/index.tsx b/web/src/pages/agent/canvas/index.tsx index ecac6a71dd..a7c79faea2 100644 --- a/web/src/pages/agent/canvas/index.tsx +++ b/web/src/pages/agent/canvas/index.tsx @@ -54,6 +54,7 @@ import { RagNode } from './node'; import { AgentNode } from './node/agent-node'; import { BeginNode } from './node/begin-node'; import { CategorizeNode } from './node/categorize-node'; +import { ChunkerNode } from './node/chunker-node'; import { DataOperationsNode } from './node/data-operations-node'; import { NextStepDropdown } from './node/dropdown/next-step-dropdown'; import { ExitLoopNode } from './node/exit-loop-node'; @@ -69,7 +70,6 @@ import ParserNode from './node/parser-node'; import { PlaceholderNode } from './node/placeholder-node'; import { RetrievalNode } from './node/retrieval-node'; import { RewriteNode } from './node/rewrite-node'; -import { SplitterNode } from './node/splitter-node'; import { SwitchNode } from './node/switch-node'; import TokenizerNode from './node/tokenizer-node'; import { ToolNode } from './node/tool-node'; @@ -95,7 +95,7 @@ export const nodeTypes: NodeTypes = { fileNode: FileNode, parserNode: ParserNode, tokenizerNode: TokenizerNode, - splitterNode: SplitterNode, + chunkerNode: ChunkerNode, contextNode: ExtractorNode, dataOperationsNode: DataOperationsNode, listOperationsNode: ListOperationsNode, diff --git a/web/src/pages/agent/canvas/node/splitter-node.tsx b/web/src/pages/agent/canvas/node/chunker-node.tsx similarity index 90% rename from web/src/pages/agent/canvas/node/splitter-node.tsx rename to web/src/pages/agent/canvas/node/chunker-node.tsx index 955764e377..81cd6db08a 100644 --- a/web/src/pages/agent/canvas/node/splitter-node.tsx +++ b/web/src/pages/agent/canvas/node/chunker-node.tsx @@ -11,7 +11,7 @@ import { NodeWrapper } from './node-wrapper'; import { ToolBar } from './toolbar'; type 
RagNodeProps = NodeProps & PropsWithChildren; -function InnerSplitterNode({ +function InnerChunkerNode({ id, data, isConnectable = true, @@ -47,7 +47,7 @@ function InnerSplitterNode({ id={id} name={'Chunker'} label={data.label} - icon={} + icon={} > {data.name} @@ -55,4 +55,4 @@ function InnerSplitterNode({ ); } -export const SplitterNode = memo(InnerSplitterNode); +export const ChunkerNode = memo(InnerChunkerNode); diff --git a/web/src/pages/agent/canvas/node/dropdown/accordion-operators.tsx b/web/src/pages/agent/canvas/node/dropdown/accordion-operators.tsx index e48d8ee5ee..c80f62e1e1 100644 --- a/web/src/pages/agent/canvas/node/dropdown/accordion-operators.tsx +++ b/web/src/pages/agent/canvas/node/dropdown/accordion-operators.tsx @@ -176,8 +176,8 @@ export function PipelineAccordionOperators({ const chunkerOperators = useMemo(() => { return [ ...restrictSingleOperatorOnCanvas([ - Operator.Splitter, - Operator.HierarchicalMerger, + Operator.TokenChunker, + Operator.TitleChunker, ]), ]; }, [restrictSingleOperatorOnCanvas]); diff --git a/web/src/pages/agent/constant/index.tsx b/web/src/pages/agent/constant/index.tsx index 835b850877..d5cbd1980e 100644 --- a/web/src/pages/agent/constant/index.tsx +++ b/web/src/pages/agent/constant/index.tsx @@ -684,8 +684,8 @@ export const RestrictedUpstreamMap = { [Operator.VariableAssigner]: [Operator.Begin], [Operator.VariableAggregator]: [Operator.Begin], [Operator.Parser]: [Operator.Begin], // pipeline - [Operator.Splitter]: [Operator.Begin], - [Operator.HierarchicalMerger]: [Operator.Begin], + [Operator.TokenChunker]: [Operator.Begin], + [Operator.TitleChunker]: [Operator.Begin], [Operator.Tokenizer]: [Operator.Begin], [Operator.Extractor]: [Operator.Begin], [Operator.File]: [Operator.Begin], @@ -732,8 +732,8 @@ export const NodeMap = { [Operator.File]: 'fileNode', [Operator.Parser]: 'parserNode', [Operator.Tokenizer]: 'tokenizerNode', - [Operator.Splitter]: 'splitterNode', - [Operator.HierarchicalMerger]: 'splitterNode', + 
[Operator.TokenChunker]: 'chunkerNode', + [Operator.TitleChunker]: 'chunkerNode', [Operator.Extractor]: 'contextNode', [Operator.DataOperations]: 'dataOperationsNode', [Operator.ListOperations]: 'listOperationsNode', @@ -775,8 +775,8 @@ export const NoDebugOperatorsList = [ Operator.File, Operator.Parser, Operator.Tokenizer, - Operator.Splitter, - Operator.HierarchicalMerger, + Operator.TokenChunker, + Operator.TitleChunker, Operator.Extractor, Operator.Tool, ]; @@ -785,8 +785,8 @@ export const NoCopyOperatorsList = [ Operator.File, Operator.Parser, Operator.Tokenizer, - Operator.Splitter, - Operator.HierarchicalMerger, + Operator.TokenChunker, + Operator.TitleChunker, Operator.Extractor, ]; diff --git a/web/src/pages/agent/constant/pipeline.tsx b/web/src/pages/agent/constant/pipeline.tsx index 31cd0cbec2..f5996660e8 100644 --- a/web/src/pages/agent/constant/pipeline.tsx +++ b/web/src/pages/agent/constant/pipeline.tsx @@ -10,6 +10,8 @@ export enum FileType { Image = 'image', Email = 'email', TextMarkdown = 'text&markdown', + Code = 'code', + Html = 'html', Docx = 'word', PowerPoint = 'slides', Video = 'video', @@ -62,6 +64,8 @@ export const OutputFormatMap = { [FileType.Image]: ImageOutputFormat, [FileType.Email]: EmailOutputFormat, [FileType.TextMarkdown]: TextMarkdownOutputFormat, + [FileType.Code]: TextMarkdownOutputFormat, + [FileType.Html]: TextMarkdownOutputFormat, [FileType.Docx]: DocxOutputFormat, [FileType.PowerPoint]: PptOutputFormat, [FileType.Video]: VideoOutputFormat, @@ -74,6 +78,8 @@ export const InitialOutputFormatMap = { [FileType.Image]: ImageOutputFormat.Text, [FileType.Email]: EmailOutputFormat.Text, [FileType.TextMarkdown]: TextMarkdownOutputFormat.Text, + [FileType.Code]: TextMarkdownOutputFormat.Text, + [FileType.Html]: TextMarkdownOutputFormat.Text, [FileType.Docx]: DocxOutputFormat.Json, [FileType.PowerPoint]: PptOutputFormat.Json, [FileType.Video]: VideoOutputFormat.Text, @@ -208,6 +214,16 @@ export const initialParserValues = { 
output_format: TextMarkdownOutputFormat.Text, preprocess: PreprocessValue.main_content, }, + { + fileFormat: FileType.Code, + output_format: TextMarkdownOutputFormat.Text, + preprocess: PreprocessValue.main_content, + }, + { + fileFormat: FileType.Html, + output_format: TextMarkdownOutputFormat.Text, + preprocess: PreprocessValue.main_content, + }, { fileFormat: FileType.Docx, output_format: DocxOutputFormat.Json, @@ -222,7 +238,7 @@ export const initialParserValues = { ], }; -export const initialSplitterValues = { +export const initialTokenChunkerValues = { outputs: { chunks: { type: 'Array', value: [] }, }, @@ -239,18 +255,74 @@ export enum Hierarchy { H4 = '4', H5 = '5', } - -export const initialHierarchicalMergerValues = { +const rules = [ + { + // levels: [ + // { expression: '^#[^#]' }, + // { expression: '^##[^#]' }, + // { expression: '^###[^#]' }, + // { expression: '^####[^#]' }, + // ], + levels: [ + { expression: '^#[^#]' }, + { expression: '^##[^#]' }, + { expression: '^###[^#]' }, + { expression: '^####[^#]' }, + ], + }, + { + levels: [ + { expression: '第[零一二三四五六七八九十百0-9]+(分?编|部分)' }, + { expression: '第[零一二三四五六七八九十百0-9]+章' }, + { expression: '第[零一二三四五六七八九十百0-9]+节' }, + { expression: '第[零一二三四五六七八九十百0-9]+条' }, + { expression: '[\\((][零一二三四五六七八九十百]+[\\))]' }, + ], + }, + { + levels: [ + { expression: '第[0-9]+章' }, + { expression: '第[0-9]+节' }, + { expression: '[0-9]{1,2}[\\. 
、]' }, + { expression: '[0-9]{1,2}\\.[0-9]{1,2}($|[^a-zA-Z/%~.-])' }, + { expression: '[0-9]{1,2}\\.[0-9]{1,2}\\.[0-9]{1,2}' }, + ], + }, + { + levels: [ + { expression: '第[零一二三四五六七八九十百0-9]+章' }, + { expression: '第[零一二三四五六七八九十百0-9]+节' }, + { expression: '[零一二三四五六七八九十百]+[ 、]' }, + { expression: '[\\((][零一二三四五六七八九十百]+[\\))]' }, + { expression: '[\\((][0-9]{,2}[\\))]' }, + ], + }, + { + levels: [ + { + expression: 'PART (ONE|TWO|THREE|FOUR|FIVE|SIX|SEVEN|EIGHT|NINE|TEN)', + }, + { expression: 'Chapter (I+V?|VI*|XI|IX|X)' }, + { expression: 'Section [0-9]+' }, + { expression: 'Article [0-9]+' }, + ], + }, +]; +export const initialTitleChunkerValues = { outputs: { chunks: { type: 'Array', value: [] }, }, + method: 'hierarchy', hierarchy: Hierarchy.H3, - levels: [ - { expressions: [{ expression: '^#[^#]' }] }, - { expressions: [{ expression: '^##[^#]' }] }, - { expressions: [{ expression: '^###[^#]' }] }, - { expressions: [{ expression: '^####[^#]' }] }, - ], + include_heading_content: false, + rules: rules, +}; + +export const initialGroupValues = { + method: 'group', + hierarchy: '0', + include_heading_content: false, + rules: rules, }; export const initialExtractorValues = { @@ -269,6 +341,22 @@ export const FileTypeSuffixMap = { [FileType.Image]: ['jpg', 'jpeg', 'png', 'gif'], [FileType.Email]: ['eml', 'msg'], [FileType.TextMarkdown]: ['md', 'markdown', 'mdx', 'txt'], + [FileType.Code]: [ + 'py', + 'js', + 'java', + 'c', + 'cpp', + 'h', + 'php', + 'go', + 'ts', + 'sh', + 'cs', + 'kt', + 'sql', + ], + [FileType.Html]: ['htm', 'html'], [FileType.Docx]: ['doc', 'docx'], [FileType.PowerPoint]: ['pptx', 'ppt'], [FileType.Video]: ['mp4', 'avi', 'mkv'], @@ -293,7 +381,7 @@ export const FileTypeSuffixMap = { export const SingleOperators = [ Operator.Tokenizer, - Operator.Splitter, - Operator.HierarchicalMerger, + Operator.TokenChunker, + Operator.TitleChunker, Operator.Parser, ]; diff --git a/web/src/pages/agent/form-sheet/form-config-map.tsx 
b/web/src/pages/agent/form-sheet/form-config-map.tsx index 5ff04ce28c..7fe720885f 100644 --- a/web/src/pages/agent/form-sheet/form-config-map.tsx +++ b/web/src/pages/agent/form-sheet/form-config-map.tsx @@ -14,7 +14,6 @@ import ExtractorForm from '../form/extractor-form'; import GithubForm from '../form/github-form'; import GoogleForm from '../form/google-form'; import GoogleScholarForm from '../form/google-scholar-form'; -import HierarchicalMergerForm from '../form/hierarchical-merger-form'; import InvokeForm from '../form/invoke-form'; import IterationForm from '../form/iteration-form'; import IterationStartForm from '../form/iteration-start-from'; @@ -27,11 +26,12 @@ import PubMedForm from '../form/pubmed-form'; import RetrievalForm from '../form/retrieval-form/next'; import RewriteQuestionForm from '../form/rewrite-question-form'; import SearXNGForm from '../form/searxng-form'; -import SplitterForm from '../form/splitter-form'; import StringTransformForm from '../form/string-transform-form'; import SwitchForm from '../form/switch-form'; import TavilyExtractForm from '../form/tavily-extract-form'; import TavilyForm from '../form/tavily-form'; +import TitleChunkerForm from '../form/title-chunker-form'; +import TokenChunkerForm from '../form/token-chunker-form'; import TokenizerForm from '../form/tokenizer-form'; import ToolForm from '../form/tool-form'; import UserFillUpForm from '../form/user-fill-up-form'; @@ -154,11 +154,11 @@ export const FormConfigMap = { [Operator.Tokenizer]: { component: TokenizerForm, }, - [Operator.Splitter]: { - component: SplitterForm, + [Operator.TokenChunker]: { + component: TokenChunkerForm, }, - [Operator.HierarchicalMerger]: { - component: HierarchicalMergerForm, + [Operator.TitleChunker]: { + component: TitleChunkerForm, }, [Operator.Extractor]: { component: ExtractorForm, diff --git a/web/src/pages/agent/form/hierarchical-merger-form/index.tsx b/web/src/pages/agent/form/hierarchical-merger-form/index.tsx deleted file mode 100644 
index 0083b92a4f..0000000000 --- a/web/src/pages/agent/form/hierarchical-merger-form/index.tsx +++ /dev/null @@ -1,191 +0,0 @@ -import { SelectWithSearch } from '@/components/originui/select-with-search'; -import { RAGFlowFormItem } from '@/components/ragflow-form'; -import { BlockButton, Button } from '@/components/ui/button'; -import { Card, CardContent, CardHeader } from '@/components/ui/card'; -import { Form, FormLabel } from '@/components/ui/form'; -import { Input } from '@/components/ui/input'; -import { zodResolver } from '@hookform/resolvers/zod'; -import { Plus, Trash2 } from 'lucide-react'; -import { memo } from 'react'; -import { useFieldArray, useForm, useFormContext } from 'react-hook-form'; -import { useTranslation } from 'react-i18next'; -import { z } from 'zod'; -import { - Hierarchy, - initialHierarchicalMergerValues, -} from '../../constant/pipeline'; -import { useFormValues } from '../../hooks/use-form-values'; -import { useWatchFormChange } from '../../hooks/use-watch-form-change'; -import { INextOperatorForm } from '../../interface'; -import { buildOutputList } from '../../utils/build-output-list'; -import { FormWrapper } from '../components/form-wrapper'; -import { Output } from '../components/output'; - -const outputList = buildOutputList(initialHierarchicalMergerValues.outputs); - -const HierarchyOptions = [ - { label: 'H1', value: Hierarchy.H1 }, - { label: 'H2', value: Hierarchy.H2 }, - { label: 'H3', value: Hierarchy.H3 }, - { label: 'H4', value: Hierarchy.H4 }, - { label: 'H5', value: Hierarchy.H5 }, -]; - -export const FormSchema = z.object({ - hierarchy: z.string(), - levels: z.array( - z.object({ - expressions: z.array( - z.object({ - expression: z.string().refine( - (val) => { - try { - // Try converting the string to a RegExp - new RegExp(val); - return true; - } catch { - return false; - } - }, - { - message: 'Must be a valid regular expression string', - }, - ), - }), - ), - }), - ), -}); - -export type 
HierarchicalMergerFormSchemaType = z.infer; - -type RegularExpressionsProps = { - index: number; - parentName: string; - removeParent: (index: number) => void; - isLatest: boolean; -}; - -export function RegularExpressions({ - index, - parentName, - isLatest, - removeParent, -}: RegularExpressionsProps) { - const { t } = useTranslation(); - const form = useFormContext(); - - const name = `${parentName}.${index}.expressions`; - - const { fields, append, remove } = useFieldArray({ - name: name, - control: form.control, - }); - - return ( - - - H{index + 1} - {isLatest && ( - - )} - - - - {t('flow.regularExpressions')} - -
- {fields.map((field, index) => ( -
-
- - - -
- {index === 0 ? ( - - ) : ( - - )} -
- ))} -
-
-
- ); -} - -const HierarchicalMergerForm = ({ node }: INextOperatorForm) => { - const { t } = useTranslation(); - const defaultValues = useFormValues(initialHierarchicalMergerValues, node); - - const form = useForm({ - defaultValues, - resolver: zodResolver(FormSchema), - mode: 'onChange', - }); - - const name = 'levels'; - - const { fields, append, remove } = useFieldArray({ - name: name, - control: form.control, - }); - - useWatchFormChange(node?.id, form); - - return ( -
- - - - - {fields.map((field, index) => ( -
-
- -
-
- ))} - {fields.length < 5 && ( - append({ expressions: [{ expression: '' }] })} - > - {t('common.add')} - - )} -
-
- -
-
- ); -}; - -export default memo(HierarchicalMergerForm); diff --git a/web/src/pages/agent/form/parser-form/common-form-fields.tsx b/web/src/pages/agent/form/parser-form/common-form-fields.tsx index d26e518199..3d75306306 100644 --- a/web/src/pages/agent/form/parser-form/common-form-fields.tsx +++ b/web/src/pages/agent/form/parser-form/common-form-fields.tsx @@ -9,6 +9,7 @@ import { SelectWithSearchFlagOptionType, } from '@/components/originui/select-with-search'; import { RAGFlowFormItem } from '@/components/ragflow-form'; +import { Checkbox } from '@/components/ui/checkbox'; import { upperCase, upperFirst } from 'lodash'; import { useTranslation } from 'react-i18next'; import { @@ -80,12 +81,55 @@ export function LargeModelFormField({ }: CommonProps & Pick) { return ( ); } +export function TwoColumnCheckFormField({ prefix }: CommonProps) { + const { t } = useTranslation(); + return ( + + {(field) => ( + { + field.onChange?.(checked); + }} + /> + )} + + ); +} + +export function RmdirFormField({ prefix }: CommonProps) { + const { t } = useTranslation(); + return ( + + {(field) => ( + { + field.onChange?.(checked); + }} + /> + )} + + ); +} + export function LanguageFormField({ prefix }: CommonProps) { const { t } = useTranslation(); diff --git a/web/src/pages/agent/form/parser-form/index.tsx b/web/src/pages/agent/form/parser-form/index.tsx index a1808db4b6..d3f78b1cd7 100644 --- a/web/src/pages/agent/form/parser-form/index.tsx +++ b/web/src/pages/agent/form/parser-form/index.tsx @@ -39,8 +39,13 @@ import { ImageFormFields } from './image-form-fields'; import { PdfFormFields } from './pdf-form-fields'; import { PptFormFields } from './ppt-form-fields'; import { SpreadsheetFormFields } from './spreadsheet-form-fields'; +import { + HtmlFormFields, + TextMarkdownFormFields, +} from './text-html-form-fields'; import { buildFieldNameWithPrefix } from './utils'; import { AudioFormFields, VideoFormFields } from './video-form-fields'; +import { WordFormFields } from 
'./word-form-fields'; const outputList = buildOutputList(initialParserValues.outputs); @@ -72,6 +77,11 @@ const PreprocessOptionConfigsMap: Partial< { value: MAIN_CONTENT_PREPROCESS_VALUE, required: true }, { value: PreprocessValue.section_title }, ], + [FileType.Code]: [{ value: MAIN_CONTENT_PREPROCESS_VALUE, required: true }], + [FileType.Html]: [ + { value: MAIN_CONTENT_PREPROCESS_VALUE, required: true }, + { value: PreprocessValue.section_title }, + ], [FileType.Docx]: [ { value: MAIN_CONTENT_PREPROCESS_VALUE, required: true }, { value: PreprocessValue.section_title }, @@ -116,10 +126,13 @@ const FileFormatWidgetMap = { [FileType.PDF]: PdfFormFields, [FileType.Spreadsheet]: SpreadsheetFormFields, [FileType.PowerPoint]: PptFormFields, + [FileType.Docx]: WordFormFields, [FileType.Video]: VideoFormFields, [FileType.Audio]: AudioFormFields, [FileType.Email]: EmailFormFields, [FileType.Image]: ImageFormFields, + [FileType.TextMarkdown]: TextMarkdownFormFields, + [FileType.Html]: HtmlFormFields, }; type ParserItemProps = { @@ -139,10 +152,12 @@ export const FormSchema = z.object({ parse_method: z.string().optional(), lang: z.string().optional(), fields: z.array(z.string()).optional(), - llm_id: z.string().optional(), + vlm: z.object({ llm_id: z.string().optional() }).optional(), system_prompt: z.string().optional(), table_result_type: z.string().optional(), markdown_image_response_type: z.string().optional(), + enable_multi_column: z.boolean().optional(), + remove_toc: z.boolean().optional(), }), ), }); diff --git a/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx b/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx index 82c976f0f4..33366a9bfc 100644 --- a/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx +++ b/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx @@ -4,11 +4,19 @@ import { SelectWithSearchFlagOptionType, } from '@/components/originui/select-with-search'; import { RAGFlowFormItem } from '@/components/ragflow-form'; +import { 
LlmModelType } from '@/constants/knowledge'; +import { useComposeLlmOptionsByModelTypes } from '@/hooks/use-llm-request'; import { isEmpty } from 'lodash'; import { useEffect, useMemo } from 'react'; import { useFormContext, useWatch } from 'react-hook-form'; import { useTranslation } from 'react-i18next'; -import { LanguageFormField, ParserMethodFormField } from './common-form-fields'; +import { + LanguageFormField, + LargeModelFormField, + ParserMethodFormField, + RmdirFormField, + TwoColumnCheckFormField, +} from './common-form-fields'; import { CommonProps } from './interface'; import { useSetInitialLanguage } from './use-set-initial-language'; import { buildFieldNameWithPrefix } from './utils'; @@ -28,7 +36,9 @@ export function PdfFormFields({ prefix }: CommonProps) { const form = useFormContext(); const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix); - + const modelOptions = useComposeLlmOptionsByModelTypes([ + LlmModelType.Image2text, + ]); const parseMethod = useWatch({ name: parseMethodName, }); @@ -88,7 +98,14 @@ export function PdfFormFields({ prefix }: CommonProps) { return ( <> + + + + {languageShown && } {tcadpOptionsShown && ( <> diff --git a/web/src/pages/agent/form/parser-form/spreadsheet-form-fields.tsx b/web/src/pages/agent/form/parser-form/spreadsheet-form-fields.tsx index 4071509917..091de1d19a 100644 --- a/web/src/pages/agent/form/parser-form/spreadsheet-form-fields.tsx +++ b/web/src/pages/agent/form/parser-form/spreadsheet-form-fields.tsx @@ -4,11 +4,16 @@ import { SelectWithSearchFlagOptionType, } from '@/components/originui/select-with-search'; import { RAGFlowFormItem } from '@/components/ragflow-form'; +import { LlmModelType } from '@/constants/knowledge'; +import { useComposeLlmOptionsByModelTypes } from '@/hooks/use-llm-request'; import { isEmpty } from 'lodash'; import { useEffect, useMemo } from 'react'; import { useFormContext, useWatch } from 'react-hook-form'; import { useTranslation } from 'react-i18next'; 
-import { ParserMethodFormField } from './common-form-fields'; +import { + LargeModelFormField, + ParserMethodFormField, +} from './common-form-fields'; import { CommonProps } from './interface'; import { buildFieldNameWithPrefix } from './utils'; @@ -25,6 +30,9 @@ const markdownImageResponseTypeOptions: SelectWithSearchFlagOptionType[] = [ export function SpreadsheetFormFields({ prefix }: CommonProps) { const { t } = useTranslation(); const form = useFormContext(); + const modelOptions = useComposeLlmOptionsByModelTypes([ + LlmModelType.Image2text, + ]); const parseMethodName = buildFieldNameWithPrefix('parse_method', prefix); @@ -89,6 +97,10 @@ export function SpreadsheetFormFields({ prefix }: CommonProps) { prefix={prefix} optionsWithoutLLM={optionsWithoutLLM} > + {tcadpOptionsShown && ( <> + + + + ); +} + +export function HtmlFormFields({ prefix }: CommonProps) { + return ; +} diff --git a/web/src/pages/agent/form/parser-form/word-form-fields.tsx b/web/src/pages/agent/form/parser-form/word-form-fields.tsx new file mode 100644 index 0000000000..1db5783b6f --- /dev/null +++ b/web/src/pages/agent/form/parser-form/word-form-fields.tsx @@ -0,0 +1,24 @@ +import { LlmModelType } from '@/constants/knowledge'; +import { useComposeLlmOptionsByModelTypes } from '@/hooks/use-llm-request'; +import { + LargeModelFormField, + OutputFormatFormFieldProps, + RmdirFormField, +} from './common-form-fields'; + +export function WordFormFields({ prefix }: OutputFormatFormFieldProps) { + const modelOptions = useComposeLlmOptionsByModelTypes([ + LlmModelType.Image2text, + ]); + + return ( + <> + + {/* Multimodal Model */} + + + ); +} diff --git a/web/src/pages/agent/form/title-chunker-form/hook.ts b/web/src/pages/agent/form/title-chunker-form/hook.ts new file mode 100644 index 0000000000..fca7ce9093 --- /dev/null +++ b/web/src/pages/agent/form/title-chunker-form/hook.ts @@ -0,0 +1,197 @@ +import { isEmpty } from 'lodash'; +import { useEffect, useMemo } from 'react'; +import { 
UseFormReturn, useWatch } from 'react-hook-form'; +import { useTranslation } from 'react-i18next'; +import { TitleChunkerFormSchemaType } from '.'; +import { Hierarchy, initialTitleChunkerValues } from '../../constant/pipeline'; + +// type initialValuesType = typeof initialHierarchicalMergerValues; + +function transformLevelsToRules(levels: any[]) { + if (!Array.isArray(levels)) { + return initialTitleChunkerValues.rules; + } + + return levels + .map((levelGroup) => { + if (Array.isArray(levelGroup)) { + const filteredExpressions = levelGroup.filter( + (expression: string) => expression && expression.trim() !== '', + ); + if (filteredExpressions.length === 0) { + return null; + } + return { + levels: filteredExpressions.map((expression: string) => ({ + expression, + })), + }; + } + return { levels: [{ expression: '' }] }; + }) + .filter((rule) => rule !== null); +} + +function filterEmptyRules(rules: any[]) { + if (!Array.isArray(rules)) { + return []; + } + + return rules + .map((rule) => { + if (!rule || !Array.isArray(rule.levels)) { + return null; + } + const filteredLevels = rule.levels.filter( + (level: any) => level.expression && level.expression.trim() !== '', + ); + if (filteredLevels.length === 0) { + return null; + } + return { levels: filteredLevels }; + }) + .filter((rule) => rule !== null); +} + +// function isRulesFormatCorrect(rules: any): boolean { +// if (!rules || !Array.isArray(rules)) { +// return false; +// } +// if (rules.length === 0) { +// return false; +// } +// if (!rules[0] || typeof rules[0] !== 'object') { +// return false; +// } +// if (!Array.isArray(rules[0].levels)) { +// return false; +// } +// return true; +// } + +function transformApiResponseToForm( + apiData: Record, +): TitleChunkerFormSchemaType { + if (!apiData) { + return apiData; + } + + if (isEmpty(apiData)) { + return apiData as TitleChunkerFormSchemaType; + } + + const method = apiData.method as 'hierarchy' | 'group'; + + let hierarchy = apiData.hierarchy; + if (typeof 
hierarchy === 'number') { + hierarchy = String(hierarchy); + } + if (method === 'group' && !hierarchy) { + hierarchy = '0'; + } + + let rules = apiData.rules; + const hasLevelsData = apiData.levels && Array.isArray(apiData.levels); + + if (hasLevelsData) { + rules = transformLevelsToRules(apiData.levels); + } else if (rules && Array.isArray(rules)) { + rules = filterEmptyRules(rules); + } + + // const rulesFormatCorrect = isRulesFormatCorrect(rules); + + // if (method === 'group') { + // if (rulesFormatCorrect) { + // return { + // method, + // hierarchy, + // rules, + // }; + // } + // return { + // method, + // hierarchy, + // rules, + // }; + // } + + // if (rulesFormatCorrect && method === 'hierarchy') { + // return { + // method, + // hierarchy, + // rules, + // }; + // } + + return { + method, + hierarchy, + include_heading_content: Boolean(apiData.include_heading_content), + rules, + }; +} + +type HierarchyOption = { + label: string; + value: string; +}; + +function getDynamicHierarchyOptions(maxLevel: number): HierarchyOption[] { + if (maxLevel < 1) { + maxLevel = 1; + } + return Array.from({ length: maxLevel }, (_, i) => ({ + label: `H${i + 1}`, + value: String(i + 1) as Hierarchy, + })); +} + +function calculateMaxLevelCount( + rules: Array<{ levels: Array<{ expression: string }> }>, +): number { + if (!rules || rules.length === 0) { + return 1; + } + return Math.max(...rules.map((rule) => rule.levels.length), 1); +} + +export function useDynamicHierarchyOptions( + form: UseFormReturn, + name: string, +): HierarchyOption[] { + const { t } = useTranslation(); + const rules = useWatch({ name, control: form?.control }); + const method = useWatch({ name: 'method', control: form?.control }); + const currentHierarchy = form.watch('hierarchy'); + + const hierarchyOptions = useMemo(() => { + const maxLevelCount = calculateMaxLevelCount(rules); + const options = getDynamicHierarchyOptions(maxLevelCount); + + if (method === 'group') { + return [ + { label: 
t('common.automatic', 'Automatic'), value: '0' }, + ...options, + ]; + } + + return options; + }, [method, rules, t]); + + useEffect(() => { + if (!currentHierarchy || !form) { + return; + } + + const maxOptionValue = hierarchyOptions[hierarchyOptions.length - 1]?.value; + + if (maxOptionValue && currentHierarchy > maxOptionValue) { + form.setValue('hierarchy', maxOptionValue); + } + }, [currentHierarchy, hierarchyOptions, form]); + + return hierarchyOptions; +} + +export { transformApiResponseToForm }; diff --git a/web/src/pages/agent/form/title-chunker-form/index.tsx b/web/src/pages/agent/form/title-chunker-form/index.tsx new file mode 100644 index 0000000000..2a974a9067 --- /dev/null +++ b/web/src/pages/agent/form/title-chunker-form/index.tsx @@ -0,0 +1,376 @@ +import { FormFieldType, RenderField } from '@/components/dynamic-form'; +import { SelectWithSearch } from '@/components/originui/select-with-search'; +import { RAGFlowFormItem } from '@/components/ragflow-form'; +import { BlockButton, Button } from '@/components/ui/button'; +import { Card, CardContent, CardHeader } from '@/components/ui/card'; +import { Checkbox } from '@/components/ui/checkbox'; +import { Form } from '@/components/ui/form'; +import { Input } from '@/components/ui/input'; +import { zodResolver } from '@hookform/resolvers/zod'; +import { Trash2 } from 'lucide-react'; +import { memo, useEffect, useRef } from 'react'; +import { useFieldArray, useForm, useFormContext } from 'react-hook-form'; +import { useTranslation } from 'react-i18next'; +import { z } from 'zod'; +import { + Hierarchy, + initialGroupValues, + initialTitleChunkerValues, +} from '../../constant/pipeline'; +import { useFormValues } from '../../hooks/use-form-values'; +import { useWatchFormChange } from '../../hooks/use-watch-form-change'; +import { INextOperatorForm } from '../../interface'; +import { buildOutputList } from '../../utils/build-output-list'; +import { FormWrapper } from '../components/form-wrapper'; +import { 
Output } from '../components/output'; +import { transformApiResponseToForm, useDynamicHierarchyOptions } from './hook'; + +type FormModeValues = { + hierarchy?: string; + include_heading_content?: boolean; + rules: Array<{ levels: Array<{ expression: string }> }>; +}; + +const outputList = buildOutputList(initialTitleChunkerValues.outputs); + +const rulesSchema = z.array( + z.object({ + levels: z.array( + z.object({ + expression: z.string().refine( + (val) => { + try { + new RegExp(val); + return true; + } catch { + return false; + } + }, + { + message: 'Must be a valid regular expression string', + }, + ), + }), + ), + }), +); + +export const FormSchema = z.object({ + method: z.enum(['hierarchy', 'group']), + hierarchy: z.string().optional(), + include_heading_content: z.boolean().optional(), + rules: rulesSchema, +}); + +export type TitleChunkerFormSchemaType = z.infer; + +type LevelItemProps = { + index: number; + parentName: string; + removeParent: (index: number) => void; + isLatest: boolean; +}; + +function LevelItem({ + index, + parentName, + isLatest, + removeParent, +}: LevelItemProps) { + const { t } = useTranslation(); + + const name = `${parentName}.${index}.expression`; + + return ( +
+
+ + + +
+ {isLatest && index > 0 && ( + + )} +
+ ); +} + +type CardBodyProps = { + cardIndex: number; + cardName: string; +}; + +function CardBody({ cardName }: CardBodyProps) { + const { t } = useTranslation(); + const form = useFormContext(); + + const levelsName = `${cardName}.levels`; + + const { + fields: levelFields, + append: appendLevel, + remove: removeLevel, + } = useFieldArray({ + name: levelsName, + control: form.control, + }); + + return ( + +
+ {levelFields.map((levelField, levelIndex) => ( + + ))} +
+ + appendLevel({ expression: '' })} + className="mt-4" + > + {t('flow.addLevel', 'Add Level')} + +
+ ); +} + +// type GroupCardBodyProps = { +// cardName: string; +// }; + +// function GroupCardBody({ cardName }: GroupCardBodyProps) { +// const { t } = useTranslation(); +// const form = useFormContext(); + +// const levelsName = `${cardName}.levels`; + +// const { fields: levelFields } = useFieldArray({ +// name: levelsName, +// control: form.control, +// }); + +// return ( +// +//
+// {levelFields.map((levelField, levelIndex) => ( +// +// +// +// ))} +//
+//
+// ); +// } + +const TitleChunkerForm = ({ node }: INextOperatorForm) => { + const { t } = useTranslation(); + const initialValues = useFormValues(initialTitleChunkerValues, node); + + const hierarchyModeValues = useRef(null); + const groupValues = useRef(null); + + const form = useForm({ + defaultValues: transformApiResponseToForm(initialValues), + resolver: zodResolver(FormSchema), + mode: 'onChange', + }); + const isInitialized = useRef(false); + const initialMode = useRef(undefined); + + const method = form.watch('method'); + const name = 'rules'; + const hierarchyOptions = useDynamicHierarchyOptions(form, name); + + useEffect(() => { + if (!isInitialized.current) { + initialMode.current = method; + isInitialized.current = true; + return; + } + + if (method !== initialMode.current) { + const currentMode = initialMode.current; + const hierarchyValue = form.getValues('hierarchy'); + const rulesValue = form.getValues('rules'); + + if (currentMode === 'hierarchy') { + hierarchyModeValues.current = { + hierarchy: hierarchyValue, + include_heading_content: form.getValues('include_heading_content'), + rules: rulesValue, + }; + } else if (currentMode === 'group') { + groupValues.current = { + hierarchy: hierarchyValue, + include_heading_content: form.getValues('include_heading_content'), + rules: rulesValue, + }; + } + + initialMode.current = method; + + if (method === 'group') { + const modeValues = groupValues.current; + form.reset({ + method: 'group', + hierarchy: modeValues?.hierarchy ?? 
'0', + include_heading_content: false, + rules: modeValues?.rules || initialGroupValues.rules, + }); + } else { + const defaultHierarchy = Hierarchy.H3; + let modeValues: FormModeValues | null = null; + modeValues = hierarchyModeValues.current; + if (modeValues) { + form.reset({ + method: method, + hierarchy: modeValues.hierarchy || defaultHierarchy, + include_heading_content: + modeValues.include_heading_content || false, + rules: modeValues.rules, + }); + } else { + const newModeValues: FormModeValues = { + hierarchy: defaultHierarchy, + include_heading_content: false, + rules: JSON.parse(JSON.stringify(initialTitleChunkerValues.rules)), + }; + + form.reset({ + method: method, + hierarchy: defaultHierarchy, + include_heading_content: newModeValues.include_heading_content, + rules: newModeValues.rules, + }); + } + } + } + }, [method, form]); + + const { fields, append, remove } = useFieldArray({ + name: name, + control: form.control, + }); + + useWatchFormChange(node?.id, form); + + return ( +
+ + + + + + {method === 'hierarchy' && ( + + {(field) => ( + { + field.onChange?.(checked); + }} + /> + )} + + )} + {/* {method === 'group' ? ( + + + + {t('flow.rule', 'Rule')} 1 + + + + + ) : ( */} +
+ {fields.map((cardField, cardIndex) => ( + + +
+ + {t('flow.rule', 'Rule')} {cardIndex + 1} + +
+ {fields.length > 1 && ( + + )} +
+ +
+ ))} +
+ {/* )} */} + {/* {method !== 'group' && ( */} + + append({ + levels: [{ expression: '' }], + }) + } + className="mt-4" + > + {t('flow.rule', 'Add Rule')} + + {/* )} */} +
+
+ +
+
+ ); +}; + +export default memo(TitleChunkerForm); diff --git a/web/src/pages/agent/form/splitter-form/index.tsx b/web/src/pages/agent/form/token-chunker-form/index.tsx similarity index 55% rename from web/src/pages/agent/form/splitter-form/index.tsx rename to web/src/pages/agent/form/token-chunker-form/index.tsx index f4dcb74188..14b4fd567b 100644 --- a/web/src/pages/agent/form/splitter-form/index.tsx +++ b/web/src/pages/agent/form/token-chunker-form/index.tsx @@ -1,4 +1,5 @@ import { DelimiterInput } from '@/components/delimiter-form-field'; +import { FormFieldType, RenderField } from '@/components/dynamic-form'; import { RAGFlowFormItem } from '@/components/ragflow-form'; import { SliderInputFormField } from '@/components/slider-input-form-field'; import { BlockButton, Button } from '@/components/ui/button'; @@ -10,7 +11,7 @@ import { memo } from 'react'; import { useFieldArray, useForm } from 'react-hook-form'; import { useTranslation } from 'react-i18next'; import { z } from 'zod'; -import { initialSplitterValues } from '../../constant/pipeline'; +import { initialTokenChunkerValues } from '../../constant/pipeline'; import { useFormValues } from '../../hooks/use-form-values'; import { useWatchFormChange } from '../../hooks/use-watch-form-change'; import { INextOperatorForm } from '../../interface'; @@ -18,7 +19,7 @@ import { buildOutputList } from '../../utils/build-output-list'; import { FormWrapper } from '../components/form-wrapper'; import { Output } from '../components/output'; -const outputList = buildOutputList(initialSplitterValues.outputs); +const outputList = buildOutputList(initialTokenChunkerValues.outputs); export const FormSchema = z.object({ chunk_token_size: z.number(), @@ -34,19 +35,27 @@ export const FormSchema = z.object({ value: z.string().optional(), }), ), - overlapped_percent: z.number(), // 0.0 - 0.3 , 0% - 30% + overlapped_percent: z.number(), + delimiter_mode: z.enum(['token_size', 'delimiter']).optional(), }); -export type 
SplitterFormSchemaType = z.infer; +export type TokenChunkerFormSchemaType = z.infer; -const SplitterForm = ({ node }: INextOperatorForm) => { - const defaultValues = useFormValues(initialSplitterValues, node); +const TokenChunkerForm = ({ node }: INextOperatorForm) => { + const defaultValues = useFormValues(initialTokenChunkerValues, node); const { t } = useTranslation(); + const formDefaultValues = { + ...defaultValues, + delimiter_mode: defaultValues.delimiter_mode || 'token_size', + }; + const form = useForm({ - defaultValues, + defaultValues: formDefaultValues, resolver: zodResolver(FormSchema), }); + + const delimiterMode = form.watch('delimiter_mode'); const name = 'delimiters'; const { fields, append, remove } = useFieldArray({ @@ -64,52 +73,73 @@ const SplitterForm = ({ node }: INextOperatorForm) => { return (
- - - -
- {t('flow.delimiters')} -
- {fields.map((field, index) => ( -
-
- - - -
- + + + {delimiterMode === 'token_size' && ( + <> + + + + + )} + + {delimiterMode === 'delimiter' && ( + <> +
+ {t('flow.delimiters')} +
+ {fields.map((field, index) => ( +
+
+ + + +
+ +
+ ))}
- ))} -
-
- append({ value: '\n' })}> - {t('common.add')} - + + append({ value: '\n' })}> + {t('common.add')} + + + )}
@@ -171,4 +201,4 @@ const SplitterForm = ({ node }: INextOperatorForm) => { ); }; -export default memo(SplitterForm); +export default memo(TokenChunkerForm); diff --git a/web/src/pages/agent/hooks/use-add-node.ts b/web/src/pages/agent/hooks/use-add-node.ts index 257307cf4b..3930ab8514 100644 --- a/web/src/pages/agent/hooks/use-add-node.ts +++ b/web/src/pages/agent/hooks/use-add-node.ts @@ -24,7 +24,6 @@ import { initialGithubValues, initialGoogleScholarValues, initialGoogleValues, - initialHierarchicalMergerValues, initialInvokeValues, initialIterationStartValues, initialIterationValues, @@ -38,11 +37,12 @@ import { initialRetrievalValues, initialRewriteQuestionValues, initialSearXNGValues, - initialSplitterValues, initialStringTransformValues, initialSwitchValues, initialTavilyExtractValues, initialTavilyValues, + initialTitleChunkerValues, + initialTokenChunkerValues, initialTokenizerValues, initialUserFillUpValues, initialVariableAggregatorValues, @@ -165,8 +165,8 @@ export const useInitializeOperatorParams = () => { [Operator.File]: {}, [Operator.Parser]: initialParserValues, [Operator.Tokenizer]: initialTokenizerValues, - [Operator.Splitter]: initialSplitterValues, - [Operator.HierarchicalMerger]: initialHierarchicalMergerValues, + [Operator.TokenChunker]: initialTokenChunkerValues, + [Operator.TitleChunker]: initialTitleChunkerValues, [Operator.Extractor]: { ...initialExtractorValues, llm_id: llmId, diff --git a/web/src/pages/agent/utils.ts b/web/src/pages/agent/utils.ts index a876d57389..d9586af8c3 100644 --- a/web/src/pages/agent/utils.ts +++ b/web/src/pages/agent/utils.ts @@ -39,9 +39,9 @@ import { import { BeginFormSchemaType } from './form/begin-form/schema'; import { DataOperationsFormSchemaType } from './form/data-operations-form'; import { ExtractorFormSchemaType } from './form/extractor-form'; -import { HierarchicalMergerFormSchemaType } from './form/hierarchical-merger-form'; import { ParserFormSchemaType } from './form/parser-form'; -import { 
SplitterFormSchemaType } from './form/splitter-form'; +import { TitleChunkerFormSchemaType } from './form/title-chunker-form'; +import { TokenChunkerFormSchemaType } from './form/token-chunker-form'; import { BeginQuery, IPosition } from './interface'; function buildAgentExceptionGoto(edges: Edge[], nodeId: string) { @@ -211,7 +211,10 @@ function transformParserParams(params: ParserFormSchemaType) { >((pre, cur) => { if (cur.fileFormat) { let filteredSetup: Partial< - ParserFormSchemaType['setups'][0] & { suffix: string[] } + ParserFormSchemaType['setups'][0] & { suffix: string[] } & { + two_column_check: boolean; + enable_multi_column: boolean; + } > = { output_format: cur.output_format, preprocess: cur.preprocess, @@ -224,6 +227,9 @@ function transformParserParams(params: ParserFormSchemaType) { ...filteredSetup, parse_method: cur.parse_method, lang: cur.lang, + vlm: { llm_id: cur.vlm?.llm_id }, + enable_multi_column: cur.enable_multi_column, + remove_toc: cur.remove_toc, }; // Only include TCADP parameters if TCADP Parser is selected if (cur.parse_method?.toLowerCase() === 'tcadp parser') { @@ -236,6 +242,7 @@ function transformParserParams(params: ParserFormSchemaType) { filteredSetup = { ...filteredSetup, parse_method: cur.parse_method, + vlm: { llm_id: cur.vlm?.llm_id }, }; // Only include TCADP parameters if TCADP Parser is selected if (cur.parse_method?.toLowerCase() === 'tcadp parser') { @@ -271,10 +278,12 @@ function transformParserParams(params: ParserFormSchemaType) { }; break; case FileType.Video: + case FileType.Docx: case FileType.Audio: + case FileType.TextMarkdown: filteredSetup = { ...filteredSetup, - llm_id: cur.llm_id, + vlm: { llm_id: cur.vlm?.llm_id }, }; break; default: @@ -289,13 +298,16 @@ function transformParserParams(params: ParserFormSchemaType) { return { ...params, setups }; } -function transformSplitterParams(params: SplitterFormSchemaType) { +function transformTokenChunkerParams(params: TokenChunkerFormSchemaType) { const { 
image_table_context_window, ...rest } = params; const imageTableContextWindow = Number(image_table_context_window || 0); return { ...rest, overlapped_percent: Number(params.overlapped_percent) / 100, - delimiters: transformObjectArrayToPureArray(params.delimiters, 'value'), + delimiters: + params.delimiter_mode === 'delimiter' + ? transformObjectArrayToPureArray(params.delimiters, 'value') + : [], table_context_size: imageTableContextWindow, image_context_size: imageTableContextWindow, @@ -306,14 +318,17 @@ function transformSplitterParams(params: SplitterFormSchemaType) { }; } -function transformHierarchicalMergerParams( - params: HierarchicalMergerFormSchemaType, -) { - const levels = params.levels.map((x) => - transformObjectArrayToPureArray(x.expressions, 'expression'), +function transformTitleChunkerParams(params: TitleChunkerFormSchemaType) { + const levels = params.rules.map((rule) => + transformObjectArrayToPureArray(rule.levels, 'expression'), ); - return { ...params, hierarchy: Number(params.hierarchy), levels }; + return { + method: params.method, + hierarchy: Number(params.hierarchy || 0), + include_heading_content: Boolean(params.include_heading_content), + levels, + }; } function transformExtractorParams(params: ExtractorFormSchemaType) { @@ -437,12 +452,12 @@ export const buildDslComponentsByGraph = ( params = transformParserParams(params); break; - case Operator.Splitter: - params = transformSplitterParams(params); + case Operator.TokenChunker: + params = transformTokenChunkerParams(params); break; - case Operator.HierarchicalMerger: - params = transformHierarchicalMergerParams(params); + case Operator.TitleChunker: + params = transformTitleChunkerParams(params); break; case Operator.Extractor: params = transformExtractorParams(params); diff --git a/web/src/pages/dataflow-result/components/time-line/index.tsx b/web/src/pages/dataflow-result/components/time-line/index.tsx index e153d92502..1e96eb216e 100644 --- 
a/web/src/pages/dataflow-result/components/time-line/index.tsx +++ b/web/src/pages/dataflow-result/components/time-line/index.tsx @@ -32,12 +32,12 @@ export const TimelineNodeObj = { title: 'Context Generator', icon: , }, - [TimelineNodeType.titleSplitter]: { - title: 'Title Splitter', + [TimelineNodeType.titleChunker]: { + title: 'Title Chunker', icon: , }, - [TimelineNodeType.characterSplitter]: { - title: 'Character Splitter', + [TimelineNodeType.tokenChunker]: { + title: 'Token Chunker', icon: , }, [TimelineNodeType.tokenizer]: { diff --git a/web/src/pages/dataflow-result/constant.ts b/web/src/pages/dataflow-result/constant.ts index 6d30ce122a..bf09b3b483 100644 --- a/web/src/pages/dataflow-result/constant.ts +++ b/web/src/pages/dataflow-result/constant.ts @@ -7,8 +7,8 @@ export enum TimelineNodeType { begin = 'file', parser = 'parser', contextGenerator = 'extractor', - titleSplitter = 'hierarchicalMerger', - characterSplitter = 'splitter', + titleChunker = 'titleChunker', + tokenChunker = 'tokenChunker', tokenizer = 'tokenizer', end = 'end', } diff --git a/web/src/pages/dataflow-result/hooks.ts b/web/src/pages/dataflow-result/hooks.ts index f0e4b8d1ed..4a83d3552e 100644 --- a/web/src/pages/dataflow-result/hooks.ts +++ b/web/src/pages/dataflow-result/hooks.ts @@ -233,10 +233,10 @@ export const useTimelineDataFlow = (data: IPipelineFileLogDetail) => { } else if (name === TimelineNodeType.tokenizer) { tempType = TimelineNodeType.tokenizer; } else if ( - name === TimelineNodeType.characterSplitter || - name === TimelineNodeType.titleSplitter + name === TimelineNodeType.tokenChunker || + name === TimelineNodeType.titleChunker ) { - tempType = TimelineNodeType.characterSplitter; + tempType = name; } const timeNode = { ...TimelineNodeObj[name], diff --git a/web/src/pages/dataflow-result/index.tsx b/web/src/pages/dataflow-result/index.tsx index ada172a17a..a0651bcd23 100644 --- a/web/src/pages/dataflow-result/index.tsx +++ b/web/src/pages/dataflow-result/index.tsx @@ 
-205,8 +205,8 @@ const DataflowResult = () => { )} */} {/* {currentTimeNode?.type === TimelineNodeType.parser && ( */} {(currentTimeNode?.type === TimelineNodeType.parser || - currentTimeNode?.type === TimelineNodeType.characterSplitter || - currentTimeNode?.type === TimelineNodeType.titleSplitter || + currentTimeNode?.type === TimelineNodeType.tokenChunker || + currentTimeNode?.type === TimelineNodeType.titleChunker || currentTimeNode?.type === TimelineNodeType.contextGenerator) && ( { ); const isChunck = - step?.type === TimelineNodeType.characterSplitter || - step?.type === TimelineNodeType.titleSplitter; + step?.type === TimelineNodeType.tokenChunker || + step?.type === TimelineNodeType.titleChunker; const handleCreateChunk = useCallback( (text: string) => { @@ -214,8 +214,8 @@ const ParserContainer = (props: IProps) => { isChunck={isChunck} textMode={textMode} isDelete={ - step?.type === TimelineNodeType.characterSplitter || - step?.type === TimelineNodeType.titleSplitter + step?.type === TimelineNodeType.tokenChunker || + step?.type === TimelineNodeType.titleChunker } clickChunk={clickChunk} handleCheckboxClick={handleCheckboxClick}