mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
fix: The output of the parser in the ingestion pipeline contains HTML tags (#14920)
## Summary

This change fixes ingestion quality issues where MinerU parser output may contain HTML fragments (for example, table-related tags like `<tr>`, `<td>`, `<br>`), which were previously passed directly into chunking/tokenization and degraded chunk quality. The fix adds a sanitization step in the MinerU parser path so parsed sections are normalized to clean text before chunking.

## Change Type (select all)

- [x] Bug fix
- [x] Ingestion pipeline improvement
- [x] Parser/chunking quality fix

## Related Issue

- https://github.com/infiniflow/ragflow/issues/14831
This commit is contained in:
@@ -25,6 +25,7 @@ from api.utils.web_utils import get_float, safe_json_parse
|
||||
from common.constants import VALID_MCP_SERVER_TYPES
|
||||
from common.mcp_tool_call_conn import MCPToolCallSession, close_multiple_mcp_toolcall_sessions
|
||||
from common.misc_utils import get_uuid, thread_pool_exec
|
||||
from common.ssrf_guard import assert_url_is_safe, pin_dns_global
|
||||
|
||||
|
||||
def _get_mcp_ids_from_args() -> list[str]:
|
||||
@@ -55,6 +56,16 @@ def _export_mcp_servers(mcp_ids: list[str]) -> dict | None:
|
||||
return {"mcpServers": exported_servers}
|
||||
|
||||
|
||||
def _assert_mcp_url_is_safe(url, invalid_message: str = "Invalid url.") -> tuple[str, str, str | None]:
|
||||
if not isinstance(url, str) or not url:
|
||||
return "", "", invalid_message
|
||||
try:
|
||||
hostname, resolved_ip = assert_url_is_safe(url)
|
||||
except ValueError as exc:
|
||||
return "", "", str(exc)
|
||||
return hostname, resolved_ip, None
|
||||
|
||||
|
||||
@manager.route("/mcp/servers", methods=["GET"]) # noqa: F821
|
||||
@login_required
|
||||
async def list_mcp() -> Response:
|
||||
@@ -119,8 +130,9 @@ async def create() -> Response:
|
||||
return get_data_error_result(message="Duplicated MCP server name.")
|
||||
|
||||
url = req.get("url", "")
|
||||
if not url:
|
||||
return get_data_error_result(message="Invalid url.")
|
||||
hostname, resolved_ip, url_error = _assert_mcp_url_is_safe(url)
|
||||
if url_error:
|
||||
return get_data_error_result(message=url_error)
|
||||
|
||||
headers = safe_json_parse(req.get("headers", {}))
|
||||
req["headers"] = headers
|
||||
@@ -138,7 +150,8 @@ async def create() -> Response:
|
||||
return get_data_error_result(message="Tenant not found.")
|
||||
|
||||
mcp_server = MCPServer(id=server_name, name=server_name, url=url, server_type=server_type, variables=variables, headers=headers)
|
||||
server_tools, err_message = await thread_pool_exec(get_mcp_tools, [mcp_server], timeout)
|
||||
with pin_dns_global(hostname, resolved_ip):
|
||||
server_tools, err_message = await thread_pool_exec(get_mcp_tools, [mcp_server], timeout)
|
||||
if err_message:
|
||||
return get_data_error_result(message=err_message)
|
||||
|
||||
@@ -171,8 +184,9 @@ async def update(mcp_id: str) -> Response:
|
||||
if server_name and len(server_name.encode("utf-8")) > 255:
|
||||
return get_data_error_result(message=f"Invalid MCP name or length is {len(server_name)} which is large than 255.")
|
||||
url = req.get("url", mcp_server.url)
|
||||
if not url:
|
||||
return get_data_error_result(message="Invalid url.")
|
||||
hostname, resolved_ip, url_error = _assert_mcp_url_is_safe(url)
|
||||
if url_error:
|
||||
return get_data_error_result(message=url_error)
|
||||
|
||||
headers = safe_json_parse(req.get("headers", mcp_server.headers))
|
||||
req["headers"] = headers
|
||||
@@ -187,7 +201,8 @@ async def update(mcp_id: str) -> Response:
|
||||
req["id"] = mcp_id
|
||||
|
||||
mcp_server = MCPServer(id=server_name, name=server_name, url=url, server_type=server_type, variables=variables, headers=headers)
|
||||
server_tools, err_message = await thread_pool_exec(get_mcp_tools, [mcp_server], timeout)
|
||||
with pin_dns_global(hostname, resolved_ip):
|
||||
server_tools, err_message = await thread_pool_exec(get_mcp_tools, [mcp_server], timeout)
|
||||
if err_message:
|
||||
return get_data_error_result(message=err_message)
|
||||
|
||||
@@ -244,6 +259,13 @@ async def import_multiple() -> Response:
|
||||
if not server_name or len(server_name.encode("utf-8")) > 255:
|
||||
results.append({"server": server_name, "success": False, "message": f"Invalid MCP name or length is {len(server_name)} which is large than 255."})
|
||||
continue
|
||||
if config["type"] not in VALID_MCP_SERVER_TYPES:
|
||||
results.append({"server": server_name, "success": False, "message": "Unsupported MCP server type."})
|
||||
continue
|
||||
hostname, resolved_ip, url_error = _assert_mcp_url_is_safe(config["url"])
|
||||
if url_error:
|
||||
results.append({"server": server_name, "success": False, "message": url_error})
|
||||
continue
|
||||
|
||||
base_name = server_name
|
||||
new_name = base_name
|
||||
@@ -268,7 +290,8 @@ async def import_multiple() -> Response:
|
||||
headers = {"authorization_token": config["authorization_token"]} if "authorization_token" in config else {}
|
||||
variables = {k: v for k, v in config.items() if k not in {"type", "url", "headers"}}
|
||||
mcp_server = MCPServer(id=new_name, name=new_name, url=config["url"], server_type=config["type"], variables=variables, headers=headers)
|
||||
server_tools, err_message = await thread_pool_exec(get_mcp_tools, [mcp_server], timeout)
|
||||
with pin_dns_global(hostname, resolved_ip):
|
||||
server_tools, err_message = await thread_pool_exec(get_mcp_tools, [mcp_server], timeout)
|
||||
if err_message:
|
||||
results.append({"server": base_name, "success": False, "message": err_message})
|
||||
continue
|
||||
@@ -297,13 +320,17 @@ async def test_mcp(mcp_id: str) -> Response:
|
||||
req = await get_request_json()
|
||||
|
||||
url = req.get("url", "")
|
||||
if not url:
|
||||
if not isinstance(url, str) or not url:
|
||||
return get_data_error_result(message="Invalid MCP url.")
|
||||
|
||||
server_type = req.get("server_type", "")
|
||||
if server_type not in VALID_MCP_SERVER_TYPES:
|
||||
return get_data_error_result(message="Unsupported MCP server type.")
|
||||
|
||||
hostname, resolved_ip, url_error = _assert_mcp_url_is_safe(url, "Invalid MCP url.")
|
||||
if url_error:
|
||||
return get_data_error_result(message=url_error)
|
||||
|
||||
timeout = get_float(req, "timeout", 10)
|
||||
headers = safe_json_parse(req.get("headers", {}))
|
||||
variables = safe_json_parse(req.get("variables", {}))
|
||||
@@ -312,14 +339,15 @@ async def test_mcp(mcp_id: str) -> Response:
|
||||
|
||||
result = []
|
||||
try:
|
||||
tool_call_session = MCPToolCallSession(mcp_server, mcp_server.variables)
|
||||
with pin_dns_global(hostname, resolved_ip):
|
||||
tool_call_session = MCPToolCallSession(mcp_server, mcp_server.variables)
|
||||
|
||||
try:
|
||||
tools = await thread_pool_exec(tool_call_session.get_tools, timeout)
|
||||
except Exception as e:
|
||||
return get_data_error_result(message=f"Test MCP error: {e}")
|
||||
finally:
|
||||
await thread_pool_exec(close_multiple_mcp_toolcall_sessions, [tool_call_session])
|
||||
try:
|
||||
tools = await thread_pool_exec(tool_call_session.get_tools, timeout)
|
||||
except Exception as e:
|
||||
return get_data_error_result(message=f"Test MCP error: {e}")
|
||||
finally:
|
||||
await thread_pool_exec(close_multiple_mcp_toolcall_sessions, [tool_call_session])
|
||||
|
||||
for tool in tools:
|
||||
tool_dict = tool.model_dump()
|
||||
|
||||
Reference in New Issue
Block a user