Files
ragflow/agent/tools/crawler.py
Muhammad Furqan 3cba34d67f fix(agent/tools): port Crawler to ToolBase so it can load and run (#16415)
### What problem does this PR solve?

Closes #16414.

The **Crawler** agent tool (`agent/tools/crawler.py`) was never ported
to the modern `ToolBase`/`_invoke` interface during the agent module
redesign, so it was broken in three independent ways:

1. **Crashed on construction.** `CrawlerParam` extends `ToolParamBase`,
whose `__init__` reads `self.meta["parameters"]`, but `CrawlerParam`
defined no `meta`. Constructing it raised `AttributeError:
'CrawlerParam' object has no attribute 'meta'`. Because
`agent/canvas.py` instantiates `component_class(component_name +
"Param")()` while loading a canvas, **any agent containing a Crawler
node failed to load.**
2. **`_invoke` missing.** `Crawler` extends `ToolBase` (whose `invoke()`
dispatches to `self._invoke`) but only implemented the legacy `_run`, so
`_invoke` resolved to `ComponentBase._invoke` → `NotImplementedError`.
3. **`be_output` removed.** `_run` called `Crawler.be_output(...)`,
which no longer exists on the base classes.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

### Changes

- Add a `ToolMeta` to `CrawlerParam` (defined before
`super().__init__()`, matching every other ported tool such as
`ArXivParam`/`TavilyExtractParam`) advertising a required `query`
parameter — the URL to crawl, default `{sys.query}`, consistent with the
`{sys.query}` convention shared by the other tools.
- Replace the legacy `_run`/`be_output` with `_invoke`/`set_output`,
writing the extracted page content to `formalized_content` (errors
surfaced via `_ERROR`), consistent with the other tools.
- Preserve the existing SSRF guard (`assert_url_is_safe` +
`pin_dns_global`).
- Add regression tests
(`test/unit_test/agent/component/test_crawler.py`) covering param
construction, validation, and the tool descriptor.

Same class of defect as #16329 (DeepL). Backend-only; no frontend
changes.

---------

Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
2026-07-03 17:15:48 +08:00

120 lines
4.0 KiB
Python

#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import logging
import os
from abc import ABC
import asyncio
from crawl4ai import AsyncWebCrawler
from agent.tools.base import ToolMeta, ToolParamBase, ToolBase
from common.connection_utils import timeout
class CrawlerParam(ToolParamBase):
    """
    Parameters for the Crawler tool.

    Attributes:
        meta: Tool descriptor for the ``web_crawler`` tool; it advertises a
            single required ``query`` parameter holding the URL to crawl.
        proxy: Proxy setting handed to the crawler; ``None`` by default.
        extract_type: Which representation of the page to return, one of
            ``"html"``, ``"markdown"`` (the default) or ``"content"``.
    """

    def __init__(self):
        # The descriptor has to be in place before the base initializer runs.
        query_spec = {
            "type": "string",
            "description": "The absolute URL (including the http:// or https:// scheme) of the web page to crawl.",
            "default": "{sys.query}",
            "required": True,
        }
        self.meta: ToolMeta = {
            "name": "web_crawler",
            "description": "This tool can be used to crawl a web page and return its content as HTML, Markdown, or the extracted main text.",
            "parameters": {"query": query_spec},
        }
        super().__init__()
        self.proxy = None
        self.extract_type = "markdown"

    def check(self):
        """Validate ``extract_type`` against the supported output kinds."""
        supported_kinds = ["html", "markdown", "content"]
        self.check_valid_value(
            self.extract_type,
            "Type of content from the crawler",
            supported_kinds,
        )

    def get_input_form(self) -> dict[str, dict]:
        """Return the input form: one ``query`` field named "URL" of type "line"."""
        url_field = {"name": "URL", "type": "line"}
        return {"query": url_field}
class Crawler(ToolBase, ABC):
    """
    Tool that fetches a single web page through crawl4ai and outputs its
    content in the representation selected by the ``extract_type`` parameter.
    """

    component_name = "Crawler"

    @timeout(int(os.environ.get("COMPONENT_EXEC_TIMEOUT", 10 * 60)))
    def _invoke(self, **kwargs):
        """
        Crawl the URL passed as ``query`` and publish the page content.

        The call is wrapped by the project's ``timeout`` decorator, configured
        from the ``COMPONENT_EXEC_TIMEOUT`` environment variable (falling back
        to ``10 * 60``).

        Args:
            **kwargs: Tool arguments; only ``query`` (the URL) is read.

        Returns:
            The page content (also stored in the ``formalized_content``
            output), ``""`` when no URL was supplied, an error message (also
            stored in the ``_ERROR`` output) when the URL is rejected or the
            crawl fails, or ``None`` when the task was canceled.
        """
        from common.ssrf_guard import assert_url_is_safe, pin_dns_global

        cancel_label = "Crawler processing"
        if self.check_if_canceled(cancel_label):
            return None

        target_url = kwargs.get("query")
        if not target_url:
            # Nothing to crawl: report empty content rather than an error.
            self.set_output("formalized_content", "")
            return ""

        # SSRF guard: refuse the URL before any network access happens.
        try:
            safe_host, safe_ip = assert_url_is_safe(target_url)
        except ValueError:
            rejection = "URL not valid"
            self.set_output("_ERROR", rejection)
            return rejection

        try:
            # pin_dns_global is used (not thread-local) because crawl4ai resolves
            # DNS in asyncio executor threads that don't share thread-local state.
            with pin_dns_global(safe_host, safe_ip):
                page = asyncio.run(self.get_web(target_url))

            if self.check_if_canceled(cancel_label):
                return None

            # get_web may yield None (e.g. when canceled); normalize to a string.
            if not page:
                page = ""
            self.set_output("formalized_content", page)
            return page
        except Exception as e:
            # A failure caused by cancellation is not reported as an error.
            if self.check_if_canceled(cancel_label):
                return None
            logging.exception(f"Crawler error: {e}")
            failure = f"An unexpected error occurred: {str(e)}"
            self.set_output("_ERROR", failure)
            return failure

    async def get_web(self, url):
        """
        Fetch ``url`` with ``AsyncWebCrawler`` and return the selected content.

        Args:
            url: The page to crawl.

        Returns:
            ``cleaned_html``, ``markdown`` or ``extracted_content`` of the crawl
            result, chosen by ``extract_type`` (any other value falls back to
            ``markdown``); ``None`` when the task was canceled before or after
            the fetch.
        """
        cancel_label = "Crawler async operation"
        if self.check_if_canceled(cancel_label):
            return None

        # Any falsy proxy setting is passed on as None.
        proxy_setting = self._param.proxy or None
        async with AsyncWebCrawler(verbose=True, proxy=proxy_setting) as crawler:
            crawl_result = await crawler.arun(url=url, bypass_cache=True)

            if self.check_if_canceled(cancel_label):
                return None

            # Map each extract type to the crawl-result attribute holding it.
            attribute_by_kind = (
                ("html", "cleaned_html"),
                ("markdown", "markdown"),
                ("content", "extracted_content"),
            )
            wanted_kind = self._param.extract_type
            for kind, attribute in attribute_by_kind:
                if wanted_kind == kind:
                    return getattr(crawl_result, attribute)
            return crawl_result.markdown