mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-04 09:39:32 +08:00
### What problem does this PR solve?

Closes #16414.

The **Crawler** agent tool (`agent/tools/crawler.py`) was never ported to the modern `ToolBase`/`_invoke` interface during the agent module redesign, so it was broken in three independent ways:

1. **Crashed on construction.** `CrawlerParam` extends `ToolParamBase`, whose `__init__` reads `self.meta["parameters"]`, but `CrawlerParam` defined no `meta`. Constructing it raised `AttributeError: 'CrawlerParam' object has no attribute 'meta'`. Because `agent/canvas.py` instantiates `component_class(component_name + "Param")()` while loading a canvas, **any agent containing a Crawler node failed to load.**
2. **`_invoke` missing.** `Crawler` extends `ToolBase` (whose `invoke()` dispatches to `self._invoke`) but only implemented the legacy `_run`, so `_invoke` resolved to `ComponentBase._invoke` → `NotImplementedError`.
3. **`be_output` removed.** `_run` called `Crawler.be_output(...)`, which no longer exists on the base classes.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

### Changes

- Add a `ToolMeta` to `CrawlerParam` (defined before `super().__init__()`, matching every other ported tool such as `ArXivParam`/`TavilyExtractParam`) advertising a required `query` parameter — the URL to crawl, default `{sys.query}`, consistent with the `{sys.query}` convention shared by the other tools.
- Replace the legacy `_run`/`be_output` with `_invoke`/`set_output`, writing the extracted page content to `formalized_content` (errors surfaced via `_ERROR`), consistent with the other tools.
- Preserve the existing SSRF guard (`assert_url_is_safe` + `pin_dns_global`).
- Add regression tests (`test/unit_test/agent/component/test_crawler.py`) covering param construction, validation, and the tool descriptor.

Same class of defect as #16329 (DeepL). Backend-only; no frontend changes.

---------

Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
120 lines
4.0 KiB
Python
120 lines
4.0 KiB
Python
#
|
|
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
import logging
|
|
import os
|
|
from abc import ABC
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler
|
|
from agent.tools.base import ToolMeta, ToolParamBase, ToolBase
|
|
from common.connection_utils import timeout
|
|
|
|
|
|
class CrawlerParam(ToolParamBase):
    """
    Parameter container for the Crawler tool.

    Attributes set here:
        meta: tool descriptor advertising a single required ``query``
            parameter (the URL to crawl, defaulting to ``{sys.query}``).
        proxy: optional proxy handed to the crawler; ``None`` by default.
        extract_type: which representation of the page to return; one of
            ``"html"``, ``"markdown"`` (default) or ``"content"``.
    """

    def __init__(self):
        # The descriptor is assigned before super().__init__() is called;
        # presumably the base initializer reads ``self.meta`` - keep this order.
        url_parameter = {
            "type": "string",
            "description": "The absolute URL (including the http:// or https:// scheme) of the web page to crawl.",
            "default": "{sys.query}",
            "required": True,
        }
        self.meta: ToolMeta = {
            "name": "web_crawler",
            "description": "This tool can be used to crawl a web page and return its content as HTML, Markdown, or the extracted main text.",
            "parameters": {"query": url_parameter},
        }
        super().__init__()
        self.proxy = None
        self.extract_type = "markdown"

    def check(self):
        """Validate that ``extract_type`` is one of the supported values."""
        allowed_extract_types = ["html", "markdown", "content"]
        self.check_valid_value(self.extract_type, "Type of content from the crawler", allowed_extract_types)

    def get_input_form(self) -> dict[str, dict]:
        """Describe the single user-facing input: a one-line ``URL`` field."""
        url_field = {"name": "URL", "type": "line"}
        return {"query": url_field}
|
|
|
|
|
|
class Crawler(ToolBase, ABC):
    """Agent tool that fetches one web page through crawl4ai.

    The page is returned in the representation selected by
    ``self._param.extract_type`` and published on the ``formalized_content``
    output; failures are published on the ``_ERROR`` output instead.
    """

    component_name = "Crawler"

    # Wrapped by the project's ``timeout`` decorator; the limit comes from the
    # COMPONENT_EXEC_TIMEOUT environment variable (default 10 * 60).
    @timeout(int(os.environ.get("COMPONENT_EXEC_TIMEOUT", 10 * 60)))
    def _invoke(self, **kwargs):
        """Crawl the URL passed as ``query`` and publish the result.

        Args:
            **kwargs: tool arguments; only ``query`` (the URL) is read.

        Returns:
            The extracted page content (``""`` when no URL was given or the
            page produced nothing), an error message string when the URL is
            rejected or the crawl fails, or ``None`` when
            ``check_if_canceled`` reports a cancellation.
        """
        from common.ssrf_guard import assert_url_is_safe, pin_dns_global

        if self.check_if_canceled("Crawler processing"):
            return

        url = kwargs.get("query")
        # Values substituted from user input often carry stray whitespace or a
        # trailing newline; trim it so an otherwise valid URL is not rejected.
        if isinstance(url, str):
            url = url.strip()
        if not url:
            self.set_output("formalized_content", "")
            return ""

        try:
            _ssrf_hostname, _ssrf_ip = assert_url_is_safe(url)
        except ValueError:
            msg = "URL not valid"
            self.set_output("_ERROR", msg)
            return msg

        try:
            # pin_dns_global is used (not thread-local) because crawl4ai resolves
            # DNS in asyncio executor threads that don't share thread-local state.
            with pin_dns_global(_ssrf_hostname, _ssrf_ip):
                result = asyncio.run(self.get_web(url))

            if self.check_if_canceled("Crawler processing"):
                return

            # get_web may return None (cancelled mid-flight or empty field).
            result = result or ""
            self.set_output("formalized_content", result)
            return result
        except Exception as e:
            if self.check_if_canceled("Crawler processing"):
                return

            # logging.exception already records the traceback; pass the
            # exception as a lazy argument instead of pre-formatting it.
            logging.exception("Crawler error: %s", e)
            msg = f"An unexpected error occurred: {str(e)}"
            self.set_output("_ERROR", msg)
            return msg

    async def get_web(self, url):
        """Fetch ``url`` with crawl4ai and return the configured representation.

        Args:
            url: the (already validated) URL to crawl.

        Returns:
            ``cleaned_html``, ``extracted_content`` or ``markdown`` of the
            crawl result depending on ``self._param.extract_type`` (markdown
            is the fallback), or ``None`` when ``check_if_canceled`` reports
            a cancellation.

        Raises:
            RuntimeError: when the crawl result reports ``success`` as false.
        """
        if self.check_if_canceled("Crawler async operation"):
            return

        proxy = self._param.proxy or None
        async with AsyncWebCrawler(verbose=True, proxy=proxy) as crawler:
            result = await crawler.arun(url=url, bypass_cache=True)

        if self.check_if_canceled("Crawler async operation"):
            return

        # crawl4ai reports a failed crawl through the result object rather
        # than by raising; surface it so the caller records it on _ERROR
        # instead of silently emitting empty content. The getattr defaults keep
        # the previous behaviour if a result lacks these fields.
        if not getattr(result, "success", True):
            reason = getattr(result, "error_message", None) or "unknown error"
            raise RuntimeError(f"Failed to crawl {url}: {reason}")

        extract_type = self._param.extract_type
        if extract_type == "html":
            return result.cleaned_html
        if extract_type == "content":
            return result.extracted_content
        # "markdown" and any unrecognised value both yield markdown.
        return result.markdown
|