2024-10-21 11:38:41 +08:00
|
|
|
#
|
|
|
|
|
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
|
|
|
|
#
|
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
|
#
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
#
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
|
# limitations under the License.
|
|
|
|
|
#
|
fix(agent/tools): port Crawler to ToolBase so it can load and run (#16415)
### What problem does this PR solve?
Closes #16414.
The **Crawler** agent tool (`agent/tools/crawler.py`) was never ported
to the modern `ToolBase`/`_invoke` interface during the agent module
redesign, so it was broken in three independent ways:
1. **Crashed on construction.** `CrawlerParam` extends `ToolParamBase`,
whose `__init__` reads `self.meta["parameters"]`, but `CrawlerParam`
defined no `meta`. Constructing it raised `AttributeError:
'CrawlerParam' object has no attribute 'meta'`. Because
`agent/canvas.py` instantiates `component_class(component_name +
"Param")()` while loading a canvas, **any agent containing a Crawler
node failed to load.**
2. **`_invoke` missing.** It extends `ToolBase` (whose `invoke()`
dispatches to `self._invoke`) but only implemented the legacy `_run`, so
`_invoke` resolved to `ComponentBase._invoke` → `NotImplementedError`.
3. **`be_output` removed.** `_run` called `Crawler.be_output(...)`,
which no longer exists on the base classes.
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
### Changes
- Add a `ToolMeta` to `CrawlerParam` (defined before
`super().__init__()`, matching every other ported tool such as
`ArXivParam`/`TavilyExtractParam`) advertising a required `query`
parameter — the URL to crawl, default `{sys.query}`, consistent with the
`{sys.query}` convention shared by the other tools.
- Replace the legacy `_run`/`be_output` with `_invoke`/`set_output`,
writing the extracted page content to `formalized_content` (errors
surfaced via `_ERROR`), consistent with the other tools.
- Preserve the existing SSRF guard (`assert_url_is_safe` +
`pin_dns_global`).
- Add regression tests
(`test/unit_test/agent/component/test_crawler.py`) covering param
construction, validation, and the tool descriptor.
Same class of defect as #16329 (DeepL). Backend-only; no frontend
changes.
---------
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
2026-07-03 14:15:48 +05:00
|
|
|
import logging
|
|
|
|
|
import os
|
2024-10-21 11:38:41 +08:00
|
|
|
from abc import ABC
|
|
|
|
|
import asyncio
|
|
|
|
|
from crawl4ai import AsyncWebCrawler
|
fix(agent/tools): port Crawler to ToolBase so it can load and run (#16415)
### What problem does this PR solve?
Closes #16414.
The **Crawler** agent tool (`agent/tools/crawler.py`) was never ported
to the modern `ToolBase`/`_invoke` interface during the agent module
redesign, so it was broken in three independent ways:
1. **Crashed on construction.** `CrawlerParam` extends `ToolParamBase`,
whose `__init__` reads `self.meta["parameters"]`, but `CrawlerParam`
defined no `meta`. Constructing it raised `AttributeError:
'CrawlerParam' object has no attribute 'meta'`. Because
`agent/canvas.py` instantiates `component_class(component_name +
"Param")()` while loading a canvas, **any agent containing a Crawler
node failed to load.**
2. **`_invoke` missing.** It extends `ToolBase` (whose `invoke()`
dispatches to `self._invoke`) but only implemented the legacy `_run`, so
`_invoke` resolved to `ComponentBase._invoke` → `NotImplementedError`.
3. **`be_output` removed.** `_run` called `Crawler.be_output(...)`,
which no longer exists on the base classes.
### Type of change
- [x] Bug Fix (non-breaking change which fixes an issue)
### Changes
- Add a `ToolMeta` to `CrawlerParam` (defined before
`super().__init__()`, matching every other ported tool such as
`ArXivParam`/`TavilyExtractParam`) advertising a required `query`
parameter — the URL to crawl, default `{sys.query}`, consistent with the
`{sys.query}` convention shared by the other tools.
- Replace the legacy `_run`/`be_output` with `_invoke`/`set_output`,
writing the extracted page content to `formalized_content` (errors
surfaced via `_ERROR`), consistent with the other tools.
- Preserve the existing SSRF guard (`assert_url_is_safe` +
`pin_dns_global`).
- Add regression tests
(`test/unit_test/agent/component/test_crawler.py`) covering param
construction, validation, and the tool descriptor.
Same class of defect as #16329 (DeepL). Backend-only; no frontend
changes.
---------
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
2026-07-03 14:15:48 +05:00
|
|
|
from agent.tools.base import ToolMeta, ToolParamBase, ToolBase
|
|
|
|
|
from common.connection_utils import timeout
|
2025-08-28 18:40:32 +08:00
|
|
|
|
2024-10-21 11:38:41 +08:00
|
|
|
|
2025-07-30 19:41:09 +08:00
|
|
|
class CrawlerParam(ToolParamBase):
    """
    Parameter container for the Crawler tool.

    Carries the tool descriptor (``meta``) together with two settings:
    ``proxy`` (``None`` by default) and ``extract_type`` (``"markdown"`` by
    default; ``check`` accepts ``"html"``, ``"markdown"`` or ``"content"``).
    """

    def __init__(self):
        # The descriptor is assigned before the base initializer is called so
        # that ``self.meta`` already exists when ``super().__init__()`` runs.
        url_argument = {
            "type": "string",
            "description": "The absolute URL (including the http:// or https:// scheme) of the web page to crawl.",
            "default": "{sys.query}",
            "required": True,
        }
        self.meta: ToolMeta = {
            "name": "web_crawler",
            "description": "This tool can be used to crawl a web page and return its content as HTML, Markdown, or the extracted main text.",
            "parameters": {"query": url_argument},
        }
        super().__init__()

        self.proxy = None
        self.extract_type = "markdown"

    def check(self):
        """Validate ``extract_type`` against the supported output formats."""
        supported_types = ["html", "markdown", "content"]
        self.check_valid_value(self.extract_type, "Type of content from the crawler", supported_types)

    def get_input_form(self) -> dict[str, dict]:
        """Return the input form: a single line field labelled "URL" bound to ``query``."""
        url_field = {"name": "URL", "type": "line"}
        return {"query": url_field}
|
|
|
|
|
|
2024-10-21 11:38:41 +08:00
|
|
|
|
2025-07-30 19:41:09 +08:00
|
|
|
class Crawler(ToolBase, ABC):
    """
    Agent tool that crawls one web page with crawl4ai and returns its content.

    The URL is taken from the ``query`` keyword argument. It is checked with
    ``assert_url_is_safe`` before any request is made, and the crawl itself
    runs while ``pin_dns_global`` is active for the checked hostname/IP pair.
    """

    component_name = "Crawler"

    # Timeout value comes from COMPONENT_EXEC_TIMEOUT, falling back to 10 * 60
    # (presumably seconds - confirm against ``timeout``'s definition).
    @timeout(int(os.environ.get("COMPONENT_EXEC_TIMEOUT", 10 * 60)))
    def _invoke(self, **kwargs):
        """
        Crawl the page named by ``kwargs["query"]``.

        Outputs written via ``set_output``:
            formalized_content: the extracted page content ("" when no URL
                was given or the crawl produced nothing).
            _ERROR: "URL not valid" when the SSRF check rejects the URL, or
                "An unexpected error occurred: ..." when the crawl raises.

        Returns:
            The same string that was written to the output, or ``None`` when
            the task was canceled.
        """
        from common.ssrf_guard import assert_url_is_safe, pin_dns_global

        if self.check_if_canceled("Crawler processing"):
            return

        url = kwargs.get("query")
        if not url:
            self.set_output("formalized_content", "")
            return ""

        try:
            _ssrf_hostname, _ssrf_ip = assert_url_is_safe(url)
        except ValueError as e:
            # Keep the short user-facing message, but record why the URL was
            # rejected so that blocked requests can be diagnosed from logs.
            logging.warning("Crawler rejected URL %r: %s", url, e)
            msg = "URL not valid"
            self.set_output("_ERROR", msg)
            return msg

        try:
            # pin_dns_global is used (not thread-local) because crawl4ai resolves
            # DNS in asyncio executor threads that don't share thread-local state.
            with pin_dns_global(_ssrf_hostname, _ssrf_ip):
                result = asyncio.run(self.get_web(url))

            if self.check_if_canceled("Crawler processing"):
                return

            # get_web returns None when canceled; normalize falsy results to "".
            result = result or ""
            self.set_output("formalized_content", result)
            return result
        except Exception as e:
            # A failure that coincides with a cancellation is not reported.
            if self.check_if_canceled("Crawler processing"):
                return

            logging.exception("Crawler error: %s", e)
            msg = f"An unexpected error occurred: {str(e)}"
            self.set_output("_ERROR", msg)
            return msg

    async def get_web(self, url):
        """
        Fetch ``url`` with ``AsyncWebCrawler`` and pick the configured format.

        Returns ``cleaned_html`` for extract type "html", ``extracted_content``
        for "content", and ``markdown`` for "markdown" or any other value.
        Returns ``None`` if cancellation is detected before or after the crawl.
        """
        if self.check_if_canceled("Crawler async operation"):
            return

        proxy = self._param.proxy or None
        async with AsyncWebCrawler(verbose=True, proxy=proxy) as crawler:
            result = await crawler.arun(url=url, bypass_cache=True)

            if self.check_if_canceled("Crawler async operation"):
                return

            extract_type = self._param.extract_type
            if extract_type == "html":
                return result.cleaned_html
            if extract_type == "content":
                return result.extracted_content
            # "markdown" and any unrecognized type both yield markdown.
            return result.markdown
|