mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
Fix: SSRF in markdown parser remote image fetch (#15438)
### What problem does this PR solve? `rag/app/naive.py` `Markdown.load_images_from_urls` fetched image URLs parsed straight out of an untrusted uploaded markdown document via a raw `requests.get`, with no SSRF validation. Markdown chunking always reaches this path (`return_section_images=True`), so any authenticated user who uploads a `.md`/`.markdown`/`.mdx` file to a knowledge base could make the server issue requests to internal services or cloud-metadata endpoints, e.g. `![x](http://169.254.169.254/latest/meta-data/)`. The `image/` Content-Type check only gates decoding — the outbound request (the SSRF) always fires. This was the one user-controlled fetch site missed by the project's existing SSRF-hardening (`common/ssrf_guard.py`, already applied to the crawler, SearXNG, RSS connector, MCP/document APIs, and OAuth avatar download). The fix validates and DNS-pins every hop with `common.ssrf_guard.assert_url_is_safe` before connecting, and follows redirects manually so each redirect target is re-validated (closing the DNS-rebinding / redirect-bypass window), mirroring `common/data_source/rss_connector.py`. Blocked URLs are skipped and logged like any other unreachable image, so legitimate public images are unaffected. Adds a regression test at `test/unit_test/rag/app/test_markdown_image_ssrf.py`. Closes #15437 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --------- Co-authored-by: Ubuntu <ubuntu@ubuntu-2204.linuxvmimages.local> Co-authored-by: galuis116 <galuis116@users.noreply.github.com>
This commit is contained in:
@@ -643,6 +643,11 @@ class Pdf(PdfParser):
|
||||
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls
|
||||
|
||||
|
||||
# Maximum number of HTTP redirects followed when fetching a remote image
|
||||
# referenced by a markdown document (each hop is SSRF-validated).
|
||||
MAX_IMAGE_REDIRECTS = 5
|
||||
|
||||
|
||||
class Markdown(MarkdownParser):
|
||||
def md_to_html(self, sections):
|
||||
if not sections:
|
||||
@@ -714,6 +719,9 @@ class Markdown(MarkdownParser):
|
||||
def load_images_from_urls(self, urls, cache=None):
|
||||
import requests
|
||||
from pathlib import Path
|
||||
from urllib.parse import urljoin
|
||||
|
||||
from common.ssrf_guard import assert_url_is_safe, pin_dns
|
||||
|
||||
cache = cache or {}
|
||||
images = []
|
||||
@@ -725,9 +733,40 @@ class Markdown(MarkdownParser):
|
||||
img_obj = None
|
||||
try:
|
||||
if url.startswith(("http://", "https://")):
|
||||
response = requests.get(url, stream=True, timeout=30)
|
||||
if response.status_code == 200 and response.headers.get("Content-Type", "").startswith("image/"):
|
||||
img_obj = Image.open(BytesIO(response.content)).convert("RGB")
|
||||
# SSRF guard: image references come from the (untrusted) uploaded
|
||||
# document, so validate and DNS-pin every hop before connecting.
|
||||
# Otherwise a markdown image like ![x](http://169.254.169.254/latest/meta-data/)
|
||||
# would make the server fetch internal services / cloud metadata.
|
||||
# Redirects are followed manually so each hop is re-validated,
|
||||
# mirroring common/data_source/rss_connector.py.
|
||||
current_hostname, current_ip = assert_url_is_safe(url)
|
||||
current_url = url
|
||||
response = None
|
||||
try:
|
||||
for _ in range(MAX_IMAGE_REDIRECTS + 1):
|
||||
# Release the previous hop before opening the next: with
|
||||
# stream=True the connection isn't returned to the pool
|
||||
# until the body is read or the response is closed.
|
||||
if response is not None:
|
||||
response.close()
|
||||
with pin_dns(current_hostname, current_ip):
|
||||
response = requests.get(current_url, stream=True, timeout=30, allow_redirects=False)
|
||||
if response.status_code not in (301, 302, 303, 307, 308):
|
||||
break
|
||||
location = response.headers.get("Location")
|
||||
if not location:
|
||||
break
|
||||
current_url = urljoin(current_url, location)
|
||||
current_hostname, current_ip = assert_url_is_safe(current_url)
|
||||
else:
|
||||
raise ValueError(f"Exceeded {MAX_IMAGE_REDIRECTS} redirects fetching {url!r}")
|
||||
if response.status_code == 200 and response.headers.get("Content-Type", "").startswith("image/"):
|
||||
img_obj = Image.open(BytesIO(response.content)).convert("RGB")
|
||||
finally:
|
||||
# Always release the final/streamed response, including the
|
||||
# non-image and redirect-cap paths where the body is unread.
|
||||
if response is not None:
|
||||
response.close()
|
||||
else:
|
||||
local_path = Path(url)
|
||||
if local_path.exists():
|
||||
|
||||
Reference in New Issue
Block a user