mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
Fix: SSRF in markdown parser remote image fetch (#15438)
### What problem does this PR solve? `rag/app/naive.py` `Markdown.load_images_from_urls` fetched image URLs parsed straight out of an untrusted uploaded markdown document via a raw `requests.get`, with no SSRF validation. Markdown chunking always reaches this path (`return_section_images=True`), so any authenticated user who uploads a `.md`/`.markdown`/`.mdx` file to a knowledge base could make the server issue requests to internal services or cloud-metadata endpoints, e.g. ``. The `image/` Content-Type check only gates decoding — the outbound request (the SSRF) always fires. This was the one user-controlled fetch site missed by the project's existing SSRF-hardening (`common/ssrf_guard.py`, already applied to the crawler, SearXNG, RSS connector, MCP/document APIs, and OAuth avatar download). The fix validates and DNS-pins every hop with `common.ssrf_guard.assert_url_is_safe` before connecting, and follows redirects manually so each redirect target is re-validated (closing the DNS-rebinding / redirect-bypass window), mirroring `common/data_source/rss_connector.py`. Blocked URLs are skipped and logged like any other unreachable image, so legitimate public images are unaffected. Adds a regression test at `test/unit_test/rag/app/test_markdown_image_ssrf.py`. Closes #15437 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --------- Co-authored-by: Ubuntu <ubuntu@ubuntu-2204.linuxvmimages.local> Co-authored-by: galuis116 <galuis116@users.noreply.github.com>
This commit is contained in:
@@ -643,6 +643,11 @@ class Pdf(PdfParser):
|
||||
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls
|
||||
|
||||
|
||||
# Maximum number of HTTP redirects followed when fetching a remote image
|
||||
# referenced by a markdown document (each hop is SSRF-validated).
|
||||
MAX_IMAGE_REDIRECTS = 5
|
||||
|
||||
|
||||
class Markdown(MarkdownParser):
|
||||
def md_to_html(self, sections):
|
||||
if not sections:
|
||||
@@ -714,6 +719,9 @@ class Markdown(MarkdownParser):
|
||||
def load_images_from_urls(self, urls, cache=None):
|
||||
import requests
|
||||
from pathlib import Path
|
||||
from urllib.parse import urljoin
|
||||
|
||||
from common.ssrf_guard import assert_url_is_safe, pin_dns
|
||||
|
||||
cache = cache or {}
|
||||
images = []
|
||||
@@ -725,9 +733,40 @@ class Markdown(MarkdownParser):
|
||||
img_obj = None
|
||||
try:
|
||||
if url.startswith(("http://", "https://")):
|
||||
response = requests.get(url, stream=True, timeout=30)
|
||||
if response.status_code == 200 and response.headers.get("Content-Type", "").startswith("image/"):
|
||||
img_obj = Image.open(BytesIO(response.content)).convert("RGB")
|
||||
# SSRF guard: image references come from the (untrusted) uploaded
|
||||
# document, so validate and DNS-pin every hop before connecting.
|
||||
# Otherwise a markdown image like 
|
||||
# would make the server fetch internal services / cloud metadata.
|
||||
# Redirects are followed manually so each hop is re-validated,
|
||||
# mirroring common/data_source/rss_connector.py.
|
||||
current_hostname, current_ip = assert_url_is_safe(url)
|
||||
current_url = url
|
||||
response = None
|
||||
try:
|
||||
for _ in range(MAX_IMAGE_REDIRECTS + 1):
|
||||
# Release the previous hop before opening the next: with
|
||||
# stream=True the connection isn't returned to the pool
|
||||
# until the body is read or the response is closed.
|
||||
if response is not None:
|
||||
response.close()
|
||||
with pin_dns(current_hostname, current_ip):
|
||||
response = requests.get(current_url, stream=True, timeout=30, allow_redirects=False)
|
||||
if response.status_code not in (301, 302, 303, 307, 308):
|
||||
break
|
||||
location = response.headers.get("Location")
|
||||
if not location:
|
||||
break
|
||||
current_url = urljoin(current_url, location)
|
||||
current_hostname, current_ip = assert_url_is_safe(current_url)
|
||||
else:
|
||||
raise ValueError(f"Exceeded {MAX_IMAGE_REDIRECTS} redirects fetching {url!r}")
|
||||
if response.status_code == 200 and response.headers.get("Content-Type", "").startswith("image/"):
|
||||
img_obj = Image.open(BytesIO(response.content)).convert("RGB")
|
||||
finally:
|
||||
# Always release the final/streamed response, including the
|
||||
# non-image and redirect-cap paths where the body is unread.
|
||||
if response is not None:
|
||||
response.close()
|
||||
else:
|
||||
local_path = Path(url)
|
||||
if local_path.exists():
|
||||
|
||||
140
test/unit_test/rag/app/test_markdown_image_ssrf.py
Normal file
140
test/unit_test/rag/app/test_markdown_image_ssrf.py
Normal file
@@ -0,0 +1,140 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
#
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License on an "AS IS" BASIS, WITHOUT WARRANTIES
|
||||
# OR CONDITIONS OF ANY KIND, either express or implied. See the License
|
||||
# for the specific language governing permissions and limitations under
|
||||
# the License.
|
||||
#
|
||||
|
||||
"""SSRF-guard regression tests for rag.app.naive.Markdown.load_images_from_urls.
|
||||
|
||||
Image references are parsed out of the (untrusted) uploaded markdown document
|
||||
and fetched server-side, so the loader must validate + DNS-pin every hop before
|
||||
connecting. These tests assert that internal/loopback targets are rejected,
|
||||
that redirects to internal targets are rejected, and that a legitimate public
|
||||
image is still fetched.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import sys
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
# Mock heavy modules that trigger ONNX/OCR model loading or optional parser
|
||||
# backends at import time so importing rag.app.naive stays lightweight.
|
||||
for _mod in [
|
||||
"deepdoc.vision.ocr",
|
||||
"deepdoc.parser.figure_parser",
|
||||
"deepdoc.parser.docling_parser",
|
||||
"deepdoc.parser.tcadp_parser",
|
||||
"rag.app.picture",
|
||||
]:
|
||||
if _mod not in sys.modules:
|
||||
sys.modules[_mod] = MagicMock()
|
||||
|
||||
import pytest
|
||||
from PIL import Image
|
||||
|
||||
from common import ssrf_guard
|
||||
from rag.app.naive import MAX_IMAGE_REDIRECTS, Markdown
|
||||
|
||||
|
||||
def _png_bytes() -> bytes:
|
||||
buf = io.BytesIO()
|
||||
Image.new("RGB", (1, 1), (255, 0, 0)).save(buf, format="PNG")
|
||||
return buf.getvalue()
|
||||
|
||||
|
||||
class _Resp:
|
||||
"""Minimal stand-in for requests.Response."""
|
||||
|
||||
def __init__(self, status_code, headers=None, content=b""):
|
||||
self.status_code = status_code
|
||||
self.headers = headers or {}
|
||||
self.content = content
|
||||
|
||||
def close(self):
|
||||
pass
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def parser():
|
||||
return Markdown(128)
|
||||
|
||||
|
||||
@pytest.mark.p1
|
||||
def test_blocks_internal_url_without_fetching(parser):
|
||||
"""A markdown image pointing at an internal host must never be requested."""
|
||||
with (
|
||||
patch.object(ssrf_guard, "assert_url_is_safe", side_effect=ValueError("non-public")) as guard,
|
||||
patch("requests.get") as get,
|
||||
):
|
||||
images, cache = parser.load_images_from_urls(["http://169.254.169.254/latest/meta-data/"])
|
||||
|
||||
guard.assert_called_once()
|
||||
get.assert_not_called() # SSRF guard rejects before any connection is made
|
||||
assert images == []
|
||||
assert cache["http://169.254.169.254/latest/meta-data/"] is None
|
||||
|
||||
|
||||
@pytest.mark.p1
|
||||
def test_blocks_redirect_to_internal_target(parser):
|
||||
"""A public URL that 302-redirects to a loopback target must be rejected."""
|
||||
|
||||
def selective_assert(url, **kwargs):
|
||||
if "127.0.0.1" in url or "localhost" in url:
|
||||
raise ValueError("redirect resolves to non-public address")
|
||||
return ("public.example", "8.8.8.8")
|
||||
|
||||
redirect = _Resp(302, headers={"Location": "http://127.0.0.1/secret"})
|
||||
with (
|
||||
patch.object(ssrf_guard, "assert_url_is_safe", side_effect=selective_assert),
|
||||
patch("requests.get", return_value=redirect) as get,
|
||||
):
|
||||
images, _ = parser.load_images_from_urls(["http://public.example/logo.png"])
|
||||
|
||||
# Only the first (public) hop is fetched; the redirect target is blocked
|
||||
# by re-validation before a second request is made.
|
||||
assert get.call_count == 1
|
||||
assert images == []
|
||||
|
||||
|
||||
@pytest.mark.p1
|
||||
def test_fetches_legitimate_public_image(parser):
|
||||
png = _png_bytes()
|
||||
ok = _Resp(200, headers={"Content-Type": "image/png"}, content=png)
|
||||
with (
|
||||
patch.object(ssrf_guard, "assert_url_is_safe", return_value=("public.example", "8.8.8.8")),
|
||||
patch("requests.get", return_value=ok) as get,
|
||||
):
|
||||
images, cache = parser.load_images_from_urls(["http://public.example/logo.png"])
|
||||
|
||||
get.assert_called_once()
|
||||
# allow_redirects must be disabled so redirects are validated per hop.
|
||||
assert get.call_args.kwargs.get("allow_redirects") is False
|
||||
assert len(images) == 1
|
||||
assert isinstance(images[0], Image.Image)
|
||||
|
||||
|
||||
@pytest.mark.p1
|
||||
def test_redirect_chain_is_bounded(parser):
|
||||
"""An endless redirect loop is abandoned instead of being followed forever."""
|
||||
loop = _Resp(302, headers={"Location": "http://public.example/next"})
|
||||
with (
|
||||
patch.object(ssrf_guard, "assert_url_is_safe", return_value=("public.example", "8.8.8.8")),
|
||||
patch("requests.get", return_value=loop) as get,
|
||||
):
|
||||
images, _ = parser.load_images_from_urls(["http://public.example/start"])
|
||||
|
||||
assert get.call_count == MAX_IMAGE_REDIRECTS + 1
|
||||
assert images == []
|
||||
Reference in New Issue
Block a user