Fix: SSRF in markdown parser remote image fetch (#15438)

### What problem does this PR solve?

`Markdown.load_images_from_urls` in `rag/app/naive.py` fetched image URLs parsed straight out of an untrusted uploaded markdown document via a raw `requests.get`, with no SSRF validation. Markdown chunking always reaches this path (`return_section_images=True`), so any authenticated user who uploads a `.md`/`.markdown`/`.mdx` file to a knowledge base could make the server issue requests to internal services or cloud-metadata endpoints, e.g. `![x](http://169.254.169.254/latest/meta-data/...)`. The `image/` Content-Type check only gates decoding — the outbound request (the SSRF) always fires.

This was the one user-controlled fetch site missed by the project's existing SSRF-hardening (`common/ssrf_guard.py`, already applied to the crawler, SearXNG, RSS connector, MCP/document APIs, and OAuth avatar download).

The fix validates and DNS-pins every hop with `common.ssrf_guard.assert_url_is_safe` before connecting, and follows redirects manually so each redirect target is re-validated (closing the DNS-rebinding / redirect-bypass window), mirroring `common/data_source/rss_connector.py`. Blocked URLs are skipped and logged like any other unreachable image, so legitimate public images are unaffected.

Adds a regression test at `test/unit_test/rag/app/test_markdown_image_ssrf.py`.

Closes #15437

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

---------

Co-authored-by: Ubuntu <ubuntu@ubuntu-2204.linuxvmimages.local>
Co-authored-by: galuis116 <galuis116@users.noreply.github.com>
2026-06-29 15:31:05 +08:00 · 2026-06-16 03:54:55 -07:00
parent abca767103
commit 6bfaa3f21e
2 changed files with 182 additions and 3 deletions
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -643,6 +643,11 @@ class Pdf(PdfParser):
            return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls


+# Maximum number of HTTP redirects followed when fetching a remote image
+# referenced by a markdown document (each hop is SSRF-validated).
+MAX_IMAGE_REDIRECTS = 5
+
+
 class Markdown(MarkdownParser):
    def md_to_html(self, sections):
        if not sections:
@@ -714,6 +719,9 @@ class Markdown(MarkdownParser):
    def load_images_from_urls(self, urls, cache=None):
        import requests
        from pathlib import Path
+        from urllib.parse import urljoin
+
+        from common.ssrf_guard import assert_url_is_safe, pin_dns

        cache = cache or {}
        images = []
@@ -725,9 +733,40 @@ class Markdown(MarkdownParser):
            img_obj = None
            try:
                if url.startswith(("http://", "https://")):
-                    response = requests.get(url, stream=True, timeout=30)
-                    if response.status_code == 200 and response.headers.get("Content-Type", "").startswith("image/"):
-                        img_obj = Image.open(BytesIO(response.content)).convert("RGB")
+                    # SSRF guard: image references come from the (untrusted) uploaded
+                    # document, so validate and DNS-pin every hop before connecting.
+                    # Otherwise a markdown image like ![x](http://169.254.169.254/...)
+                    # would make the server fetch internal services / cloud metadata.
+                    # Redirects are followed manually so each hop is re-validated,
+                    # mirroring common/data_source/rss_connector.py.
+                    current_hostname, current_ip = assert_url_is_safe(url)
+                    current_url = url
+                    response = None
+                    try:
+                        for _ in range(MAX_IMAGE_REDIRECTS + 1):
+                            # Release the previous hop before opening the next: with
+                            # stream=True the connection isn't returned to the pool
+                            # until the body is read or the response is closed.
+                            if response is not None:
+                                response.close()
+                            with pin_dns(current_hostname, current_ip):
+                                response = requests.get(current_url, stream=True, timeout=30, allow_redirects=False)
+                            if response.status_code not in (301, 302, 303, 307, 308):
+                                break
+                            location = response.headers.get("Location")
+                            if not location:
+                                break
+                            current_url = urljoin(current_url, location)
+                            current_hostname, current_ip = assert_url_is_safe(current_url)
+                        else:
+                            raise ValueError(f"Exceeded {MAX_IMAGE_REDIRECTS} redirects fetching {url!r}")
+                        if response.status_code == 200 and response.headers.get("Content-Type", "").startswith("image/"):
+                            img_obj = Image.open(BytesIO(response.content)).convert("RGB")
+                    finally:
+                        # Always release the final/streamed response, including the
+                        # non-image and redirect-cap paths where the body is unread.
+                        if response is not None:
+                            response.close()
                else:
                    local_path = Path(url)
                    if local_path.exists():