From fe46244d30c569ef16ac7a0f913571b97f4a97a3 Mon Sep 17 00:00:00 2001
From: euvre <93761161+euvre@users.noreply.github.com>
Date: Tue, 16 Jun 2026 18:33:53 -0700
Subject: [PATCH] fix: paginate non-DeepDOC PDF parsing tasks to prevent OOM
 (#16106)

The parser pods suffer from OOM kills when processing large PDF
documents. The root cause is in api/db/services/task_service.py: when
layout_recognize is not DeepDOC (e.g. Plain Text), page_size was set to
MAXIMUM_TASK_PAGE_NUMBER (100 million), causing the entire PDF to be
processed as a single task with all pages loaded into memory
simultaneously.

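This PR fixes the issue by paginating non-DeepDOC PDF parsing tasks the
same way DeepDOC tasks are already paginated. The "one" and
"knowledge_graph" parsers, and documents with toc_extraction enabled,
still run as a single task.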
---
 api/db/services/task_service.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/api/db/services/task_service.py b/api/db/services/task_service.py
index 10ff8c7c29..fb765a9741 100644
--- a/api/db/services/task_service.py
+++ b/api/db/services/task_service.py
@@ -390,14 +390,13 @@ def queue_tasks(doc: dict, bucket: str, name: str, priority: int):
 
     if doc["type"] == FileType.PDF.value:
         file_bin = settings.STORAGE_IMPL.get(bucket, name)
-        do_layout = doc["parser_config"].get("layout_recognize", "DeepDOC")
         pages = PdfParser.total_page_number(doc["name"], file_bin)
         if pages is None:
             pages = 0
         page_size = doc["parser_config"].get("task_page_size") or 12
         if doc["parser_id"] == "paper":
             page_size = doc["parser_config"].get("task_page_size") or 22
-        if doc["parser_id"] in ["one", "knowledge_graph"] or do_layout != "DeepDOC" or doc["parser_config"].get("toc_extraction", False):
+        if doc["parser_id"] in ["one", "knowledge_graph"] or doc["parser_config"].get("toc_extraction", False):
             page_size = MAXIMUM_TASK_PAGE_NUMBER
         page_ranges = doc["parser_config"].get("pages") or [(1, MAXIMUM_PAGE_NUMBER)]
         for s, e in page_ranges: