diff --git a/rag/app/manual.py b/rag/app/manual.py index 7e6eaf2d7e..cb946d49ac 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -267,6 +267,11 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0)) if table_ctx or image_ctx: attach_media_context(res, table_ctx, image_ctx) + if res and pdf_parser and getattr(pdf_parser, "outlines", None): + res[0]["__outline__"] = [ + {"title": title, "depth": depth} + for title, depth in pdf_parser.outlines + ] return res elif re.search(r"\.docx?$", filename, re.IGNORECASE): diff --git a/rag/app/naive.py b/rag/app/naive.py index b022ec17c2..9218c20c1e 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -1127,6 +1127,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca res.extend(url_res) # if table_context_size or image_context_size: # attach_media_context(res, table_context_size, image_context_size) + + # Attach PDF outline as transient metadata on the first chunk. + # task_executor.py will extract and persist it as document metadata. + if res and pdf_parser and getattr(pdf_parser, "outlines", None): + res[0]["__outline__"] = [ + {"title": title, "depth": depth} + for title, depth in pdf_parser.outlines + ] + return res diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py index 5f8305176c..94ad77a0b2 100644 --- a/rag/svr/task_executor.py +++ b/rag/svr/task_executor.py @@ -290,6 +290,19 @@ async def build_chunks(task, progress_callback): logging.exception("Chunking {}/{} got exception".format(task["location"], task["name"])) raise + # Extract and persist PDF outline if the parser attached it. + if cks and cks[0].get("__outline__"): + outline = cks[0].pop("__outline__") + try: + DocMetadataService.update_document_metadata( + task["doc_id"], + update_metadata_to({"outline": outline}, + DocMetadataService.get_document_metadata(task["doc_id"]) or {}) + ) + logging.info("Persisted PDF outline (%d entries) for doc %s", len(outline), task["doc_id"]) + except Exception as e: + logging.warning("Failed to persist PDF outline for doc %s: %s", task["doc_id"], e) + docs = [] doc = { "doc_id": task["doc_id"],