From 0d87cecae2e47b3f9b46836d2c3d06b97f082f4d Mon Sep 17 00:00:00 2001
From: yuch85 <yuch@live.com.sg>
Date: Mon, 27 Apr 2026 11:57:06 +0800
Subject: [PATCH] feat: persist PDF bookmark outline as document metadata
 (#13287)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

PDF files often contain a bookmark/outline tree (table of contents built
into the file by the authoring tool). RAGFlow's `pdf_parser.outlines`
already extracts these `(title, depth)` tuples via pypdf, but they are
used ephemerally during chunking (`manual` parser uses them for
hierarchy detection) and then discarded.

This PR persists the outline as `doc.meta_fields["outline"]` — a JSON
array of `{"title": str, "depth": int}` objects — so downstream features
can use the structural information.

### Why this matters

- **Complementary to `toc_extraction`** — the existing `toc_extraction`
feature uses LLM calls to generate a TOC and only works for the `naive`
parser. The raw PDF outline is free (already extracted by pypdf), works
for all parsers, and captures the author's original document structure.
- **Document navigation** — frontends can render a clickable TOC from
the outline
- **Entity extraction** — the outline provides a structural map for
identifying document sections and key topics
- **Search result context** — knowing which section a chunk belongs to
helps users evaluate relevance

### Changes

| File | Change | LOC |
|------|--------|-----|
| `rag/app/naive.py` | Attach `pdf_parser.outlines` as `__outline__` on
first chunk dict | ~7 |
| `rag/app/manual.py` | Same for the manual parser | ~5 |
| `rag/svr/task_executor.py` | Extract `__outline__`, persist via
`DocMetadataService.update_document_metadata()` | ~12 |

### Design decisions

- **Transient key pattern**: The outline is passed from parser →
task_executor via `__outline__` on the first chunk dict, then removed
before indexing. This follows the same pattern as `metadata_obj` for
LLM-generated metadata.
- **No schema changes**: Uses the existing `meta_fields` JSON column on
the document table.
- **Graceful degradation**: If a PDF has no outline (common for scanned
docs), nothing is stored. If persistence fails, it logs a warning and
continues — parsing is not interrupted.

### Backward compatibility

- **Fully backward compatible** — no existing fields, behavior, or
schemas changed
- PDFs without outlines are unaffected
- Existing `meta_fields` data is preserved (merged, not overwritten)

## Test plan

- [ ] Parse a PDF with bookmarks (e.g. any multi-chapter document),
verify `meta_fields["outline"]` is populated
- [ ] Parse a PDF without bookmarks, verify no errors and no outline key
in meta_fields
- [ ] Verify existing `meta_fields` data is preserved (not overwritten)
when outline is added
- [ ] Verify `manual` parser also persists outlines
- [ ] Verify outline JSON structure: `[{"title": "Chapter 1", "depth":
0}, ...]`

Related: #9921 (Deterministic Document Access Layer)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: yuch85 <yuch85.1@gmail.com>
Co-authored-by: Wang Qi <wangq8@outlook.com>
---
 rag/app/manual.py        |  5 +++++
 rag/app/naive.py         |  9 +++++++++
 rag/svr/task_executor.py | 13 +++++++++++++
 3 files changed, 27 insertions(+)

diff --git a/rag/app/manual.py b/rag/app/manual.py
index 7e6eaf2d7e..cb946d49ac 100644
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@@ -267,6 +267,11 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
         image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0))
         if table_ctx or image_ctx:
             attach_media_context(res, table_ctx, image_ctx)
+        if res and pdf_parser and getattr(pdf_parser, "outlines", None):
+            res[0]["__outline__"] = [
+                {"title": title, "depth": depth}
+                for title, depth in pdf_parser.outlines
+            ]
         return res
 
     elif re.search(r"\.docx?$", filename, re.IGNORECASE):
diff --git a/rag/app/naive.py b/rag/app/naive.py
index b022ec17c2..9218c20c1e 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -1127,6 +1127,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
         res.extend(url_res)
     # if table_context_size or image_context_size:
     #    attach_media_context(res, table_context_size, image_context_size)
+
+    # Attach PDF outline as transient metadata on the first chunk.
+    # task_executor.py will extract and persist it as document metadata.
+    if res and pdf_parser and getattr(pdf_parser, "outlines", None):
+        res[0]["__outline__"] = [
+            {"title": title, "depth": depth}
+            for title, depth in pdf_parser.outlines
+        ]
+
     return res
 
 
diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py
index 5f8305176c..94ad77a0b2 100644
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@@ -290,6 +290,19 @@ async def build_chunks(task, progress_callback):
         logging.exception("Chunking {}/{} got exception".format(task["location"], task["name"]))
         raise
 
+    # Extract and persist PDF outline if the parser attached it.
+    if cks and cks[0].get("__outline__"):
+        outline = cks[0].pop("__outline__")
+        try:
+            DocMetadataService.update_document_metadata(
+                task["doc_id"],
+                update_metadata_to({"outline": outline},
+                                   DocMetadataService.get_document_metadata(task["doc_id"]) or {})
+            )
+            logging.info("Persisted PDF outline (%d entries) for doc %s", len(outline), task["doc_id"])
+        except Exception as e:
+            logging.warning("Failed to persist PDF outline for doc %s: %s", task["doc_id"], e)
+
     docs = []
     doc = {
         "doc_id": task["doc_id"],