From 0d87cecae2e47b3f9b46836d2c3d06b97f082f4d Mon Sep 17 00:00:00 2001 From: yuch85 Date: Mon, 27 Apr 2026 11:57:06 +0800 Subject: [PATCH] feat: persist PDF bookmark outline as document metadata (#13287) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary PDF files often contain a bookmark/outline tree (table of contents built into the file by the authoring tool). RAGFlow's `pdf_parser.outlines` already extracts these `(title, depth)` tuples via pypdf, but they are used ephemerally during chunking (`manual` parser uses them for hierarchy detection) and then discarded. This PR persists the outline as `doc.meta_fields["outline"]` — a JSON array of `{"title": str, "depth": int}` objects — so downstream features can use the structural information. ### Why this matters - **Complementary to `toc_extraction`** — the existing `toc_extraction` feature uses LLM calls to generate a TOC and only works for the `naive` parser. The raw PDF outline is free (already extracted by pypdf), is not tied to any one parser (this PR wires it up for the `naive` and `manual` parsers), and captures the author's original document structure. - **Document navigation** — frontends can render a clickable TOC from the outline - **Entity extraction** — the outline provides a structural map for identifying document sections and key topics - **Search result context** — knowing which section a chunk belongs to helps users evaluate relevance ### Changes | File | Change | LOC | |------|--------|-----| | `rag/app/naive.py` | Attach `pdf_parser.outlines` as `__outline__` on the first chunk dict | ~7 | | `rag/app/manual.py` | Same for the manual parser | ~5 | | `rag/svr/task_executor.py` | Extract `__outline__`, persist via `DocMetadataService.update_document_metadata()` | ~12 | ### Design decisions - **Transient key pattern**: The outline is passed from parser → task_executor via `__outline__` on the first chunk dict, then removed before indexing. This follows the same pattern as `metadata_obj` for LLM-generated metadata. 
- **No schema changes**: Uses the existing `meta_fields` JSON column on the document table. - **Graceful degradation**: If a PDF has no outline (common for scanned docs), nothing is stored. If persistence fails, it logs a warning and continues — parsing is not interrupted. ### Backward compatibility - **Fully backward compatible** — no existing fields, behavior, or schemas changed - PDFs without outlines are unaffected - Existing `meta_fields` data is preserved (merged, not overwritten) ## Test plan - [ ] Parse a PDF with bookmarks (e.g. any multi-chapter document), verify `meta_fields["outline"]` is populated - [ ] Parse a PDF without bookmarks, verify no errors and no outline key in meta_fields - [ ] Verify existing `meta_fields` data is preserved (not overwritten) when outline is added - [ ] Verify `manual` parser also persists outlines - [ ] Verify outline JSON structure: `[{"title": "Chapter 1", "depth": 0}, ...]` Related: #9921 (Deterministic Document Access Layer) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: yuch85 Co-authored-by: Wang Qi --- rag/app/manual.py | 5 +++++ rag/app/naive.py | 9 +++++++++ rag/svr/task_executor.py | 13 +++++++++++++ 3 files changed, 27 insertions(+) diff --git a/rag/app/manual.py b/rag/app/manual.py index 7e6eaf2d7e..cb946d49ac 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -267,6 +267,11 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca image_ctx = max(0, int(parser_config.get("image_context_size", 0) or 0)) if table_ctx or image_ctx: attach_media_context(res, table_ctx, image_ctx) + if res and pdf_parser and getattr(pdf_parser, "outlines", None): + res[0]["__outline__"] = [ + {"title": title, "depth": depth} + for title, depth in pdf_parser.outlines + ] return res elif re.search(r"\.docx?$", filename, re.IGNORECASE): diff --git a/rag/app/naive.py b/rag/app/naive.py index b022ec17c2..9218c20c1e 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ 
-1127,6 +1127,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca res.extend(url_res) # if table_context_size or image_context_size: # attach_media_context(res, table_context_size, image_context_size) + + # Attach PDF outline as transient metadata on the first chunk. + # task_executor.py will extract and persist it as document metadata. + if res and pdf_parser and getattr(pdf_parser, "outlines", None): + res[0]["__outline__"] = [ + {"title": title, "depth": depth} + for title, depth in pdf_parser.outlines + ] + return res diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py index 5f8305176c..94ad77a0b2 100644 --- a/rag/svr/task_executor.py +++ b/rag/svr/task_executor.py @@ -290,6 +290,19 @@ async def build_chunks(task, progress_callback): logging.exception("Chunking {}/{} got exception".format(task["location"], task["name"])) raise + # Extract and persist PDF outline if the parser attached it. + if cks and cks[0].get("__outline__"): + outline = cks[0].pop("__outline__") + try: + DocMetadataService.update_document_metadata( + task["doc_id"], + update_metadata_to({"outline": outline}, + DocMetadataService.get_document_metadata(task["doc_id"]) or {}) + ) + logging.info("Persisted PDF outline (%d entries) for doc %s", len(outline), task["doc_id"]) + except Exception as e: + logging.warning("Failed to persist PDF outline for doc %s: %s", task["doc_id"], e) + docs = [] doc = { "doc_id": task["doc_id"],