mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
Feat: Refact pipeline (#13826)
### What problem does this PR solve?

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring

---------

Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -14,6 +14,10 @@
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
from io import BytesIO
|
||||
|
||||
from pypdf import PdfReader as pdf2_read
|
||||
|
||||
from rag.nlp import find_codec
|
||||
|
||||
|
||||
@@ -30,3 +34,21 @@ def get_text(fnm: str, binary=None) -> str:
|
||||
break
|
||||
txt += line
|
||||
return txt
|
||||
|
||||
|
||||
def extract_pdf_outlines(source):
    """Return a PDF's outline (bookmarks) as a flat list in document order.

    Args:
        source: Either a filesystem path (``str``) or the raw PDF content
            as bytes; anything that is not a ``str`` is wrapped in
            ``BytesIO`` before being handed to the PDF reader.

    Returns:
        A list of ``(title, depth, page_number)`` tuples. ``depth`` is 0 for
        top-level entries and increases by one for every nested list in the
        reader's outline; ``page_number`` is 1-based. Entries whose title or
        destination page cannot be resolved are skipped. An empty list is
        returned when the PDF cannot be opened or its outline cannot be read.
    """
    import logging

    try:
        with pdf2_read(source if isinstance(source, str) else BytesIO(source)) as pdf:
            outlines = []

            def dfs(nodes, depth):
                for node in nodes:
                    if isinstance(node, list):
                        # A nested list holds the children of the preceding entry.
                        dfs(node, depth + 1)
                        continue
                    # Resolve each entry on its own so that a single malformed
                    # bookmark does not discard the rest of the outline.
                    try:
                        title = node["/Title"]
                        page_index = pdf.get_destination_page_number(node)
                    except Exception:
                        logging.debug("Skipping unreadable PDF outline entry", exc_info=True)
                        continue
                    if page_index is None:
                        # The destination does not point at a page of this
                        # document; there is no page number to report.
                        continue
                    outlines.append((title, depth, page_index + 1))

            dfs(pdf.outline, 0)
            return outlines
    except Exception:
        # Best-effort: outline extraction must never break the caller.
        logging.warning("Failed to extract PDF outlines", exc_info=True)
        return []
|
||||
|
||||
Reference in New Issue
Block a user