2024-05-31 09:53:04 +08:00
|
|
|
|
#
|
|
|
|
|
|
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
|
|
|
|
|
|
#
|
|
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
|
|
#
|
|
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
|
|
#
|
|
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
|
|
# limitations under the License.
|
|
|
|
|
|
#
|
2025-10-10 17:07:55 +08:00
|
|
|
|
import json
|
2024-11-14 17:13:48 +08:00
|
|
|
|
import logging
|
2023-12-25 19:05:59 +08:00
|
|
|
|
import re
|
2025-04-22 15:16:04 +08:00
|
|
|
|
import math
|
2025-12-01 14:03:09 +08:00
|
|
|
|
from collections import OrderedDict, defaultdict
|
2023-12-25 19:05:59 +08:00
|
|
|
|
from dataclasses import dataclass
|
2024-01-17 09:39:50 +08:00
|
|
|
|
|
2024-11-12 14:59:41 +08:00
|
|
|
|
from rag.nlp import rag_tokenizer, query
|
2023-12-25 19:05:59 +08:00
|
|
|
|
import numpy as np
|
2025-12-25 21:18:13 +08:00
|
|
|
|
from common.doc_store.doc_store_base import MatchDenseExpr, FusionExpr, OrderByExpr, DocStoreConnection
|
2025-10-28 09:46:32 +08:00
|
|
|
|
from common.string_utils import remove_redundant_spaces
|
|
|
|
|
|
from common.float_utils import get_float
|
2025-11-06 09:36:38 +08:00
|
|
|
|
from common.constants import PAGERANK_FLD, TAG_FLD
|
2026-04-15 16:31:11 +08:00
|
|
|
|
from common.tag_feature_utils import parse_tag_features
|
2025-11-26 11:06:37 +08:00
|
|
|
|
from common import settings
|
2023-12-25 19:05:59 +08:00
|
|
|
|
|
2026-01-20 13:29:37 +08:00
|
|
|
|
from common.misc_utils import thread_pool_exec
|
2023-12-28 13:50:13 +08:00
|
|
|
|
|
2024-01-15 19:47:25 +08:00
|
|
|
|
def index_name(uid):
    """Return the document-store index name for ``uid``.

    The name is the literal prefix ``ragflow_`` followed by the string form
    of ``uid`` (callers in this file pass tenant ids).
    """
    name = f"ragflow_{uid}"
    return name
|
2023-12-28 13:50:13 +08:00
|
|
|
|
|
2023-12-26 19:32:06 +08:00
|
|
|
|
|
2023-12-25 19:05:59 +08:00
|
|
|
|
class Dealer:
|
2024-11-12 14:59:41 +08:00
|
|
|
|
def __init__(self, dataStore: DocStoreConnection):
    """Bind this dealer to a document store.

    Args:
        dataStore: connection object used by the other methods for
            search, totals, highlights, aggregations and field lookup.
    """
    self.dataStore = dataStore
    # Query helper used for question parsing and similarity scoring.
    self.qryr = query.FulltextQueryer()
|
2023-12-25 19:05:59 +08:00
|
|
|
|
|
|
|
|
|
|
@dataclass
class SearchResult:
    """Result bundle produced by ``search``."""
    # Hit count as reported by ``dataStore.get_total``.
    total: int
    # Chunk ids as returned by ``dataStore.get_doc_ids``.
    ids: list[str]
    # Embedding of the question; ``search`` passes an empty list when no
    # embedding model was used.
    query_vector: list[float] | None = None
    # Per-chunk field values from ``dataStore.get_fields`` (indexed by chunk id
    # elsewhere in this class).
    field: dict | None = None
    # Output of ``dataStore.get_highlight``.
    highlight: dict | None = None
    # Output of ``dataStore.get_aggregation`` over ``docnm_kwd``.
    aggregation: list | dict | None = None
    # Question keywords plus their fine-grained tokens.
    keywords: list[str] | None = None
    # Not populated by ``search`` in the visible code.
    group_docs: list[list] | None = None
|
2023-12-25 19:05:59 +08:00
|
|
|
|
|
2026-01-15 12:28:49 +08:00
|
|
|
|
async def get_vector(self, txt, emb_mdl, topk=10, similarity=0.1):
    """Embed ``txt`` and wrap the vector in a dense-match expression.

    Args:
        txt: text handed to ``emb_mdl.encode_queries``.
        emb_mdl: embedding model; its ``encode_queries`` is run through
            ``thread_pool_exec`` and must return a pair whose first item
            is the vector.
        topk: passed through to ``MatchDenseExpr``.
        similarity: stored under the ``"similarity"`` extra option.

    Returns:
        A ``MatchDenseExpr`` targeting the column ``q_<dim>_vec``.

    Raises:
        Exception: if the encoded result has more than one dimension.
    """
    encoded, _ = await thread_pool_exec(emb_mdl.encode_queries, txt)

    dims = np.array(encoded).shape
    if len(dims) > 1:
        raise Exception(
            f"Dealer.get_vector returned array's shape {dims} doesn't match expectation(exact one dimension).")

    floats = list(map(get_float, encoded))
    # The column name encodes the embedding dimension.
    column = f"q_{len(floats)}_vec"
    extra = {"similarity": similarity}
    return MatchDenseExpr(column, floats, 'float', 'cosine', topk, extra)
|
|
|
|
|
|
|
|
|
|
|
|
def get_filters(self, req):
    """Build the filter condition dict from a search request.

    ``kb_ids`` and ``doc_ids`` are renamed to ``kb_id`` and ``doc_id``;
    the remaining recognised keys are copied under their own name. A key
    is only copied when it is present in ``req`` and its value is not
    ``None``.
    """
    # Request key -> condition field. Insertion order is the output order.
    field_of = {"kb_ids": "kb_id", "doc_ids": "doc_id"}
    # TODO(yzc): `available_int` is nullable however infinity doesn't support nullable columns.
    passthrough = ("knowledge_graph_kwd", "available_int", "entity_kwd", "from_entity_kwd", "to_entity_kwd",
                   "removed_kwd")
    field_of.update((name, name) for name in passthrough)

    return {
        field: req[key]
        for key, field in field_of.items()
        if key in req and req[key] is not None
    }
|
|
|
|
|
|
|
2026-01-15 12:28:49 +08:00
|
|
|
|
async def search(self, req, idx_names: str | list[str],
                 kb_ids: list[str],
                 emb_mdl=None,
                 highlight: bool | list | None = None,
                 rank_feature: dict | None = None
                 ):
    """Run one search against the document store and package the result.

    Args:
        req: request mapping. Keys read here: ``page`` (1-based, default 1),
            ``topk`` (default 1024), ``size`` (default ``topk``), ``fields``,
            ``question``, ``sort``, ``similarity`` (default 0.1), plus the
            filter keys consumed by ``get_filters``.
        idx_names: index name(s) forwarded to the data store.
        kb_ids: knowledge-base ids forwarded to the data store.
        emb_mdl: embedding model; when ``None`` only the text match is used,
            otherwise text and dense matches are fused.
        highlight: falsy for no highlight fields, a list to use exactly those
            fields, any other truthy value for ``content_ltks``/``title_tks``.
        rank_feature: forwarded to the data store on question searches.

    Returns:
        ``self.SearchResult`` built from the final data-store response.
    """
    if highlight is None:
        highlight = False

    filters = self.get_filters(req)
    orderBy = OrderByExpr()

    pg = int(req.get("page", 1)) - 1
    topk = int(req.get("topk", 1024))
    ps = int(req.get("size", topk))
    offset, limit = pg * ps, ps

    # Copy the field list: the vector column may be appended below and that
    # must not leak into a caller-supplied ``req["fields"]``.
    src = list(req.get("fields",
                       ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int",
                        "doc_id", "chunk_order_int", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd",
                        "question_kwd", "question_tks", "doc_type_kwd",
                        "available_int", "content_with_weight", "mom_id", PAGERANK_FLD, TAG_FLD, "row_id()"]))
    kwds = set()

    qst = req.get("question", "")
    q_vec = []
    if not qst:
        if req.get("sort"):
            orderBy.asc("chunk_order_int")
            orderBy.asc("page_num_int")
            orderBy.asc("top_int")
            orderBy.desc("create_timestamp_flt")
        # Run the blocking store call off the event loop, like every other
        # branch of this coroutine does.
        res = await thread_pool_exec(self.dataStore.search, src, [], filters, [], orderBy, offset, limit,
                                     idx_names, kb_ids)
        total = self.dataStore.get_total(res)
        logging.debug("Dealer.search TOTAL: {}".format(total))
    else:
        highlightFields = ["content_ltks", "title_tks"]
        if not highlight:
            highlightFields = []
        elif isinstance(highlight, list):
            highlightFields = highlight
        matchText, keywords = self.qryr.question(qst, min_match=0.3)
        if emb_mdl is None:
            matchExprs = [matchText]
            res = await thread_pool_exec(self.dataStore.search, src, highlightFields, filters, matchExprs, orderBy, offset, limit,
                                         idx_names, kb_ids, rank_feature=rank_feature)
            total = self.dataStore.get_total(res)
            logging.debug("Dealer.search TOTAL: {}".format(total))
        else:
            matchDense = await self.get_vector(qst, emb_mdl, topk, req.get("similarity", 0.1))
            q_vec = matchDense.embedding_data
            if not settings.DOC_ENGINE_INFINITY:
                # Also fetch the stored vector column for this dimension.
                src.append(f"q_{len(q_vec)}_vec")

            fusionExpr = FusionExpr("weighted_sum", topk, {"weights": "0.05,0.95"})
            matchExprs = [matchText, matchDense, fusionExpr]

            res = await thread_pool_exec(self.dataStore.search, src, highlightFields, filters, matchExprs, orderBy, offset, limit,
                                         idx_names, kb_ids, rank_feature=rank_feature)
            total = self.dataStore.get_total(res)
            logging.debug("Dealer.search TOTAL: {}".format(total))

            # If result is empty, try again with lower min_match
            if total == 0:
                if filters.get("doc_id"):
                    # Restricted to specific documents: fall back to a plain
                    # filtered listing without any match expression.
                    res = await thread_pool_exec(self.dataStore.search, src, [], filters, [], orderBy, offset, limit, idx_names, kb_ids)
                    total = self.dataStore.get_total(res)
                else:
                    matchText, _ = self.qryr.question(qst, min_match=0.1)
                    matchDense.extra_options["similarity"] = 0.17
                    res = await thread_pool_exec(self.dataStore.search, src, highlightFields, filters, [matchText, matchDense, fusionExpr],
                                                 orderBy, offset, limit, idx_names, kb_ids,
                                                 rank_feature=rank_feature)
                    total = self.dataStore.get_total(res)
                logging.debug("Dealer.search 2 TOTAL: {}".format(total))

        # Collect the keywords plus their fine-grained tokens of length >= 2.
        for k in keywords:
            kwds.add(k)
            for kk in rag_tokenizer.fine_grained_tokenize(k).split():
                if len(kk) < 2:
                    continue
                kwds.add(kk)

    logging.debug(f"TOTAL: {total}")
    ids = self.dataStore.get_doc_ids(res)
    keywords = list(kwds)
    highlight = self.dataStore.get_highlight(res, keywords, "content_with_weight")
    aggs = self.dataStore.get_aggregation(res, "docnm_kwd")
    return self.SearchResult(
        total=total,
        ids=ids,
        query_vector=q_vec,
        aggregation=aggs,
        highlight=highlight,
        field=self.dataStore.get_fields(res, src + ["_score"]),
        keywords=keywords
    )
|
|
|
|
|
|
|
|
|
|
|
|
@staticmethod
def trans2floats(txt):
    """Split ``txt`` on tab characters and convert every field with ``get_float``."""
    fields = txt.split("\t")
    return list(map(get_float, fields))
|
2023-12-25 19:05:59 +08:00
|
|
|
|
|
2024-01-23 19:45:36 +08:00
|
|
|
|
def insert_citations(self, answer, chunks, chunk_v,
                     embd_mdl, tkweight=0.1, vtweight=0.9):
    """Append ``[ID:n]`` citation markers to the sentences of ``answer``.

    The answer is split into pieces (fenced code blocks are kept whole),
    pieces of at least 5 characters are embedded with ``embd_mdl.encode`` and
    compared to the chunks with ``self.qryr.hybrid_similarity``. A piece is
    cited when its best similarity reaches the current threshold; the
    threshold starts at 0.63 and is lowered by 20% per pass while it stays
    above 0.3 and nothing has been cited yet.

    Args:
        answer: generated answer text.
        chunks: chunk texts; must have the same length as ``chunk_v``.
        chunk_v: chunk vectors. Entries whose length differs from the answer
            embedding are replaced in place by zero vectors.
        embd_mdl: model whose ``encode`` returns a pair with the vectors first.
        tkweight: token-similarity weight passed to ``hybrid_similarity``.
        vtweight: vector-similarity weight passed to ``hybrid_similarity``.

    Returns:
        ``(annotated_answer, cited)`` where ``cited`` is the set of chunk
        indices (as strings) that were inserted.

    Raises:
        AssertionError: if ``chunks`` and ``chunk_v`` differ in length.
    """
    assert len(chunks) == len(chunk_v)
    if not chunks:
        return answer, set([])

    # Sentence boundary regex includes Arabic punctuation (، ؛ ؟ ۔)
    # NOTE(review): the doubled `!` in the first character class looks like a
    # full-width mark that was lost in transcription - confirm upstream.
    boundary = re.compile(r"([^\|][;。?!!،؛؟۔\n]|[a-z\u0600-\u06FF][.?;!،؛؟][ \n])")

    pieces = re.split(r"(```)", answer)
    if len(pieces) >= 3:
        i = 0
        pieces_ = []
        while i < len(pieces):
            if pieces[i] == "```":
                # Keep a fenced block (opening fence .. closing fence) as one piece.
                st = i
                i += 1
                while i < len(pieces) and pieces[i] != "```":
                    i += 1
                if i < len(pieces):
                    i += 1
                pieces_.append("".join(pieces[st: i]) + "\n")
            else:
                pieces_.extend(boundary.split(pieces[i]))
                i += 1
        pieces = pieces_
    else:
        pieces = boundary.split(answer)

    # A captured delimiter starts with the last character of the preceding
    # sentence; hand that character back to the previous piece.
    for i in range(1, len(pieces)):
        if boundary.match(pieces[i]):
            pieces[i - 1] += pieces[i][0]
            pieces[i] = pieces[i][1:]

    # Only pieces with at least 5 characters are candidates for a citation;
    # idx maps a candidate back to its position in ``pieces``.
    idx = []
    pieces_ = []
    for i, t in enumerate(pieces):
        if len(t) < 5:
            continue
        idx.append(i)
        pieces_.append(t)
    logging.debug("{} => {}".format(answer, pieces_))
    if not pieces_:
        return answer, set([])

    ans_v, _ = embd_mdl.encode(pieces_)
    dim = len(ans_v[0])
    for i, vec in enumerate(chunk_v):
        found = len(vec)
        if found != dim:
            # Log before overwriting so the message reports the real mismatch.
            logging.warning(
                "The dimension of query and chunk do not match: {} vs. {}".format(dim, found))
            chunk_v[i] = [0.0] * dim

    chunks_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(ck)).split()
                  for ck in chunks]
    # Tokenize every candidate once instead of once per threshold pass.
    pieces_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(p)).split()
                  for p in pieces_]
    cites = {}
    thr = 0.63
    while thr > 0.3 and not cites:
        for i, piece in enumerate(pieces_):
            sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
                                                            chunk_v,
                                                            pieces_tks[i],
                                                            chunks_tks,
                                                            tkweight, vtweight)
            mx = np.max(sim) * 0.99
            logging.debug("{} SIM: {}".format(piece, mx))
            if mx < thr:
                continue
            # Up to four chunks within 1% of the best score, in chunk order so
            # the choice does not depend on string hashing.
            cites[idx[i]] = [str(ii) for ii in range(len(chunk_v)) if sim[ii] > mx][:4]
        thr *= 0.8

    out = []
    seted = set([])
    for i, p in enumerate(pieces):
        out.append(p)
        for c in cites.get(i, ()):
            # Each chunk is cited at most once in the whole answer.
            if c in seted:
                continue
            out.append(f" [ID:{c}]")
            seted.add(c)

    return "".join(out), seted
|
2023-12-25 19:05:59 +08:00
|
|
|
|
|
2025-01-09 17:07:21 +08:00
|
|
|
|
def _rank_feature_scores(self, query_rfea, search_res):
|
|
|
|
|
|
## For rank feature(tag_fea) scores.
|
|
|
|
|
|
rank_fea = []
|
|
|
|
|
|
pageranks = []
|
|
|
|
|
|
for chunk_id in search_res.ids:
|
|
|
|
|
|
pageranks.append(search_res.field[chunk_id].get(PAGERANK_FLD, 0))
|
|
|
|
|
|
pageranks = np.array(pageranks, dtype=float)
|
|
|
|
|
|
|
|
|
|
|
|
if not query_rfea:
|
|
|
|
|
|
return np.array([0 for _ in range(len(search_res.ids))]) + pageranks
|
|
|
|
|
|
|
2025-12-29 12:01:18 +08:00
|
|
|
|
q_denor = np.sqrt(np.sum([s * s for t, s in query_rfea.items() if t != PAGERANK_FLD]))
|
2026-04-15 16:31:11 +08:00
|
|
|
|
if q_denor == 0:
|
|
|
|
|
|
return np.array([0 for _ in range(len(search_res.ids))]) + pageranks
|
2025-01-09 17:07:21 +08:00
|
|
|
|
for i in search_res.ids:
|
|
|
|
|
|
nor, denor = 0, 0
|
2025-03-14 17:35:57 +08:00
|
|
|
|
if not search_res.field[i].get(TAG_FLD):
|
2025-03-18 16:07:29 +08:00
|
|
|
|
rank_fea.append(0)
|
2025-03-14 17:35:57 +08:00
|
|
|
|
continue
|
2026-04-15 16:31:11 +08:00
|
|
|
|
tag_feas = parse_tag_features(search_res.field[i].get(TAG_FLD), allow_json_string=True, allow_python_literal=True)
|
|
|
|
|
|
if not tag_feas:
|
|
|
|
|
|
rank_fea.append(0)
|
|
|
|
|
|
continue
|
|
|
|
|
|
for t, sc in tag_feas.items():
|
2025-01-09 17:07:21 +08:00
|
|
|
|
if t in query_rfea:
|
|
|
|
|
|
nor += query_rfea[t] * sc
|
|
|
|
|
|
denor += sc * sc
|
|
|
|
|
|
if denor == 0:
|
|
|
|
|
|
rank_fea.append(0)
|
|
|
|
|
|
else:
|
2025-12-29 12:01:18 +08:00
|
|
|
|
rank_fea.append(nor / np.sqrt(denor) / q_denor)
|
|
|
|
|
|
return np.array(rank_fea) * 10. + pageranks
|
2025-01-09 17:07:21 +08:00
|
|
|
|
|
2024-01-23 19:45:36 +08:00
|
|
|
|
def rerank(self, sres, query, tkweight=0.3,
|
2025-01-09 17:07:21 +08:00
|
|
|
|
vtweight=0.7, cfield="content_ltks",
|
|
|
|
|
|
rank_feature: dict | None = None
|
|
|
|
|
|
):
|
2024-03-20 16:56:16 +08:00
|
|
|
|
_, keywords = self.qryr.question(query)
|
2024-11-12 14:59:41 +08:00
|
|
|
|
vector_size = len(sres.query_vector)
|
|
|
|
|
|
vector_column = f"q_{vector_size}_vec"
|
|
|
|
|
|
zero_vector = [0.0] * vector_size
|
|
|
|
|
|
ins_embd = []
|
|
|
|
|
|
for chunk_id in sres.ids:
|
|
|
|
|
|
vector = sres.field[chunk_id].get(vector_column, zero_vector)
|
|
|
|
|
|
if isinstance(vector, str):
|
2025-03-18 11:13:44 +08:00
|
|
|
|
vector = [get_float(v) for v in vector.split("\t")]
|
2024-11-12 14:59:41 +08:00
|
|
|
|
ins_embd.append(vector)
|
2023-12-28 13:50:13 +08:00
|
|
|
|
if not ins_embd:
|
2024-01-22 19:51:38 +08:00
|
|
|
|
return [], [], []
|
2024-04-19 18:02:53 +08:00
|
|
|
|
|
|
|
|
|
|
for i in sres.ids:
|
|
|
|
|
|
if isinstance(sres.field[i].get("important_kwd", []), str):
|
|
|
|
|
|
sres.field[i]["important_kwd"] = [sres.field[i]["important_kwd"]]
|
|
|
|
|
|
ins_tw = []
|
|
|
|
|
|
for i in sres.ids:
|
2025-03-28 09:33:08 +08:00
|
|
|
|
content_ltks = list(OrderedDict.fromkeys(sres.field[i][cfield].split()))
|
2024-11-28 13:00:38 +08:00
|
|
|
|
title_tks = [t for t in sres.field[i].get("title_tks", "").split() if t]
|
2024-12-05 14:51:19 +08:00
|
|
|
|
question_tks = [t for t in sres.field[i].get("question_tks", "").split() if t]
|
2024-04-19 18:02:53 +08:00
|
|
|
|
important_kwd = sres.field[i].get("important_kwd", [])
|
2025-01-09 17:07:21 +08:00
|
|
|
|
tks = content_ltks + title_tks * 2 + important_kwd * 5 + question_tks * 6
|
2024-04-19 18:02:53 +08:00
|
|
|
|
ins_tw.append(tks)
|
|
|
|
|
|
|
2025-01-09 17:07:21 +08:00
|
|
|
|
## For rank feature(tag_fea) scores.
|
|
|
|
|
|
rank_fea = self._rank_feature_scores(rank_feature, sres)
|
|
|
|
|
|
|
2024-01-17 20:20:42 +08:00
|
|
|
|
sim, tksim, vtsim = self.qryr.hybrid_similarity(sres.query_vector,
|
2024-01-18 19:28:37 +08:00
|
|
|
|
ins_embd,
|
2024-03-20 16:56:16 +08:00
|
|
|
|
keywords,
|
2024-01-18 19:28:37 +08:00
|
|
|
|
ins_tw, tkweight, vtweight)
|
2024-12-03 14:30:35 +08:00
|
|
|
|
|
2025-01-09 17:07:21 +08:00
|
|
|
|
return sim + rank_fea, tksim, vtsim
|
2023-12-25 19:05:59 +08:00
|
|
|
|
|
2024-05-29 16:50:02 +08:00
|
|
|
|
def rerank_by_model(self, rerank_mdl, sres, query, tkweight=0.3,
|
2025-01-09 17:07:21 +08:00
|
|
|
|
vtweight=0.7, cfield="content_ltks",
|
|
|
|
|
|
rank_feature: dict | None = None):
|
2024-05-29 16:50:02 +08:00
|
|
|
|
_, keywords = self.qryr.question(query)
|
|
|
|
|
|
|
|
|
|
|
|
for i in sres.ids:
|
|
|
|
|
|
if isinstance(sres.field[i].get("important_kwd", []), str):
|
|
|
|
|
|
sres.field[i]["important_kwd"] = [sres.field[i]["important_kwd"]]
|
|
|
|
|
|
ins_tw = []
|
|
|
|
|
|
for i in sres.ids:
|
2024-11-28 13:00:38 +08:00
|
|
|
|
content_ltks = sres.field[i][cfield].split()
|
|
|
|
|
|
title_tks = [t for t in sres.field[i].get("title_tks", "").split() if t]
|
2024-05-29 16:50:02 +08:00
|
|
|
|
important_kwd = sres.field[i].get("important_kwd", [])
|
|
|
|
|
|
tks = content_ltks + title_tks + important_kwd
|
|
|
|
|
|
ins_tw.append(tks)
|
|
|
|
|
|
|
|
|
|
|
|
tksim = self.qryr.token_similarity(keywords, ins_tw)
|
2025-10-28 09:46:32 +08:00
|
|
|
|
vtsim, _ = rerank_mdl.similarity(query, [remove_redundant_spaces(" ".join(tks)) for tks in ins_tw])
|
2025-01-09 17:07:21 +08:00
|
|
|
|
## For rank feature(tag_fea) scores.
|
|
|
|
|
|
rank_fea = self._rank_feature_scores(rank_feature, sres)
|
2024-05-29 16:50:02 +08:00
|
|
|
|
|
2025-11-14 13:59:54 +08:00
|
|
|
|
return tkweight * np.array(tksim) + vtweight * vtsim + rank_fea, tksim, vtsim
|
2024-05-29 16:50:02 +08:00
|
|
|
|
|
2024-01-18 19:28:37 +08:00
|
|
|
|
def hybrid_similarity(self, ans_embd, ins_embd, ans, inst):
    """Tokenize both texts and delegate to ``self.qryr.hybrid_similarity``.

    Args:
        ans_embd: embedding forwarded as the first argument.
        ins_embd: embedding(s) forwarded as the second argument.
        ans: text tokenized with ``rag_tokenizer.tokenize`` and split on whitespace.
        inst: text tokenized the same way.

    Returns:
        Whatever ``self.qryr.hybrid_similarity`` returns (default weights).
    """
    ans_tokens = rag_tokenizer.tokenize(ans).split()
    inst_tokens = rag_tokenizer.tokenize(inst).split()
    return self.qryr.hybrid_similarity(ans_embd, ins_embd, ans_tokens, inst_tokens)
|
2024-01-18 19:28:37 +08:00
|
|
|
|
|
2026-01-15 12:28:49 +08:00
|
|
|
|
async def retrieval(
    self,
    question,
    embd_mdl,
    tenant_ids,
    kb_ids,
    page,
    page_size,
    similarity_threshold=0.2,
    vector_similarity_weight=0.3,
    top=1024,
    doc_ids=None,
    aggs=True,
    rerank_mdl=None,
    highlight=False,
    rank_feature: dict | None = {PAGERANK_FLD: 10},
):
    """Search, score, threshold and paginate chunks for ``question``.

    Args:
        question: Query text; an empty value short-circuits to an empty result.
        embd_mdl: Embedding model, forwarded to ``self.search``.
        tenant_ids: List of tenant ids, or one comma-separated string.
        kb_ids: Knowledge-base ids, forwarded to ``self.search``.
        page: 1-based page number (values below 1 are treated as 1).
        page_size: Number of chunks per page.
        similarity_threshold: Minimum combined score a chunk must reach.
            Ignored (treated as 0) when ``vector_similarity_weight <= 0`` or
            when ``doc_ids`` is given.
        vector_similarity_weight: Weight of the vector/rerank score; the
            term score gets ``1 - vector_similarity_weight``.
        top: Sent to the search request as ``topk``; also caps the candidate
            window when a rerank model is used.
        doc_ids: Optional document filter.
        aggs: Whether to compute per-document hit counts.
        rerank_mdl: Optional rerank model; enables ``self.rerank_by_model``.
        highlight: Whether to attach a ``highlight`` entry to each chunk.
        rank_feature: Forwarded to the search and rerank calls. The default
            dict is shared between calls; this method itself never modifies it.

    Returns:
        ``{"total": int, "chunks": list[dict], "doc_aggs": list[dict]}``.
        ``total`` counts the chunks of the fetched candidate window that
        passed the threshold. ``doc_aggs`` is always a list, including on the
        empty-question early return (it used to be an empty dict there).
    """
    ranks = {"total": 0, "chunks": [], "doc_aggs": []}
    if not question:
        return ranks

    # Keep the historical windowing strategy by default, but when an external
    # reranker is enabled cap candidate count by both top_k and provider-safe 64.
    RERANK_LIMIT = math.ceil(64 / page_size) * page_size if page_size > 1 else 1
    RERANK_LIMIT = max(30, RERANK_LIMIT)
    if rerank_mdl and top > 0:
        RERANK_LIMIT = min(RERANK_LIMIT, top, 64)
    page = max(page, 1)
    global_offset = (page - 1) * page_size
    req = {
        "kb_ids": kb_ids,
        "doc_ids": doc_ids,
        # Fetch the candidate window that contains the requested page.
        "page": global_offset // RERANK_LIMIT + 1,
        "size": RERANK_LIMIT,
        "question": question,
        "vector": True,
        "topk": top,
        "similarity": similarity_threshold,
        "available_int": 1,
    }

    if isinstance(tenant_ids, str):
        tenant_ids = tenant_ids.split(",")

    sres = await self.search(req, [index_name(tid) for tid in tenant_ids], kb_ids, embd_mdl, highlight,
                             rank_feature=rank_feature)

    if rerank_mdl and sres.total > 0:
        sim, tsim, vsim = self.rerank_by_model(
            rerank_mdl,
            sres,
            question,
            1 - vector_similarity_weight,
            vector_similarity_weight,
            rank_feature=rank_feature,
        )
    elif settings.DOC_ENGINE_INFINITY:
        # Don't need rerank here since Infinity normalizes each way score before fusion.
        sim = [sres.field[chunk_id].get("_score", 0.0) for chunk_id in sres.ids]
        sim = [s if s is not None else 0.0 for s in sim]
        tsim = sim
        vsim = sim
    else:
        # ElasticSearch doesn't normalize each way score before fusion.
        sim, tsim, vsim = self.rerank(
            sres,
            question,
            1 - vector_similarity_weight,
            vector_similarity_weight,
            rank_feature=rank_feature,
        )

    sim_np = np.array(sim, dtype=np.float64)
    if sim_np.size == 0:
        return ranks

    # Indices of the window's hits, best score first.
    sorted_idx = np.argsort(sim_np * -1)

    # When vector_similarity_weight is 0, similarity_threshold is not meaningful for term-only scores.
    post_threshold = 0.0 if vector_similarity_weight <= 0 else similarity_threshold

    # When doc_ids is explicitly provided (metadata or document filtering), bypass threshold
    # User wants those specific documents regardless of their relevance score
    if doc_ids:
        post_threshold = 0.0

    valid_idx = [int(i) for i in sorted_idx if sim_np[i] >= post_threshold]
    filtered_count = len(valid_idx)
    ranks["total"] = int(filtered_count)

    if filtered_count == 0:
        return ranks

    # Position of the requested page inside the fetched window.
    begin = global_offset % RERANK_LIMIT
    end = begin + page_size
    page_idx = valid_idx[begin:end]

    dim = len(sres.query_vector)
    vector_column = f"q_{dim}_vec"
    zero_vector = [0.0] * dim

    for i in page_idx:
        chunk_id = sres.ids[i]
        chunk = sres.field[chunk_id]
        dnm = chunk.get("docnm_kwd", "")
        did = chunk.get("doc_id", "")

        position_int = chunk.get("position_int", [])
        d = {
            "chunk_id": chunk_id,
            "content_ltks": chunk["content_ltks"],
            "content_with_weight": chunk["content_with_weight"],
            "doc_id": did,
            "docnm_kwd": dnm,
            "kb_id": chunk["kb_id"],
            "important_kwd": chunk.get("important_kwd", []),
            "tag_kwd": chunk.get("tag_kwd", []),
            "image_id": chunk.get("img_id", ""),
            "similarity": float(sim_np[i]),
            "vector_similarity": float(vsim[i]),
            "term_similarity": float(tsim[i]),
            "vector": chunk.get(vector_column, zero_vector),
            "positions": position_int,
            "doc_type_kwd": chunk.get("doc_type_kwd", ""),
            "mom_id": chunk.get("mom_id", ""),
            "row_id": chunk.get("row_id()"),
        }
        if highlight and sres.highlight:
            if chunk_id in sres.highlight:
                d["highlight"] = remove_redundant_spaces(sres.highlight[chunk_id])
            else:
                # No highlight for this chunk: fall back to its full content.
                d["highlight"] = d["content_with_weight"]
        ranks["chunks"].append(d)

    if aggs:
        # Count hits per document name over every chunk that passed the
        # threshold (not only the requested page).
        doc_counts = {}
        for i in valid_idx:
            chunk = sres.field[sres.ids[i]]
            dnm = chunk.get("docnm_kwd", "")
            if dnm not in doc_counts:
                doc_counts[dnm] = {"doc_id": chunk.get("doc_id", ""), "count": 0}
            doc_counts[dnm]["count"] += 1

        ranks["doc_aggs"] = [
            {
                "doc_name": name,
                "doc_id": info["doc_id"],
                "count": info["count"],
            }
            for name, info in sorted(
                doc_counts.items(),
                key=lambda kv: kv[1]["count"] * -1,
            )
        ]

    return ranks
|
2024-02-07 19:27:23 +08:00
|
|
|
|
|
2024-02-19 19:22:17 +08:00
|
|
|
|
def sql_retrieval(self, sql, fetch_size=128, format="json"):
    """Run ``sql`` through ``self.dataStore.sql`` and return its result as-is."""
    return self.dataStore.sql(sql, fetch_size, format)
|
|
|
|
|
|
|
2025-01-09 17:07:21 +08:00
|
|
|
|
def chunk_list(self, doc_id: str, tenant_id: str,
               kb_ids: list[str], max_count=1024,
               offset=0,
               fields=["docnm_kwd", "content_with_weight", "img_id"],
               sort_by_position: bool = False):
    """Fetch the chunks of one document in batches of 128.

    Args:
        doc_id: Document whose chunks are listed.
        tenant_id: Tenant id, turned into an index name via ``index_name``.
        kb_ids: Knowledge-base ids passed to the data store.
        max_count: Exclusive upper bound on the absolute offset that is read.
        offset: Absolute offset of the first chunk to read.
        fields: Fields to fetch. The default list is never mutated here.
        sort_by_position: When true, order by ``page_num_int``,
            ``position_int`` and ``top_int`` (ascending) and make sure those
            three fields are fetched.

    Returns:
        A list of chunk dicts, each with its chunk id stored under ``"id"``.
    """
    condition = {"doc_id": doc_id}
    order_by = OrderByExpr()

    if sort_by_position:
        position_fields = ("page_num_int", "position_int", "top_int")
        wanted = set(fields or [])
        wanted.update(position_fields)
        fields = list(wanted)
        for column in position_fields:
            order_by.asc(column)

    batch_size = 128
    collected = []
    for start in range(offset, max_count, batch_size):
        limit = min(batch_size, max_count - start)
        if limit <= 0:
            break
        es_res = self.dataStore.search(fields, [], condition, [], order_by, start, limit,
                                       index_name(tenant_id), kb_ids)
        batch = self.dataStore.get_fields(es_res, fields)
        for chunk_id, doc in batch.items():
            doc["id"] = chunk_id
        collected.extend(batch.values())
        # A short (or empty) batch means the store has nothing more to give.
        if len(batch) < limit:
            break
    return collected
|
2025-01-09 17:07:21 +08:00
|
|
|
|
|
|
|
|
|
|
def all_tags(self, tenant_id: str, kb_ids: list[str], S=1000):
    """Return the ``tag_kwd`` aggregation for the given knowledge bases.

    Args:
        tenant_id: Tenant id, turned into an index name via ``index_name``.
        kb_ids: Knowledge-base ids to aggregate over.
        S: Unused here; kept so the signature stays unchanged.

    Returns:
        Whatever ``self.dataStore.get_aggregation(res, "tag_kwd")`` yields, or
        ``[]`` when ``kb_ids`` is empty or the index does not exist for the
        first knowledge base.
    """
    # Guard the ``kb_ids[0]`` lookup below: nothing to aggregate without a KB.
    if not kb_ids:
        return []
    idx_nm = index_name(tenant_id)
    if not self.dataStore.index_exist(idx_nm, kb_ids[0]):
        return []
    # Offset 0 / limit 0: only the aggregation on ``tag_kwd`` is requested.
    res = self.dataStore.search([], [], {}, [], OrderByExpr(), 0, 0, idx_nm, kb_ids, ["tag_kwd"])
    return self.dataStore.get_aggregation(res, "tag_kwd")
|
2025-01-09 17:07:21 +08:00
|
|
|
|
|
|
|
|
|
|
def all_tags_in_portion(self, tenant_id: str, kb_ids: list[str], S=1000):
    """Map each tag to its smoothed share of all tag occurrences.

    The share of a tag with count ``c`` is ``(c + 1) / (total + S)``, where
    ``total`` is the sum of all counts in the ``tag_kwd`` aggregation.
    """
    raw = self.dataStore.search([], [], {}, [], OrderByExpr(), 0, 0, index_name(tenant_id), kb_ids, ["tag_kwd"])
    tag_counts = self.dataStore.get_aggregation(raw, "tag_kwd")
    denominator = np.sum([count for _, count in tag_counts]) + S
    portions = {}
    for tag, count in tag_counts:
        portions[tag] = (count + 1) / denominator
    return portions
|
|
|
|
|
|
|
|
|
|
|
|
def tag_content(self, tenant_id: str, kb_ids: list[str], doc, all_tags, topn_tags=3, keywords_topn=30, S=1000):
    """Attach tag features to ``doc`` based on tags of matching chunks.

    A match expression is built from the document's ``title_tks``,
    ``content_ltks`` and ``important_kwd`` and used to aggregate ``tag_kwd``.
    Each tag with count ``c`` is scored as
    ``round(0.1 * (c + 1) / (cnt + S) / max(1e-6, all_tags.get(tag, 0.0001)))``;
    the ``topn_tags`` best scores are kept and those above zero are written
    to ``doc[TAG_FLD]`` (dots in tag names become underscores).

    Returns:
        ``False`` when the aggregation is empty (``doc`` is left untouched),
        ``True`` otherwise.
    """
    idx_nm = index_name(tenant_id)
    text = doc["title_tks"] + " " + doc["content_ltks"]
    match_txt = self.qryr.paragraph(text, doc.get("important_kwd", []), keywords_topn)
    res = self.dataStore.search([], [], {}, [match_txt], OrderByExpr(), 0, 0, idx_nm, kb_ids, ["tag_kwd"])
    tag_counts = self.dataStore.get_aggregation(res, "tag_kwd")
    if not tag_counts:
        return False

    cnt = np.sum([c for _, c in tag_counts])
    scored = []
    for tag, c in tag_counts:
        prior = max(1e-6, all_tags.get(tag, 0.0001))
        scored.append((tag, round(0.1 * (c + 1) / (cnt + S) / prior)))
    # Stable descending sort: ties keep their aggregation order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    features = {}
    for tag, score in scored[:topn_tags]:
        if score > 0:
            features[tag.replace(".", "_")] = score
    doc[TAG_FLD] = features
    return True
|
|
|
|
|
|
|
|
|
|
|
|
def tag_query(self, question: str, tenant_ids: str | list[str], kb_ids: list[str], all_tags, topn_tags=3, S=1000):
    """Derive tag features for ``question`` from tags of matching chunks.

    Each tag with count ``c`` in the ``tag_kwd`` aggregation is scored as
    ``round(0.1 * (c + 1) / (cnt + S) / max(1e-6, all_tags.get(tag, 0.0001)))``.

    Returns:
        ``{}`` when the aggregation is empty; otherwise a dict of the
        ``topn_tags`` best tags (dots replaced by underscores) mapped to
        their score, floored at 1.
    """
    if isinstance(tenant_ids, str):
        idx_nms = index_name(tenant_ids)
    else:
        idx_nms = [index_name(tid) for tid in tenant_ids]

    match_txt, _ = self.qryr.question(question, min_match=0.0)
    res = self.dataStore.search([], [], {}, [match_txt], OrderByExpr(), 0, 0, idx_nms, kb_ids, ["tag_kwd"])
    tag_counts = self.dataStore.get_aggregation(res, "tag_kwd")
    if not tag_counts:
        return {}

    cnt = np.sum([c for _, c in tag_counts])
    scored = []
    for tag, c in tag_counts:
        prior = max(1e-6, all_tags.get(tag, 0.0001))
        scored.append((tag, round(0.1 * (c + 1) / (cnt + S) / prior)))
    # Stable descending sort: ties keep their aggregation order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    return {tag.replace(".", "_"): max(1, score) for tag, score in scored[:topn_tags]}
|
2025-10-10 17:07:55 +08:00
|
|
|
|
|
2026-01-07 15:35:30 +08:00
|
|
|
|
async def retrieval_by_toc(self, query: str, chunks: list[dict], tenant_ids: list[str], chat_mdl, topn: int = 6):
    """Re-score ``chunks`` using the table of contents of the best document.

    The document with the highest summed ``similarity`` among ``chunks`` is
    selected, its ``toc_kwd == "toc"`` entries are loaded and handed to
    ``relevant_chunks_with_toc``. Chunk ids returned by that call either
    boost an existing chunk's ``similarity`` or are fetched from the data
    store and appended.

    Note:
        ``chunks`` is modified in place (scores are bumped, chunks appended).

    Returns:
        ``[]`` for empty input; ``chunks`` unchanged when no TOC or no
        relevant ids are found; otherwise the ``topn`` chunks with the
        highest ``similarity``.
    """
    from rag.prompts.generator import relevant_chunks_with_toc  # moved from the top of the file to avoid circular import

    if not chunks:
        return []

    idx_nms = [index_name(tid) for tid in tenant_ids]

    # Sum similarity per document and remember each document's knowledge base.
    doc_scores, doc_id2kb_id = {}, {}
    for ck in chunks:
        doc_key = ck["doc_id"]
        doc_scores[doc_key] = doc_scores.get(doc_key, 0) + ck["similarity"]
        doc_id2kb_id[doc_key] = ck["kb_id"]

    # First document with the highest summed score.
    doc_id = min(doc_scores.items(), key=lambda kv: kv[1] * -1.)[0]
    kb_ids = [doc_id2kb_id[doc_id]]

    es_res = self.dataStore.search(["content_with_weight"], [], {"doc_id": doc_id, "toc_kwd": "toc"}, [],
                                   OrderByExpr(), 0, 128, idx_nms,
                                   kb_ids)
    toc_docs = self.dataStore.get_fields(es_res, ["content_with_weight"])
    toc = []
    for toc_doc in toc_docs.values():
        try:
            toc.extend(json.loads(toc_doc["content_with_weight"]))
        except Exception as e:
            # A broken TOC entry is logged and skipped.
            logging.exception(e)
    if not toc:
        return chunks

    ids = await relevant_chunks_with_toc(query, toc, chat_mdl, topn * 2)
    if not ids:
        return chunks

    vector_size = 1024
    position_of = {ck["chunk_id"]: pos for pos, ck in enumerate(chunks)}
    for cid, sim in ids:
        if cid in position_of:
            chunks[position_of[cid]]["similarity"] += sim
            continue

        chunk = self.dataStore.get(cid, idx_nms[0], kb_ids)
        if not chunk:
            continue

        d = {
            "chunk_id": cid,
            "content_ltks": chunk["content_ltks"],
            "content_with_weight": chunk["content_with_weight"],
            "doc_id": doc_id,
            "docnm_kwd": chunk.get("docnm_kwd", ""),
            "kb_id": chunk["kb_id"],
            "important_kwd": chunk.get("important_kwd", []),
            "image_id": chunk.get("img_id", ""),
            "similarity": sim,
            "vector_similarity": sim,
            "term_similarity": sim,
            "vector": [0.0] * vector_size,
            "positions": chunk.get("position_int", []),
            "doc_type_kwd": chunk.get("doc_type_kwd", "")
        }
        # Use the first stored ``*_vec`` field, and remember its length as
        # the zero-vector size for later chunks that have none.
        vec_key = next((k for k in chunk.keys() if k[-4:] == "_vec"), None)
        if vec_key is not None:
            d["vector"] = chunk[vec_key]
            vector_size = len(chunk[vec_key])
        chunks.append(d)

    ranked = sorted(chunks, key=lambda ck: ck["similarity"], reverse=True)
    return ranked[:topn]
|
2025-12-01 14:03:09 +08:00
|
|
|
|
|
2025-12-29 12:01:18 +08:00
|
|
|
|
def retrieval_by_children(self, chunks: list[dict], tenant_ids: list[str]):
    """Replace child chunks by their parent ("mom") chunk.

    Chunks carrying a non-blank string ``mom_id`` are grouped by that id.
    For each group the parent chunk is fetched from the data store and one
    merged chunk is emitted: content and metadata come from the parent,
    ``content_ltks`` and ``important_kwd`` are concatenated from the
    children, and all three similarity fields are the mean of the
    children's ``similarity``.

    If the parent cannot be fetched, the children of that group are kept
    unchanged instead of being dropped.

    Note:
        ``chunks`` is modified in place: grouped children are removed from it.

    Args:
        chunks: Retrieved chunk dicts.
        tenant_ids: Tenant ids; only the first derived index name is queried.

    Returns:
        ``[]`` for empty input; ``chunks`` as-is when no chunk has a parent;
        otherwise all resulting chunks sorted by descending ``similarity``.
    """
    if not chunks:
        return []
    idx_nms = [index_name(tid) for tid in tenant_ids]

    # Split into standalone chunks and children grouped by parent id.
    mom_chunks = defaultdict(list)
    standalone = []
    for ck in chunks:
        mom_id = ck.get("mom_id")
        if isinstance(mom_id, str) and mom_id.strip():
            mom_chunks[mom_id].append(ck)
        else:
            standalone.append(ck)
    # Same in-place effect as popping the children one by one, without the
    # quadratic cost of repeated ``list.pop(i)``.
    chunks[:] = standalone

    if not mom_chunks:
        return chunks

    if not chunks:
        # Every chunk had a parent: continue with a fresh list so the
        # caller's (now empty) list is not extended below.
        chunks = []

    vector_size = 1024
    for parent_id, cks in mom_chunks.items():
        chunk = self.dataStore.get(parent_id, idx_nms[0], [ck["kb_id"] for ck in cks])
        if not chunk:
            # Parent not available: keep the children rather than crashing
            # on the lookups below and losing them.
            chunks.extend(cks)
            continue

        mean_similarity = np.mean([ck["similarity"] for ck in cks])
        d = {
            "chunk_id": parent_id,
            "content_ltks": " ".join([ck["content_ltks"] for ck in cks]),
            "content_with_weight": chunk["content_with_weight"],
            "doc_id": chunk["doc_id"],
            "docnm_kwd": chunk.get("docnm_kwd", ""),
            "kb_id": chunk["kb_id"],
            "important_kwd": [kwd for ck in cks for kwd in ck.get("important_kwd", [])],
            "image_id": chunk.get("img_id", ""),
            "similarity": mean_similarity,
            "vector_similarity": mean_similarity,
            "term_similarity": mean_similarity,
            "vector": [0.0] * vector_size,
            "positions": chunk.get("position_int", []),
            "doc_type_kwd": chunk.get("doc_type_kwd", "")
        }
        # Take the vector from the first child's first ``*_vec`` key, if any,
        # and remember its length as the zero-vector size for later groups.
        for k in cks[0].keys():
            if k[-4:] == "_vec":
                d["vector"] = cks[0][k]
                vector_size = len(cks[0][k])
                break
        chunks.append(d)

    return sorted(chunks, key=lambda x: x["similarity"] * -1)
|