rag/nlp/search.py

#
#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
import json
import logging
import re
import math
from collections import OrderedDict, defaultdict
from dataclasses import dataclass

from rag.nlp import rag_tokenizer, query
import numpy as np
from common.doc_store.doc_store_base import MatchDenseExpr, FusionExpr, OrderByExpr, DocStoreConnection
from common.string_utils import remove_redundant_spaces
from common.float_utils import get_float
from common.constants import PAGERANK_FLD, TAG_FLD
from common.tag_feature_utils import parse_tag_features
from common import settings

from common.misc_utils import thread_pool_exec

def index_name(uid):
    """Return the document-store index name for `uid` (callers in this file pass tenant ids)."""
    return f"ragflow_{uid}"


class Dealer:
    def __init__(self, dataStore: DocStoreConnection):
        """Bind the dealer to a document store and create its full-text query builder.

        Args:
            dataStore: Document-store connection used by every search/get call
                made from this class.
        """
        self.dataStore = dataStore
        self.qryr = query.FulltextQueryer()

    @dataclass
    class SearchResult:
        """Bundle of everything `Dealer.search` extracts from one document-store response."""
        # Hit count reported by the document store (`get_total`).
        total: int
        # Ids of the returned chunks (`get_doc_ids`).
        ids: list[str]
        # Embedding of the question; `search` passes an empty list when no vector was computed.
        query_vector: list[float] | None = None
        # Per-chunk field values (`get_fields`); indexed by chunk id elsewhere in this class.
        field: dict | None = None
        # Highlight snippets (`get_highlight`); looked up by chunk id in `retrieval`.
        highlight: dict | None = None
        # Aggregation over "docnm_kwd" (`get_aggregation`).
        aggregation: list | dict | None = None
        # Query keywords plus their fine-grained tokens collected by `search`.
        keywords: list[str] | None = None
        # Not populated by any code visible in this file.
        group_docs: list[list] | None = None

    async def get_vector(self, txt, emb_mdl, topk=10, similarity=0.1):
        """Embed `txt` and wrap the vector in a dense-match expression.

        Args:
            txt: Query text handed to `emb_mdl.encode_queries`.
            emb_mdl: Embedding model; `encode_queries` is run through `thread_pool_exec`.
            topk: Forwarded to `MatchDenseExpr`.
            similarity: Stored under the "similarity" key of the expression's options.

        Returns:
            A `MatchDenseExpr` on column ``q_<dim>_vec`` using 'float' data and 'cosine' distance.

        Raises:
            Exception: If the encoded query has more than one dimension.
        """
        encoded = await thread_pool_exec(emb_mdl.encode_queries, txt)
        query_vec, _ = encoded
        dims = np.array(query_vec).shape
        if len(dims) > 1:
            raise Exception(
                f"Dealer.get_vector returned array's shape {dims} doesn't match expectation(exact one dimension).")
        floats = list(map(get_float, query_vec))
        # The vector column name encodes the embedding dimension.
        return MatchDenseExpr(
            f"q_{len(floats)}_vec",
            floats,
            'float',
            'cosine',
            topk,
            {"similarity": similarity},
        )

    def get_filters(self, req):
        condition = dict()
        for key, field in {"kb_ids": "kb_id", "doc_ids": "doc_id"}.items():
            if key in req and req[key] is not None:
                condition[field] = req[key]
        # TODO(yzc): `available_int` is nullable however infinity doesn't support nullable columns.
        for key in ["knowledge_graph_kwd", "available_int", "entity_kwd", "from_entity_kwd", "to_entity_kwd",
                    "removed_kwd"]:
            if key in req and req[key] is not None:
                condition[key] = req[key]
        return condition

    async def search(self, req, idx_names: str | list[str],
               kb_ids: list[str],
               emb_mdl=None,
               highlight: bool | list | None = None,
               rank_feature: dict | None = None
               ):
        """Run a filter-only, full-text or hybrid search and wrap the hits in a `SearchResult`.

        The mode is chosen from `req["question"]` and `emb_mdl`:

        * no question: filter-only listing, ordered when `req["sort"]` is truthy;
        * question without `emb_mdl`: full-text match only;
        * question with `emb_mdl`: full-text + dense match fused with
          "weighted_sum" (weights "0.05,0.95"). If that returns zero hits, one
          retry is made (see the inline comment).

        Args:
            req: Mapping read via `.get`. Keys used here: "page", "topk", "size",
                "fields", "question", "sort", "similarity", plus the filter keys
                consumed by `get_filters`.
            idx_names: Index name(s) forwarded to the document store.
            kb_ids: Knowledge-base ids forwarded to the document store.
            emb_mdl: Embedding model exposing `encode_queries`; None disables
                dense matching.
            highlight: Falsy -> no highlight fields; a list -> exactly those
                fields; any other truthy value -> ["content_ltks", "title_tks"].
            rank_feature: Forwarded as the `rank_feature` keyword of the store
                search for question searches.

        Returns:
            A `Dealer.SearchResult`. Its `keywords` are the query keywords plus
            their fine-grained tokens of length >= 2 (empty without a question).
        """
        if highlight is None:
            highlight = False

        filters = self.get_filters(req)
        orderBy = OrderByExpr()

        pg = int(req.get("page", 1)) - 1
        topk = int(req.get("topk", 1024))
        ps = int(req.get("size", topk))
        offset, limit = pg * ps, ps

        # Copy the field list: the vector column may be appended below, and that
        # must not leak into the caller's req["fields"] list.
        src = list(req.get("fields",
                           ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int",
                            "doc_id", "chunk_order_int", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd",
                            "question_kwd", "question_tks", "doc_type_kwd",
                            "available_int", "content_with_weight", "mom_id", PAGERANK_FLD, TAG_FLD, "row_id()"]))
        kwds = set()

        qst = req.get("question", "")
        q_vec = []
        if not qst:
            if req.get("sort"):
                orderBy.asc("chunk_order_int")
                orderBy.asc("page_num_int")
                orderBy.asc("top_int")
                orderBy.desc("create_timestamp_flt")
            # Offload the blocking store call like every other branch does, so the
            # event loop is not stalled while listing chunks.
            res = await thread_pool_exec(self.dataStore.search, src, [], filters, [], orderBy, offset, limit,
                                         idx_names, kb_ids)
            total = self.dataStore.get_total(res)
            logging.debug("Dealer.search TOTAL: {}".format(total))
        else:
            if not highlight:
                highlightFields = []
            elif isinstance(highlight, list):
                highlightFields = highlight
            else:
                highlightFields = ["content_ltks", "title_tks"]
            matchText, keywords = self.qryr.question(qst, min_match=0.3)
            if emb_mdl is None:
                matchExprs = [matchText]
                res = await thread_pool_exec(self.dataStore.search, src, highlightFields, filters, matchExprs, orderBy, offset, limit,
                                            idx_names, kb_ids, rank_feature=rank_feature)
                total = self.dataStore.get_total(res)
                logging.debug("Dealer.search TOTAL: {}".format(total))
            else:
                matchDense = await self.get_vector(qst, emb_mdl, topk, req.get("similarity", 0.1))
                q_vec = matchDense.embedding_data
                if not settings.DOC_ENGINE_INFINITY:
                    # Also fetch the stored chunk vectors (used by `rerank`).
                    src.append(f"q_{len(q_vec)}_vec")

                fusionExpr = FusionExpr("weighted_sum", topk, {"weights": "0.05,0.95"})
                matchExprs = [matchText, matchDense, fusionExpr]

                res = await thread_pool_exec(self.dataStore.search, src, highlightFields, filters, matchExprs, orderBy, offset, limit,
                                            idx_names, kb_ids, rank_feature=rank_feature)
                total = self.dataStore.get_total(res)
                logging.debug("Dealer.search TOTAL: {}".format(total))

                # Empty result: retry once. With a doc_id filter, fall back to a
                # plain filter-only listing; otherwise re-issue the hybrid query with
                # min_match=0.1 and the dense "similarity" option set to 0.17.
                if total == 0:
                    if filters.get("doc_id"):
                        res = await thread_pool_exec(self.dataStore.search, src, [], filters, [], orderBy, offset, limit, idx_names, kb_ids)
                        total = self.dataStore.get_total(res)
                    else:
                        matchText, _ = self.qryr.question(qst, min_match=0.1)
                        matchDense.extra_options["similarity"] = 0.17
                        res = await thread_pool_exec(self.dataStore.search, src, highlightFields, filters, [matchText, matchDense, fusionExpr],
                                                    orderBy, offset, limit, idx_names, kb_ids,
                                                    rank_feature=rank_feature)
                        total = self.dataStore.get_total(res)
                    logging.debug("Dealer.search 2 TOTAL: {}".format(total))

            # Keywords for highlighting: each query keyword plus its fine-grained
            # sub-tokens that are at least two characters long.
            for k in keywords:
                kwds.add(k)
                kwds.update(kk for kk in rag_tokenizer.fine_grained_tokenize(k).split() if len(kk) >= 2)

        logging.debug(f"TOTAL: {total}")
        ids = self.dataStore.get_doc_ids(res)
        keywords = list(kwds)
        highlight = self.dataStore.get_highlight(res, keywords, "content_with_weight")
        aggs = self.dataStore.get_aggregation(res, "docnm_kwd")
        return self.SearchResult(
            total=total,
            ids=ids,
            query_vector=q_vec,
            aggregation=aggs,
            highlight=highlight,
            field=self.dataStore.get_fields(res, src + ["_score"]),
            keywords=keywords
        )

    @staticmethod
    def trans2floats(txt):
        """Split `txt` on tab characters and convert every token with `get_float`."""
        return list(map(get_float, txt.split("\t")))

    def insert_citations(self, answer, chunks, chunk_v,
                         embd_mdl, tkweight=0.1, vtweight=0.9):
        assert len(chunks) == len(chunk_v)
        if not chunks:
            return answer, set([])
        pieces = re.split(r"(```)", answer)
        if len(pieces) >= 3:
            i = 0
            pieces_ = []
            while i < len(pieces):
                if pieces[i] == "```":
                    st = i
                    i += 1
                    while i < len(pieces) and pieces[i] != "```":
                        i += 1
                    if i < len(pieces):
                        i += 1
                    pieces_.append("".join(pieces[st: i]) + "\n")
                else:
                    # Sentence boundary regex includes Arabic punctuation (، ؛ ؟ ۔)
                    pieces_.extend(
                        re.split(
                            r"([^\|][；。？!！،؛؟۔\n]|[a-z\u0600-\u06FF][.?;!،؛؟][ \n])",
                            pieces[i]))
                    i += 1
            pieces = pieces_
        else:
            # Sentence boundary regex includes Arabic punctuation (، ؛ ؟ ۔)
            pieces = re.split(r"([^\|][；。？!！،؛؟۔\n]|[a-z\u0600-\u06FF][.?;!،؛؟][ \n])", answer)
        for i in range(1, len(pieces)):
            if re.match(r"([^\|][；。？!！،؛؟۔\n]|[a-z\u0600-\u06FF][.?;!،؛؟][ \n])", pieces[i]):
                pieces[i - 1] += pieces[i][0]
                pieces[i] = pieces[i][1:]
        idx = []
        pieces_ = []
        for i, t in enumerate(pieces):
            if len(t) < 5:
                continue
            idx.append(i)
            pieces_.append(t)
        logging.debug("{} => {}".format(answer, pieces_))
        if not pieces_:
            return answer, set([])

        ans_v, _ = embd_mdl.encode(pieces_)
        for i in range(len(chunk_v)):
            if len(ans_v[0]) != len(chunk_v[i]):
                chunk_v[i] = [0.0] * len(ans_v[0])
                logging.warning(
                    "The dimension of query and chunk do not match: {} vs. {}".format(len(ans_v[0]), len(chunk_v[i])))

        assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
            len(ans_v[0]), len(chunk_v[0]))

        chunks_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(ck)).split()
                      for ck in chunks]
        cites = {}
        thr = 0.63
        while thr > 0.3 and len(cites.keys()) == 0 and pieces_ and chunks_tks:
            for i, a in enumerate(pieces_):
                sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
                                                                chunk_v,
                                                                rag_tokenizer.tokenize(
                                                                    self.qryr.rmWWW(pieces_[i])).split(),
                                                                chunks_tks,
                                                                tkweight, vtweight)
                mx = np.max(sim) * 0.99
                logging.debug("{} SIM: {}".format(pieces_[i], mx))
                if mx < thr:
                    continue
                cites[idx[i]] = list(
                    set([str(ii) for ii in range(len(chunk_v)) if sim[ii] > mx]))[:4]
            thr *= 0.8

        res = ""
        seted = set([])
        for i, p in enumerate(pieces):
            res += p
            if i not in idx:
                continue
            if i not in cites:
                continue
            for c in cites[i]:
                assert int(c) < len(chunk_v)
            for c in cites[i]:
                if c in seted:
                    continue
                res += f" [ID:{c}]"
                seted.add(c)

        return res, seted

    def _rank_feature_scores(self, query_rfea, search_res):
        """Score every hit by tag-feature similarity to the query, plus its pagerank.

        Args:
            query_rfea: Mapping of query tag -> weight; the PAGERANK_FLD entry is
                ignored for the similarity. Falsy means "pagerank only".
            search_res: Search result exposing `ids` and `field[chunk_id]`.

        Returns:
            Array with one score per id: ``10 * cosine(query tags, chunk tags) + pagerank``,
            or just the pagerank when the query carries no usable tag weights.
        """
        chunk_ids = search_res.ids
        pageranks = np.array(
            [search_res.field[cid].get(PAGERANK_FLD, 0) for cid in chunk_ids],
            dtype=float,
        )
        pagerank_only = np.array([0] * len(chunk_ids)) + pageranks

        if not query_rfea:
            return pagerank_only

        # Norm of the query tag weights, pagerank entry excluded.
        q_denor = np.sqrt(np.sum([w * w for tag, w in query_rfea.items() if tag != PAGERANK_FLD]))
        if q_denor == 0:
            return pagerank_only

        def _tag_similarity(cid):
            # Cosine similarity between the query tags and this chunk's tag features.
            if not search_res.field[cid].get(TAG_FLD):
                return 0
            tag_feas = parse_tag_features(search_res.field[cid].get(TAG_FLD), allow_json_string=True, allow_python_literal=True)
            if not tag_feas:
                return 0
            nor = sum(query_rfea[tag] * sc for tag, sc in tag_feas.items() if tag in query_rfea)
            denor = sum(sc * sc for sc in tag_feas.values())
            if denor == 0:
                return 0
            return nor / np.sqrt(denor) / q_denor

        rank_fea = [_tag_similarity(cid) for cid in chunk_ids]
        return np.array(rank_fea) * 10. + pageranks

    def rerank(self, sres, query, tkweight=0.3,
               vtweight=0.7, cfield="content_ltks",
               rank_feature: dict | None = None
               ):
        """Re-score search hits with hybrid (token + vector) similarity plus rank features.

        Args:
            sres: Search result; `field[id]["important_kwd"]` is normalised to a
                list in place when it is a plain string.
            query: Question text.
            tkweight: Token-similarity weight for `hybrid_similarity`.
            vtweight: Vector-similarity weight for `hybrid_similarity`.
            cfield: Field holding the tokenised chunk content.
            rank_feature: Query tag features for `_rank_feature_scores`.

        Returns:
            `(sim + rank_fea, tksim, vtsim)`, or three empty lists when there are no hits.
        """
        _, keywords = self.qryr.question(query)

        dim = len(sres.query_vector)
        column = f"q_{dim}_vec"
        fallback = [0.0] * dim

        def _as_vector(raw):
            # A vector may come back as a tab-separated string.
            if isinstance(raw, str):
                return [get_float(v) for v in raw.split("\t")]
            return raw

        ins_embd = [_as_vector(sres.field[cid].get(column, fallback)) for cid in sres.ids]
        if not ins_embd:
            return [], [], []

        for cid in sres.ids:
            fields = sres.field[cid]
            if isinstance(fields.get("important_kwd", []), str):
                fields["important_kwd"] = [fields["important_kwd"]]

        ins_tw = []
        for cid in sres.ids:
            fields = sres.field[cid]
            # Content tokens are de-duplicated (first occurrence kept); the other
            # token sources are repeated to weight them more heavily.
            content = list(dict.fromkeys(fields[cfield].split()))
            titles = fields.get("title_tks", "").split()
            questions = fields.get("question_tks", "").split()
            important = fields.get("important_kwd", [])
            ins_tw.append(content + titles * 2 + important * 5 + questions * 6)

        ## For rank feature(tag_fea) scores.
        rank_fea = self._rank_feature_scores(rank_feature, sres)

        sim, tksim, vtsim = self.qryr.hybrid_similarity(
            sres.query_vector, ins_embd, keywords, ins_tw, tkweight, vtweight)

        return sim + rank_fea, tksim, vtsim

    def rerank_by_model(self, rerank_mdl, sres, query, tkweight=0.3,
                        vtweight=0.7, cfield="content_ltks",
                        rank_feature: dict | None = None):
        """Re-score search hits with an external rerank model plus token similarity.

        Args:
            rerank_mdl: Model exposing `similarity(query, texts)`.
            sres: Search result; `field[id]["important_kwd"]` is normalised to a
                list in place when it is a plain string.
            query: Question text.
            tkweight: Weight of the token similarity.
            vtweight: Weight of the rerank-model similarity.
            cfield: Field holding the tokenised chunk content.
            rank_feature: Query tag features for `_rank_feature_scores`.

        Returns:
            `(tkweight * tksim + vtweight * vtsim + rank_fea, tksim, vtsim)`.
        """
        _, keywords = self.qryr.question(query)

        for cid in sres.ids:
            fields = sres.field[cid]
            if isinstance(fields.get("important_kwd", []), str):
                fields["important_kwd"] = [fields["important_kwd"]]

        ins_tw = []
        for cid in sres.ids:
            fields = sres.field[cid]
            content = fields[cfield].split()
            titles = fields.get("title_tks", "").split()
            important = fields.get("important_kwd", [])
            ins_tw.append(content + titles + important)

        tksim = self.qryr.token_similarity(keywords, ins_tw)
        texts = [remove_redundant_spaces(" ".join(tks)) for tks in ins_tw]
        vtsim, _ = rerank_mdl.similarity(query, texts)
        ## For rank feature(tag_fea) scores.
        rank_fea = self._rank_feature_scores(rank_feature, sres)

        combined = tkweight * np.array(tksim) + vtweight * vtsim + rank_fea
        return combined, tksim, vtsim

    def hybrid_similarity(self, ans_embd, ins_embd, ans, inst):
        """Tokenise both texts and delegate to the query builder's `hybrid_similarity`.

        Args:
            ans_embd: Embedding passed as the first argument of the delegate.
            ins_embd: Embedding(s) passed as the second argument of the delegate.
            ans: Raw text matching `ans_embd`.
            inst: Raw text matching `ins_embd`.

        Returns:
            Whatever `self.qryr.hybrid_similarity` returns (default weights).
        """
        ans_tokens = rag_tokenizer.tokenize(ans).split()
        inst_tokens = rag_tokenizer.tokenize(inst).split()
        return self.qryr.hybrid_similarity(ans_embd, ins_embd, ans_tokens, inst_tokens)

    async def retrieval(
            self,
            question,
            embd_mdl,
            tenant_ids,
            kb_ids,
            page,
            page_size,
            similarity_threshold=0.2,
            vector_similarity_weight=0.3,
            top=1024,
            doc_ids=None,
            aggs=True,
            rerank_mdl=None,
            highlight=False,
            rank_feature: dict | None = {PAGERANK_FLD: 10},
    ):
        ranks = {"total": 0, "chunks": [], "doc_aggs": {}}
        if not question:
            return ranks

        # Keep the historical windowing strategy by default, but when an external
        # reranker is enabled cap candidate count by both top_k and provider-safe 64.
        RERANK_LIMIT = math.ceil(64 / page_size) * page_size if page_size > 1 else 1
        RERANK_LIMIT = max(30, RERANK_LIMIT)
        if rerank_mdl and top > 0:
            RERANK_LIMIT = min(RERANK_LIMIT, top, 64)
        page = max(page, 1)
        global_offset = (page - 1) * page_size
        req = {
            "kb_ids": kb_ids,
            "doc_ids": doc_ids,
            "page": global_offset // RERANK_LIMIT + 1,
            "size": RERANK_LIMIT,
            "question": question,
            "vector": True,
            "topk": top,
            "similarity": similarity_threshold,
            "available_int": 1,
        }

        if isinstance(tenant_ids, str):
            tenant_ids = tenant_ids.split(",")

        sres = await self.search(req, [index_name(tid) for tid in tenant_ids], kb_ids, embd_mdl, highlight,
                           rank_feature=rank_feature)

        if rerank_mdl and sres.total > 0:
            sim, tsim, vsim = self.rerank_by_model(
                rerank_mdl,
                sres,
                question,
                1 - vector_similarity_weight,
                vector_similarity_weight,
                rank_feature=rank_feature,
            )
        else:
            if settings.DOC_ENGINE_INFINITY:
                # Don't need rerank here since Infinity normalizes each way score before fusion.
                sim = [sres.field[id].get("_score", 0.0) for id in sres.ids]
                sim = [s if s is not None else 0.0 for s in sim]
                tsim = sim
                vsim = sim
            else:
                # ElasticSearch doesn't normalize each way score before fusion.
                sim, tsim, vsim = self.rerank(
                    sres,
                    question,
                    1 - vector_similarity_weight,
                    vector_similarity_weight,
                    rank_feature=rank_feature,
                )

        sim_np = np.array(sim, dtype=np.float64)
        if sim_np.size == 0:
            ranks["doc_aggs"] = []
            return ranks

        sorted_idx = np.argsort(sim_np * -1)

        # When vector_similarity_weight is 0, similarity_threshold is not meaningful for term-only scores.
        post_threshold = 0.0 if vector_similarity_weight <= 0 else similarity_threshold

        # When doc_ids is explicitly provided (metadata or document filtering), bypass threshold
        # User wants those specific documents regardless of their relevance score
        if doc_ids:
            post_threshold = 0.0

        valid_idx = [int(i) for i in sorted_idx if sim_np[i] >= post_threshold]
        filtered_count = len(valid_idx)
        ranks["total"] = int(filtered_count)

        if filtered_count == 0:
            ranks["doc_aggs"] = []
            return ranks

        begin = global_offset % RERANK_LIMIT
        end = begin + page_size
        page_idx = valid_idx[begin:end]

        dim = len(sres.query_vector)
        vector_column = f"q_{dim}_vec"
        zero_vector = [0.0] * dim

        for i in page_idx:
            id = sres.ids[i]
            chunk = sres.field[id]
            dnm = chunk.get("docnm_kwd", "")
            did = chunk.get("doc_id", "")

            position_int = chunk.get("position_int", [])
            d = {
                "chunk_id": id,
                "content_ltks": chunk["content_ltks"],
                "content_with_weight": chunk["content_with_weight"],
                "doc_id": did,
                "docnm_kwd": dnm,
                "kb_id": chunk["kb_id"],
                "important_kwd": chunk.get("important_kwd", []),
                "tag_kwd": chunk.get("tag_kwd", []),
                "image_id": chunk.get("img_id", ""),
                "similarity": float(sim_np[i]),
                "vector_similarity": float(vsim[i]),
                "term_similarity": float(tsim[i]),
                "vector": chunk.get(vector_column, zero_vector),
                "positions": position_int,
                "doc_type_kwd": chunk.get("doc_type_kwd", ""),
                "mom_id": chunk.get("mom_id", ""),
                "row_id": chunk.get("row_id()"),
            }
            if highlight and sres.highlight:
                if id in sres.highlight:
                    d["highlight"] = remove_redundant_spaces(sres.highlight[id])
                else:
                    d["highlight"] = d["content_with_weight"]
            ranks["chunks"].append(d)

        if aggs:
            for i in valid_idx:
                id = sres.ids[i]
                chunk = sres.field[id]
                dnm = chunk.get("docnm_kwd", "")
                did = chunk.get("doc_id", "")
                if dnm not in ranks["doc_aggs"]:
                    ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
                ranks["doc_aggs"][dnm]["count"] += 1

            ranks["doc_aggs"] = [
                {
                    "doc_name": k,
                    "doc_id": v["doc_id"],
                    "count": v["count"],
                }
                for k, v in sorted(
                    ranks["doc_aggs"].items(),
                    key=lambda x: x[1]["count"] * -1,
                )
            ]
        else:
            ranks["doc_aggs"] = []

        return ranks

    def sql_retrieval(self, sql, fetch_size=128, format="json"):
        """Run `sql` through the document store's `sql` method and return its result unchanged."""
        return self.dataStore.sql(sql, fetch_size, format)

    def chunk_list(self, doc_id: str, tenant_id: str,
                   kb_ids: list[str], max_count=1024,
                   offset=0,
                   fields=["docnm_kwd", "content_with_weight", "img_id"],
                   sort_by_position: bool = False):
        """List the chunks of one document, fetched from the store in batches of 128.

        Args:
            doc_id: Document whose chunks are listed.
            tenant_id: Tenant owning the index.
            kb_ids: Knowledge-base ids forwarded to the document store.
            max_count: Exclusive upper bound of the row offsets scanned; paging
                runs from `offset` up to `max_count`.
            offset: Row offset of the first batch.
            fields: Fields to fetch. The default list is never mutated here.
            sort_by_position: Order by page_num_int, position_int, top_int
                (ascending) and make sure those fields are fetched.

        Returns:
            List of field dicts, each with the chunk id added under "id".
        """
        condition = {"doc_id": doc_id}

        # De-duplicate with dict.fromkeys rather than set(): it keeps the caller's
        # field order, so the column list sent to the store is deterministic.
        ordered_fields = dict.fromkeys(fields or [])
        orderBy = OrderByExpr()
        if sort_by_position:
            for need in ("page_num_int", "position_int", "top_int"):
                ordered_fields.setdefault(need)
                orderBy.asc(need)
        fields = list(ordered_fields)

        res = []
        bs = 128
        for p in range(offset, max_count, bs):
            # range() guarantees p < max_count, so limit is always >= 1.
            limit = min(bs, max_count - p)
            es_res = self.dataStore.search(fields, [], condition, [], orderBy, p, limit, index_name(tenant_id),
                                           kb_ids)
            dict_chunks = self.dataStore.get_fields(es_res, fields)
            for chunk_id, doc in dict_chunks.items():
                doc["id"] = chunk_id
            res.extend(dict_chunks.values())
            # A short (or empty) batch means the document has no more chunks.
            if len(dict_chunks) < limit:
                break
        return res

    def all_tags(self, tenant_id: str, kb_ids: list[str], S=1000):
        if not self.dataStore.index_exist(index_name(tenant_id), kb_ids[0]):
            return []
        res = self.dataStore.search([], [], {}, [], OrderByExpr(), 0, 0, index_name(tenant_id), kb_ids, ["tag_kwd"])
        return self.dataStore.get_aggregation(res, "tag_kwd")

    def all_tags_in_portion(self, tenant_id: str, kb_ids: list[str], S=1000):
        """Return each tag's smoothed share of all tag occurrences.

        Args:
            tenant_id: Tenant owning the index.
            kb_ids: Knowledge-base ids forwarded to the document store.
            S: Smoothing constant added to the total count.

        Returns:
            Dict mapping tag -> ``(count + 1) / (total + S)``.
        """
        hits = self.dataStore.search([], [], {}, [], OrderByExpr(), 0, 0, index_name(tenant_id), kb_ids, ["tag_kwd"])
        tag_counts = self.dataStore.get_aggregation(hits, "tag_kwd")
        total = np.sum([count for _, count in tag_counts])
        portions = {}
        for tag, count in tag_counts:
            portions[tag] = (count + 1) / (total + S)
        return portions

    def tag_content(self, tenant_id: str, kb_ids: list[str], doc, all_tags, topn_tags=3, keywords_topn=30, S=1000):
        """Derive tag features for `doc` from the tags of similar indexed content.

        Args:
            tenant_id: Tenant owning the index.
            kb_ids: Knowledge-base ids forwarded to the document store.
            doc: Chunk dict with "title_tks" and "content_ltks" (and optionally
                "important_kwd"); receives the result under TAG_FLD.
            all_tags: Mapping tag -> overall share, used to normalise the local share.
            topn_tags: Maximum number of tags kept.
            keywords_topn: Forwarded to `self.qryr.paragraph`.
            S: Smoothing constant added to the matched tag count.

        Returns:
            False when the store returns no tag aggregation, True otherwise
            (in which case `doc[TAG_FLD]` has been set).
        """
        idx_nm = index_name(tenant_id)
        text = doc["title_tks"] + " " + doc["content_ltks"]
        match_txt = self.qryr.paragraph(text, doc.get("important_kwd", []), keywords_topn)
        res = self.dataStore.search([], [], {}, [match_txt], OrderByExpr(), 0, 0, idx_nm, kb_ids, ["tag_kwd"])
        aggs = self.dataStore.get_aggregation(res, "tag_kwd")
        if not aggs:
            return False

        cnt = np.sum([c for _, c in aggs])
        # Score = rounded ratio of the tag's smoothed local share to its overall share, scaled by 0.1.
        scored = []
        for tag, c in aggs:
            score = round(0.1 * (c + 1) / (cnt + S) / max(1e-6, all_tags.get(tag, 0.0001)))
            scored.append((tag, score))
        best = sorted(scored, key=lambda pair: -pair[1])[:topn_tags]
        # Dots in tag names are replaced by underscores; zero scores are dropped.
        doc[TAG_FLD] = {tag.replace(".", "_"): score for tag, score in best if score > 0}
        return True

    def tag_query(self, question: str, tenant_ids: str | list[str], kb_ids: list[str], all_tags, topn_tags=3, S=1000):
        """Derive tag features for a question from the tags of matching indexed content.

        Args:
            question: Query text.
            tenant_ids: One tenant id, or a list of them.
            kb_ids: Knowledge-base ids forwarded to the document store.
            all_tags: Mapping tag -> overall share, used to normalise the local share.
            topn_tags: Maximum number of tags kept.
            S: Smoothing constant added to the matched tag count.

        Returns:
            Dict tag -> score (at least 1), with dots in tag names replaced by
            underscores; empty when the store returns no tag aggregation.
        """
        if isinstance(tenant_ids, str):
            idx_nms = index_name(tenant_ids)
        else:
            idx_nms = [index_name(tid) for tid in tenant_ids]
        match_txt, _ = self.qryr.question(question, min_match=0.0)
        res = self.dataStore.search([], [], {}, [match_txt], OrderByExpr(), 0, 0, idx_nms, kb_ids, ["tag_kwd"])
        aggs = self.dataStore.get_aggregation(res, "tag_kwd")
        if not aggs:
            return {}

        cnt = np.sum([c for _, c in aggs])
        # Score = rounded ratio of the tag's smoothed local share to its overall share, scaled by 0.1.
        scored = []
        for tag, c in aggs:
            score = round(0.1 * (c + 1) / (cnt + S) / max(1e-6, all_tags.get(tag, 0.0001)))
            scored.append((tag, score))
        best = sorted(scored, key=lambda pair: -pair[1])[:topn_tags]
        return {tag.replace(".", "_"): max(1, score) for tag, score in best}

    async def retrieval_by_toc(self, query: str, chunks: list[dict], tenant_ids: list[str], chat_mdl, topn: int = 6):
        """Boost/extend retrieved chunks using the table of contents of the best-scoring document.

        The document with the highest summed chunk similarity is selected, its
        stored TOC entries are loaded, and `relevant_chunks_with_toc` picks the
        relevant chunk ids. Already-present chunks get the returned score added;
        missing ones are fetched from the store and appended to `chunks`.

        Args:
            query: Question text.
            chunks: Retrieved chunk dicts (extended and updated in place).
            tenant_ids: Tenant ids; the first index is used to fetch extra chunks.
            chat_mdl: Chat model handed to `relevant_chunks_with_toc`.
            topn: Number of chunks returned; twice as many are requested from the TOC step.

        Returns:
            [] for no chunks; `chunks` unchanged when no TOC or no relevant ids
            are found; otherwise the `topn` chunks with the highest similarity.
        """
        from rag.prompts.generator import relevant_chunks_with_toc # moved from the top of the file to avoid circular import
        if not chunks:
            return []
        idx_nms = [index_name(tid) for tid in tenant_ids]

        # Sum the similarity per document and remember each document's kb.
        doc_scores, doc_id2kb_id = {}, {}
        for ck in chunks:
            doc_scores[ck["doc_id"]] = doc_scores.get(ck["doc_id"], 0) + ck["similarity"]
            doc_id2kb_id[ck["doc_id"]] = ck["kb_id"]
        doc_id = sorted(doc_scores.items(), key=lambda kv: kv[1] * -1.)[0][0]
        kb_ids = [doc_id2kb_id[doc_id]]

        es_res = self.dataStore.search(["content_with_weight"], [], {"doc_id": doc_id, "toc_kwd": "toc"}, [],
                                       OrderByExpr(), 0, 128, idx_nms,
                                       kb_ids)
        toc = []
        for doc in self.dataStore.get_fields(es_res, ["content_with_weight"]).values():
            try:
                toc.extend(json.loads(doc["content_with_weight"]))
            except Exception as e:
                logging.exception(e)
        if not toc:
            return chunks

        ids = await relevant_chunks_with_toc(query, toc, chat_mdl, topn * 2)
        if not ids:
            return chunks

        vector_size = 1024
        position_of = {ck["chunk_id"]: pos for pos, ck in enumerate(chunks)}
        for cid, sim in ids:
            if cid in position_of:
                chunks[position_of[cid]]["similarity"] += sim
                continue
            chunk = self.dataStore.get(cid, idx_nms[0], kb_ids)
            if not chunk:
                continue
            extra = {
                "chunk_id": cid,
                "content_ltks": chunk["content_ltks"],
                "content_with_weight": chunk["content_with_weight"],
                "doc_id": doc_id,
                "docnm_kwd": chunk.get("docnm_kwd", ""),
                "kb_id": chunk["kb_id"],
                "important_kwd": chunk.get("important_kwd", []),
                "image_id": chunk.get("img_id", ""),
                "similarity": sim,
                "vector_similarity": sim,
                "term_similarity": sim,
                "vector": [0.0] * vector_size,
                "positions": chunk.get("position_int", []),
                "doc_type_kwd": chunk.get("doc_type_kwd", "")
            }
            # Use the first stored "*_vec" field as the chunk vector, if any.
            vec_key = next((k for k in chunk.keys() if k[-4:] == "_vec"), None)
            if vec_key is not None:
                extra["vector"] = chunk[vec_key]
                vector_size = len(chunk[vec_key])
            chunks.append(extra)

        return sorted(chunks, key=lambda ck: ck["similarity"] * -1)[:topn]

    def retrieval_by_children(self, chunks: list[dict], tenant_ids: list[str]):
        if not chunks:
            return []
        idx_nms = [index_name(tid) for tid in tenant_ids]
        mom_chunks = defaultdict(list)
        i = 0
        while i < len(chunks):
            ck = chunks[i]
            mom_id = ck.get("mom_id")
            if not isinstance(mom_id, str) or not mom_id.strip():
                i += 1
                continue
            mom_chunks[ck["mom_id"]].append(chunks.pop(i))

        if not mom_chunks:
            return chunks

        if not chunks:
            chunks = []

        vector_size = 1024
        for id, cks in mom_chunks.items():
            chunk = self.dataStore.get(id, idx_nms[0], [ck["kb_id"] for ck in cks])
            d = {
                "chunk_id": id,
                "content_ltks": " ".join([ck["content_ltks"] for ck in cks]),
                "content_with_weight": chunk["content_with_weight"],
                "doc_id": chunk["doc_id"],
                "docnm_kwd": chunk.get("docnm_kwd", ""),
                "kb_id": chunk["kb_id"],
                "important_kwd": [kwd for ck in cks for kwd in ck.get("important_kwd", [])],
                "image_id": chunk.get("img_id", ""),
                "similarity": np.mean([ck["similarity"] for ck in cks]),
                "vector_similarity": np.mean([ck["similarity"] for ck in cks]),
                "term_similarity": np.mean([ck["similarity"] for ck in cks]),
                "vector": [0.0] * vector_size,
                "positions": chunk.get("position_int", []),
                "doc_type_kwd": chunk.get("doc_type_kwd", "")
            }
            for k in cks[0].keys():
                if k[-4:] == "_vec":
                    d["vector"] = cks[0][k]
                    vector_size = len(cks[0][k])
                    break
            chunks.append(d)

        return sorted(chunks, key=lambda x: x["similarity"] * -1)
-												Update info (#1005)

### What problem does this PR solve?

_Briefly describe what this PR aims to solve. Include background context
that will help reviewers understand the purpose of the PR._

### Type of change

- [x] Refactoring

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2024-05-31 09:53:04 +08:00
+								#
 								#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
 								#
 								#  Licensed under the Apache License, Version 2.0 (the "License");
 								#  you may not use this file except in compliance with the License.
 								#  You may obtain a copy of the License at
 								#
 								#      http://www.apache.org/licenses/LICENSE-2.0
 								#
 								#  Unless required by applicable law or agreed to in writing, software
 								#  distributed under the License is distributed on an "AS IS" BASIS,
 								#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 								#  See the License for the specific language governing permissions and
 								#  limitations under the License.
 								#
-												Feat: TOC retrieval (#10456)

### What problem does this PR solve?

#10436

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-10-10 17:07:55 +08:00
+								import json
-												Use consistent log file names, introduced initLogger (#3403)

### What problem does this PR solve?

Use consistent log file names, introduced initLogger

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
											
										
										
											2024-11-14 17:13:48 +08:00
+								import logging
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
+								import re
-												Fix: fix retrieval testing wrong pagination (#7174)

### What problem does this PR solve?

Fix retrieval testing wrong pagination. #7171 

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
											
										
										
											2025-04-22 15:16:04 +08:00
+								import math
-												Feat: support parent-child in search procedure. (#11629)

### What problem does this PR solve?

#7996

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-01 14:03:09 +08:00
+								from collections import OrderedDict, defaultdict
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
+								from dataclasses import dataclass
-												change licence (#28)

* add front end code

* change licence
											
										
										
											2024-01-17 09:39:50 +08:00
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								from rag.nlp import rag_tokenizer, query
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
+								import numpy as np
-												Feat: message manage (#12196)

### What problem does this PR solve?

Manage message and use in agent.

Issue #4213 

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-25 21:18:13 +08:00
+								from common.doc_store.doc_store_base import MatchDenseExpr, FusionExpr, OrderByExpr, DocStoreConnection
-												Refactor: rename rmSpace to remove_redundant_spaces (#10796)

### What problem does this PR solve?

- rename rmSpace to remove_redundant_spaces
- move clean_markdown_block to common module
- add unit tests for remove_redundant_spaces and clean_markdown_block

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-10-28 09:46:32 +08:00
+								from common.string_utils import remove_redundant_spaces
 								from common.float_utils import get_float
-												Move api.settings to common.settings (#11036)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-06 09:36:38 +08:00
+								from common.constants import PAGERANK_FLD, TAG_FLD
-												Fix tag_feas code injection in retrieval ranking (#13923)

## Summary
- remove eval-based parsing from retrieval rank feature scoring
- validate `tag_feas` at write time in chunk APIs and SDK routes
- add regression tests for safe parsing and malicious payload rejection

## Details
`tag_feas` is intended to be structured rank-feature data, but the
retrieval ranking path was evaluating stored values as Python
expressions. This change treats `tag_feas` strictly as data.

### What changed
- replace `eval()` in `rag/nlp/search.py` with safe parsing via
`json.loads()` and optional `ast.literal_eval()` compatibility for
legacy Python-dict strings
- strictly filter parsed values down to `dict[str, finite number]`
- reject invalid `tag_feas` payloads at write time in web chunk routes
and SDK document chunk routes
- add focused regression tests to prove executable strings are ignored
and invalid payloads are rejected

## Validation
- `python -m pytest test/unit_test/common/test_tag_feature_utils.py
test/unit_test/rag/test_rank_feature_scores.py -q`

---------

Co-authored-by: unknown <zhenglinkai@CCN.Local>
Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
											
										
										
											2026-04-15 16:31:11 +08:00
+								from common.tag_feature_utils import parse_tag_features
-												Use Infinity single-field-multi-index (#11444)

### What problem does this PR solve?

Use Infinity single-field-multi-index

### Type of change

- [x] Refactoring
- [x] Performance Improvement
											
										
										
											2025-11-26 11:06:37 +08:00
+								from common import settings
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
-												Refa: asyncio.to_thread to ThreadPoolExecutor to break thread limitat… (#12716)

### Type of change

- [x] Refactoring
											
										
										
											2026-01-20 13:29:37 +08:00
+								from common.misc_utils import thread_pool_exec
-												add llm API (#19)

* add llm API

* refine llm API
											
										
										
											2023-12-28 13:50:13 +08:00
-												add a lot of api (#23)

* clean rust version project

* clean rust version project

* build python version rag-flow

* add a lot of api
											
										
										
											2024-01-15 19:47:25 +08:00
+								def index_name(uid): return f"ragflow_{uid}"
-												add llm API (#19)

* add llm API

* refine llm API
											
										
										
											2023-12-28 13:50:13 +08:00
-												build dialog server; add thumbnail to docinfo; (#17)


											
										
										
											2023-12-26 19:32:06 +08:00
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
+								class Dealer:
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								    def __init__(self, dataStore: DocStoreConnection):
 								        self.qryr = query.FulltextQueryer()
 								        self.dataStore = dataStore
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
 								    @dataclass
 								    class SearchResult:
-												add llm API (#19)

* add llm API

* refine llm API
											
										
										
											2023-12-28 13:50:13 +08:00
+								        total: int
-												Introduced beartype (#3460)

### What problem does this PR solve?

Introduced [beartype](https://github.com/beartype/beartype) for runtime
type-checking.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-11-18 17:38:17 +08:00
+								        ids: list[str]
 								        query_vector: list[float] | None = None
 								        field: dict | None = None
 								        highlight: dict | None = None
 								        aggregation: list | dict | None = None
 								        keywords: list[str] | None = None
 								        group_docs: list[list] | None = None
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
-												Refa: async retrieval process. (#12629)

### Type of change

- [x] Refactoring
- [x] Performance Improvement
											
										
										
											2026-01-15 12:28:49 +08:00
+								    async def get_vector(self, txt, emb_mdl, topk=10, similarity=0.1):
-												Refa: asyncio.to_thread to ThreadPoolExecutor to break thread limitat… (#12716)

### Type of change

- [x] Refactoring
											
										
										
											2026-01-20 13:29:37 +08:00
+								        qv, _ = await thread_pool_exec(emb_mdl.encode_queries, txt)
-												Detect shape error of embedding (#3710)

### What problem does this PR solve?

Detect shape error of embedding. Close #2997

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-11-28 14:10:22 +08:00
+								        shape = np.array(qv).shape
 								        if len(shape) > 1:
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								            raise Exception(
 								                f"Dealer.get_vector returned array's shape {shape} doesn't match expectation(exact one dimension).")
-												Fix: float transfer exception. (#6197)

### What problem does this PR solve?

#6177

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-03-18 11:13:44 +08:00
+								        embedding_data = [get_float(v) for v in qv]
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								        vector_column_name = f"q_{len(embedding_data)}_vec"
 								        return MatchDenseExpr(vector_column_name, embedding_data, 'float', 'cosine', topk, {"similarity": similarity})
 								    def get_filters(self, req):
 								        condition = dict()
 								        for key, field in {"kb_ids": "kb_id", "doc_ids": "doc_id"}.items():
 								            if key in req and req[key] is not None:
 								                condition[field] = req[key]
 								        # TODO(yzc): `available_int` is nullable however infinity doesn't support nullable columns.
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
+								        for key in ["knowledge_graph_kwd", "available_int", "entity_kwd", "from_entity_kwd", "to_entity_kwd",
 								                    "removed_kwd"]:
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								            if key in req and req[key] is not None:
 								                condition[key] = req[key]
 								        return condition
-												Refa: async retrieval process. (#12629)

### Type of change

- [x] Refactoring
- [x] Performance Improvement
											
										
										
											2026-01-15 12:28:49 +08:00
+								    async def search(self, req, idx_names: str | list[str],
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								               kb_ids: list[str],
 								               emb_mdl=None,
-												Fix:ERROR 20 Method rag.nlp.search.Dealer.search() parameter highlight="None" violates type hint bool | list, as <class "builtins.NoneType"> "None" not list or bool. (#10743)

### What problem does this PR solve?

https://github.com/infiniflow/ragflow/issues/10733

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-10-27 09:29:39 +08:00
+								               highlight: bool | list | None = None,
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								               rank_feature: dict | None = None
 								               ):
-												Fix:ERROR 20 Method rag.nlp.search.Dealer.search() parameter highlight="None" violates type hint bool | list, as <class "builtins.NoneType"> "None" not list or bool. (#10743)

### What problem does this PR solve?

https://github.com/infiniflow/ragflow/issues/10733

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-10-27 09:29:39 +08:00
+								        if highlight is None:
 								            highlight = False
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								        filters = self.get_filters(req)
 								        orderBy = OrderByExpr()
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
-												add llm API (#19)

* add llm API

* refine llm API
											
										
										
											2023-12-28 13:50:13 +08:00
+								        pg = int(req.get("page", 1)) - 1
-												add use layout or not option (#145)

* add use layout or not option

* trivial
											
										
										
											2024-03-22 19:21:09 +08:00
+								        topk = int(req.get("topk", 1024))
-												add rerank model (#969)

### What problem does this PR solve?

feat: add rerank models to the project #724 #162

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-05-29 16:50:02 +08:00
+								        ps = int(req.get("size", topk))
-												Fetch chunk by batches. (#4177)

### What problem does this PR solve?

#4173

### Type of change

- [x] Performance Improvement
											
										
										
											2024-12-23 12:12:15 +08:00
+								        offset, limit = pg * ps, ps
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								        src = req.get("fields",
 								                      ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int",
-												Feat: Refact pipeline (#13826)

### What problem does this PR solve?

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring

---------

Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
											
										
										
											2026-04-03 19:26:45 +08:00
+								                       "doc_id", "chunk_order_int", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd",
-												Feat: add image preview to retrieval test. (#7610)

### What problem does this PR solve?

#7608

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-05-13 14:30:36 +08:00
+								                       "question_kwd", "question_tks", "doc_type_kwd",
-												Get ROW_ID from search() in Infinity (#13901)

### What problem does this PR solve?

1. Search() in Infinity can return row_id now

2.  To Get ROW_ID from search(), refer to handling of retrieval_test.

example
```
$ curl -s -X POST "http://localhost:$PORT/v1/chunk/retrieval_test" -H "Authorization: $TOKEN" -H "Content-Type: application/json" -d '{"kb_id": "4fcd01582ca911f1954184ba59049aa3", "question": "曹操"}'
```


### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-04-02 18:56:43 +08:00
+								                       "available_int", "content_with_weight", "mom_id", PAGERANK_FLD, TAG_FLD, "row_id()"])
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								        kwds = set([])
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								        qst = req.get("question", "")
 								        q_vec = []
-												add llm API (#19)

* add llm API

* refine llm API
											
										
										
											2023-12-28 13:50:13 +08:00
+								        if not qst:
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								            if req.get("sort"):
-												Feat: Refact pipeline (#13826)

### What problem does this PR solve?

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring

---------

Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
											
										
										
											2026-04-03 19:26:45 +08:00
+								                orderBy.asc("chunk_order_int")
-												Rename page_num_list, top_list, position_list (#3940)

### What problem does this PR solve?

Rename page_num_list, top_list, position_list to page_num_int, top_int,
position_int

### Type of change

- [x] Refactoring
											
										
										
											2024-12-10 16:32:58 +08:00
+								                orderBy.asc("page_num_int")
 								                orderBy.asc("top_int")
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								                orderBy.desc("create_timestamp_flt")
 								            res = self.dataStore.search(src, [], filters, [], orderBy, offset, limit, idx_names, kb_ids)
-												Refactor function name (#11210)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-12 19:00:15 +08:00
+								            total = self.dataStore.get_total(res)
-												Use consistent log file names, introduced initLogger (#3403)

### What problem does this PR solve?

Use consistent log file names, introduced initLogger

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
											
										
										
											2024-11-14 17:13:48 +08:00
+								            logging.debug("Dealer.search TOTAL: {}".format(total))
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								        else:
-												Fix: search highlight. (#10616)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-10-16 18:45:43 +08:00
+								            highlightFields = ["content_ltks", "title_tks"]
 								            if not highlight:
 								                highlightFields = []
 								            elif isinstance(highlight, list):
 								                highlightFields = highlight
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								            matchText, keywords = self.qryr.question(qst, min_match=0.3)
 								            if emb_mdl is None:
 								                matchExprs = [matchText]
-												Refa: asyncio.to_thread to ThreadPoolExecutor to break thread limitat… (#12716)

### Type of change

- [x] Refactoring
											
										
										
											2026-01-20 13:29:37 +08:00
+								                res = await thread_pool_exec(self.dataStore.search, src, highlightFields, filters, matchExprs, orderBy, offset, limit,
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								                                            idx_names, kb_ids, rank_feature=rank_feature)
-												Refactor function name (#11210)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-12 19:00:15 +08:00
+								                total = self.dataStore.get_total(res)
-												Use consistent log file names, introduced initLogger (#3403)

### What problem does this PR solve?

Use consistent log file names, introduced initLogger

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
											
										
										
											2024-11-14 17:13:48 +08:00
+								                logging.debug("Dealer.search TOTAL: {}".format(total))
-												fix table desc bugs, add positions to chunks (#91)


											
										
										
											2024-03-04 14:42:26 +08:00
+								            else:
-												Refa: async retrieval process. (#12629)

### Type of change

- [x] Refactoring
- [x] Performance Improvement
											
										
										
											2026-01-15 12:28:49 +08:00
+								                matchDense = await self.get_vector(qst, emb_mdl, topk, req.get("similarity", 0.1))
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								                q_vec = matchDense.embedding_data
-												Use Infinity single-field-multi-index (#11444)

### What problem does this PR solve?

Use Infinity single-field-multi-index

### Type of change

- [x] Refactoring
- [x] Performance Improvement
											
										
										
											2025-11-26 11:06:37 +08:00
+								                if not settings.DOC_ENGINE_INFINITY:
 								                    src.append(f"q_{len(q_vec)}_vec")
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
-												Added infinity rank_feature support (#9044)

### What problem does this PR solve?

Added infinity rank_feature support

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-07-29 09:14:23 +08:00
+								                fusionExpr = FusionExpr("weighted_sum", topk, {"weights": "0.05,0.95"})
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								                matchExprs = [matchText, matchDense, fusionExpr]
-												Refa: asyncio.to_thread to ThreadPoolExecutor to break thread limitat… (#12716)

### Type of change

- [x] Refactoring
											
										
										
											2026-01-20 13:29:37 +08:00
+								                res = await thread_pool_exec(self.dataStore.search, src, highlightFields, filters, matchExprs, orderBy, offset, limit,
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								                                            idx_names, kb_ids, rank_feature=rank_feature)
-												Refactor function name (#11210)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-12 19:00:15 +08:00
+								                total = self.dataStore.get_total(res)
-												Use consistent log file names, introduced initLogger (#3403)

### What problem does this PR solve?

Use consistent log file names, introduced initLogger

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
											
										
										
											2024-11-14 17:13:48 +08:00
+								                logging.debug("Dealer.search TOTAL: {}".format(total))
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
 								                # If result is empty, try again with lower min_match
 								                if total == 0:
-												Fix: uploading in chat box issue. (#6547)

### What problem does this PR solve?

#6228

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-03-26 15:37:48 +08:00
+								                    if filters.get("doc_id"):
-												Refa: asyncio.to_thread to ThreadPoolExecutor to break thread limitat… (#12716)

### Type of change

- [x] Refactoring
											
										
										
											2026-01-20 13:29:37 +08:00
+								                        res = await thread_pool_exec(self.dataStore.search, src, [], filters, [], orderBy, offset, limit, idx_names, kb_ids)
-												Refactor function name (#11210)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-12 19:00:15 +08:00
+								                        total = self.dataStore.get_total(res)
-												Fix: uploading in chat box issue. (#6547)

### What problem does this PR solve?

#6228

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-03-26 15:37:48 +08:00
+								                    else:
 								                        matchText, _ = self.qryr.question(qst, min_match=0.1)
 								                        matchDense.extra_options["similarity"] = 0.17
-												Refa: asyncio.to_thread to ThreadPoolExecutor to break thread limitat… (#12716)

### Type of change

- [x] Refactoring
											
										
										
											2026-01-20 13:29:37 +08:00
+								                        res = await thread_pool_exec(self.dataStore.search, src, highlightFields, filters, [matchText, matchDense, fusionExpr],
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
+								                                                    orderBy, offset, limit, idx_names, kb_ids,
 								                                                    rank_feature=rank_feature)
-												Refactor function name (#11210)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-12 19:00:15 +08:00
+								                        total = self.dataStore.get_total(res)
-												Refactor:Improve the logic in search.py (#8716)

### What problem does this PR solve?

1. Remove the useless pop logic, since the condition has already been
checked in the `if` statement
2. Merge the logging logic

### Type of change

- [x] Refactoring
											
										
										
											2025-07-08 12:32:01 +08:00
+								                    logging.debug("Dealer.search 2 TOTAL: {}".format(total))
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery to delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
 								            for k in keywords:
 								                kwds.add(k)
-												Edit chunk shall update instead of insert it (#3709)

### What problem does this PR solve?

Editing a chunk shall update it instead of inserting a new one. Close #3679

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-11-28 13:00:38 +08:00
+								                for kk in rag_tokenizer.fine_grained_tokenize(k).split():
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery to delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								                    if len(kk) < 2:
 								                        continue
 								                    if kk in kwds:
 								                        continue
 								                    kwds.add(kk)
-												Use consistent log file names, introduced initLogger (#3403)

### What problem does this PR solve?

Use consistent log file names, introduced initLogger

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
											
										
										
											2024-11-14 17:13:48 +08:00
+								        logging.debug(f"TOTAL: {total}")
-												Feat: message manage (#12196)

### What problem does this PR solve?

Manage messages and use them in agents.

Issue #4213 

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-25 21:18:13 +08:00
+								        ids = self.dataStore.get_doc_ids(res)
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								        keywords = list(kwds)
-												Refactor function name (#11210)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-12 19:00:15 +08:00
+								        highlight = self.dataStore.get_highlight(res, keywords, "content_with_weight")
 								        aggs = self.dataStore.get_aggregation(res, "docnm_kwd")
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
+								        return self.SearchResult(
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery to delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								            total=total,
 								            ids=ids,
-												add llm API (#19)

* add llm API

* refine llm API
											
										
										
											2023-12-28 13:50:13 +08:00
+								            query_vector=q_vec,
 								            aggregation=aggs,
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery to delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								            highlight=highlight,
-												Refactor function name (#11210)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-12 19:00:15 +08:00
+								            field=self.dataStore.get_fields(res, src + ["_score"]),
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery to delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								            keywords=keywords
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
+								        )
 								    @staticmethod
 								    def trans2floats(txt):
-												Fix: float transfer exception. (#6197)

### What problem does this PR solve?

#6177

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-03-18 11:13:44 +08:00
+								        return [get_float(t) for t in txt.split("\t")]
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
-												Test chat API and refine ppt chunker (#42)


											
										
										
											2024-01-23 19:45:36 +08:00
+								    def insert_citations(self, answer, chunks, chunk_v,
-												Add 'One' chunk method (#137)


											
										
										
											2024-03-20 18:57:22 +08:00
+								                         embd_mdl, tkweight=0.1, vtweight=0.9):
-												refine admin initialization (#75)


											
										
										
											2024-02-27 14:57:34 +08:00
+								        assert len(chunks) == len(chunk_v)
-												debug backend API for TAB 'search' (#2389)

### What problem does this PR solve?
#2247

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-09-12 17:51:20 +08:00
+								        if not chunks:
 								            return answer, set([])
-												refine for English corpus (#135)


											
										
										
											2024-03-20 16:56:16 +08:00
+								        pieces = re.split(r"(```)", answer)
 								        if len(pieces) >= 3:
 								            i = 0
 								            pieces_ = []
 								            while i < len(pieces):
 								                if pieces[i] == "```":
 								                    st = i
 								                    i += 1
-												apply pep8 formalize (#155)


											
										
										
											2024-03-27 11:33:46 +08:00
+								                    while i < len(pieces) and pieces[i] != "```":
-												refine for English corpus (#135)


											
										
										
											2024-03-20 16:56:16 +08:00
+								                        i += 1
-												apply pep8 formalize (#155)


											
										
										
											2024-03-27 11:33:46 +08:00
+								                    if i < len(pieces):
 								                        i += 1
 								                    pieces_.append("".join(pieces[st: i]) + "\n")
-												refine for English corpus (#135)


											
										
										
											2024-03-20 16:56:16 +08:00
+								                else:
-												Feature rtl support (#13118)

### What problem does this PR solve?

This PR adds comprehensive **Right-to-Left (RTL) language support**,
primarily targeting Arabic and other RTL scripts (Hebrew, Persian, Urdu,
etc.).

Previously, RTL content had multiple rendering issues:

- Incorrect sentence splitting for Arabic punctuation in citation logic
- Misaligned text in chat messages and markdown components  
- Improper positioning of blockquotes and “think” sections  
- Incorrect table alignment  
- Citation placement ambiguity in RTL prompts  
- UI layout inconsistencies when mixing LTR and RTL text  

This PR introduces backend and frontend improvements to properly detect,
render, and style RTL content while preserving existing LTR behavior.

#### Backend
- Updated sentence boundary regex in `rag/nlp/search.py` to include
Arabic punctuation:
  - `،` (comma)
  - `؛` (semicolon)
  - `؟` (question mark)
  - `۔` (Arabic full stop)
- Ensures citation insertion works correctly in RTL sentences.
- Updated citation prompt instructions to clarify citation placement
rules for RTL languages.

#### Frontend
- Introduced a new utility: `text-direction.ts`
  - Detects text direction based on Unicode ranges.
  - Supports Arabic, Hebrew, Syriac, Thaana, and related scripts.
  - Provides `getDirAttribute()` for automatic `dir` assignment.

- Applied dynamic `dir` attributes across:
  - Markdown rendering
  - Chat messages
  - Search results
  - Tables
  - Hover cards and reference popovers

- Added proper RTL styling in LESS:
  - Text alignment adjustments
  - Blockquote border flipping
  - Section indentation correction
  - Table direction switching
  - Use of `<bdi>` for figure labels to prevent bidirectional conflicts

#### DevOps / Environment
- Added Windows backend launch script with retry handling.
- Updated dependency metadata.
- Adjusted development-only React debugging behavior.

---

### Type of change

- [x] Bug Fix (non-breaking change which fixes RTL rendering and
citation issues)
- [x] New Feature (non-breaking change which adds RTL detection and
dynamic direction handling)

---------

Co-authored-by: 6ba3i <isbaaoui09@gmail.com>
Co-authored-by: Ahmad Intisar <ahmadintisar@Ahmads-MacBook-M4-Pro.local>
Co-authored-by: Ahmad Intisar <168020872+ahmadintisar@users.noreply.github.com>
Co-authored-by: Liu An <asiro@qq.com>
											
										
										
											2026-03-02 08:03:44 +03:00
+								                    # Sentence boundary regex includes Arabic punctuation (، ؛ ؟ ۔)
-												apply pep8 formalize (#155)


											
										
										
											2024-03-27 11:33:46 +08:00
+								                    pieces_.extend(
 								                        re.split(
-												Feature rtl support (#13118)

### What problem does this PR solve?

This PR adds comprehensive **Right-to-Left (RTL) language support**,
primarily targeting Arabic and other RTL scripts (Hebrew, Persian, Urdu,
etc.).

Previously, RTL content had multiple rendering issues:

- Incorrect sentence splitting for Arabic punctuation in citation logic
- Misaligned text in chat messages and markdown components  
- Improper positioning of blockquotes and “think” sections  
- Incorrect table alignment  
- Citation placement ambiguity in RTL prompts  
- UI layout inconsistencies when mixing LTR and RTL text  

This PR introduces backend and frontend improvements to properly detect,
render, and style RTL content while preserving existing LTR behavior.

#### Backend
- Updated sentence boundary regex in `rag/nlp/search.py` to include
Arabic punctuation:
  - `،` (comma)
  - `؛` (semicolon)
  - `؟` (question mark)
  - `۔` (Arabic full stop)
- Ensures citation insertion works correctly in RTL sentences.
- Updated citation prompt instructions to clarify citation placement
rules for RTL languages.

#### Frontend
- Introduced a new utility: `text-direction.ts`
  - Detects text direction based on Unicode ranges.
  - Supports Arabic, Hebrew, Syriac, Thaana, and related scripts.
  - Provides `getDirAttribute()` for automatic `dir` assignment.

- Applied dynamic `dir` attributes across:
  - Markdown rendering
  - Chat messages
  - Search results
  - Tables
  - Hover cards and reference popovers

- Added proper RTL styling in LESS:
  - Text alignment adjustments
  - Blockquote border flipping
  - Section indentation correction
  - Table direction switching
  - Use of `<bdi>` for figure labels to prevent bidirectional conflicts

#### DevOps / Environment
- Added Windows backend launch script with retry handling.
- Updated dependency metadata.
- Adjusted development-only React debugging behavior.

---

### Type of change

- [x] Bug Fix (non-breaking change which fixes RTL rendering and
citation issues)
- [x] New Feature (non-breaking change which adds RTL detection and
dynamic direction handling)

---------

Co-authored-by: 6ba3i <isbaaoui09@gmail.com>
Co-authored-by: Ahmad Intisar <ahmadintisar@Ahmads-MacBook-M4-Pro.local>
Co-authored-by: Ahmad Intisar <168020872+ahmadintisar@users.noreply.github.com>
Co-authored-by: Liu An <asiro@qq.com>
											
										
										
											2026-03-02 08:03:44 +03:00
+								                            r"([^\|][；。？!！،؛؟۔\n]|[a-z\u0600-\u06FF][.?;!،؛؟][ \n])",
-												apply pep8 formalize (#155)


											
										
										
											2024-03-27 11:33:46 +08:00
+								                            pieces[i]))
-												refine for English corpus (#135)


											
										
										
											2024-03-20 16:56:16 +08:00
+								                    i += 1
 								            pieces = pieces_
 								        else:
-												Feature rtl support (#13118)

### What problem does this PR solve?

This PR adds comprehensive **Right-to-Left (RTL) language support**,
primarily targeting Arabic and other RTL scripts (Hebrew, Persian, Urdu,
etc.).

Previously, RTL content had multiple rendering issues:

- Incorrect sentence splitting for Arabic punctuation in citation logic
- Misaligned text in chat messages and markdown components  
- Improper positioning of blockquotes and “think” sections  
- Incorrect table alignment  
- Citation placement ambiguity in RTL prompts  
- UI layout inconsistencies when mixing LTR and RTL text  

This PR introduces backend and frontend improvements to properly detect,
render, and style RTL content while preserving existing LTR behavior.

#### Backend
- Updated sentence boundary regex in `rag/nlp/search.py` to include
Arabic punctuation:
  - `،` (comma)
  - `؛` (semicolon)
  - `؟` (question mark)
  - `۔` (Arabic full stop)
- Ensures citation insertion works correctly in RTL sentences.
- Updated citation prompt instructions to clarify citation placement
rules for RTL languages.

#### Frontend
- Introduced a new utility: `text-direction.ts`
  - Detects text direction based on Unicode ranges.
  - Supports Arabic, Hebrew, Syriac, Thaana, and related scripts.
  - Provides `getDirAttribute()` for automatic `dir` assignment.

- Applied dynamic `dir` attributes across:
  - Markdown rendering
  - Chat messages
  - Search results
  - Tables
  - Hover cards and reference popovers

- Added proper RTL styling in LESS:
  - Text alignment adjustments
  - Blockquote border flipping
  - Section indentation correction
  - Table direction switching
  - Use of `<bdi>` for figure labels to prevent bidirectional conflicts

#### DevOps / Environment
- Added Windows backend launch script with retry handling.
- Updated dependency metadata.
- Adjusted development-only React debugging behavior.

---

### Type of change

- [x] Bug Fix (non-breaking change which fixes RTL rendering and
citation issues)
- [x] New Feature (non-breaking change which adds RTL detection and
dynamic direction handling)

---------

Co-authored-by: 6ba3i <isbaaoui09@gmail.com>
Co-authored-by: Ahmad Intisar <ahmadintisar@Ahmads-MacBook-M4-Pro.local>
Co-authored-by: Ahmad Intisar <168020872+ahmadintisar@users.noreply.github.com>
Co-authored-by: Liu An <asiro@qq.com>
											
										
										
											2026-03-02 08:03:44 +03:00
+								            # Sentence boundary regex includes Arabic punctuation (، ؛ ؟ ۔)
 								            pieces = re.split(r"([^\|][；。？!！،؛؟۔\n]|[a-z\u0600-\u06FF][.?;!،؛؟][ \n])", answer)
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								        for i in range(1, len(pieces)):
-												Feature rtl support (#13118)

### What problem does this PR solve?

This PR adds comprehensive **Right-to-Left (RTL) language support**,
primarily targeting Arabic and other RTL scripts (Hebrew, Persian, Urdu,
etc.).

Previously, RTL content had multiple rendering issues:

- Incorrect sentence splitting for Arabic punctuation in citation logic
- Misaligned text in chat messages and markdown components  
- Improper positioning of blockquotes and “think” sections  
- Incorrect table alignment  
- Citation placement ambiguity in RTL prompts  
- UI layout inconsistencies when mixing LTR and RTL text  

This PR introduces backend and frontend improvements to properly detect,
render, and style RTL content while preserving existing LTR behavior.

#### Backend
- Updated sentence boundary regex in `rag/nlp/search.py` to include
Arabic punctuation:
  - `،` (comma)
  - `؛` (semicolon)
  - `؟` (question mark)
  - `۔` (Arabic full stop)
- Ensures citation insertion works correctly in RTL sentences.
- Updated citation prompt instructions to clarify citation placement
rules for RTL languages.

#### Frontend
- Introduced a new utility: `text-direction.ts`
  - Detects text direction based on Unicode ranges.
  - Supports Arabic, Hebrew, Syriac, Thaana, and related scripts.
  - Provides `getDirAttribute()` for automatic `dir` assignment.

- Applied dynamic `dir` attributes across:
  - Markdown rendering
  - Chat messages
  - Search results
  - Tables
  - Hover cards and reference popovers

- Added proper RTL styling in LESS:
  - Text alignment adjustments
  - Blockquote border flipping
  - Section indentation correction
  - Table direction switching
  - Use of `<bdi>` for figure labels to prevent bidirectional conflicts

#### DevOps / Environment
- Added Windows backend launch script with retry handling.
- Updated dependency metadata.
- Adjusted development-only React debugging behavior.

---

### Type of change

- [x] Bug Fix (non-breaking change which fixes RTL rendering and
citation issues)
- [x] New Feature (non-breaking change which adds RTL detection and
dynamic direction handling)

---------

Co-authored-by: 6ba3i <isbaaoui09@gmail.com>
Co-authored-by: Ahmad Intisar <ahmadintisar@Ahmads-MacBook-M4-Pro.local>
Co-authored-by: Ahmad Intisar <168020872+ahmadintisar@users.noreply.github.com>
Co-authored-by: Liu An <asiro@qq.com>
											
										
										
											2026-03-02 08:03:44 +03:00
+								            if re.match(r"([^\|][；。？!！،؛؟۔\n]|[a-z\u0600-\u06FF][.?;!،؛؟][ \n])", pieces[i]):
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								                pieces[i - 1] += pieces[i][0]
 								                pieces[i] = pieces[i][1:]
 								        idx = []
 								        pieces_ = []
 								        for i, t in enumerate(pieces):
-												Test chat API and refine ppt chunker (#42)


											
										
										
											2024-01-23 19:45:36 +08:00
+								            if len(t) < 5:
 								                continue
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								            idx.append(i)
 								            pieces_.append(t)
-												Use consistent log file names, introduced initLogger (#3403)

### What problem does this PR solve?

Use consistent log file names, introduced initLogger

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
											
										
										
											2024-11-14 17:13:48 +08:00
+								        logging.debug("{} => {}".format(answer, pieces_))
-												Test chat API and refine ppt chunker (#42)


											
										
										
											2024-01-23 19:45:36 +08:00
+								        if not pieces_:
-												fix gb2312 encoding issue (#394)

### What problem does this PR solve?

Issue link:#384
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-04-16 19:45:14 +08:00
+								            return answer, set([])
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
-												Test chat API and refine ppt chunker (#42)


											
										
										
											2024-01-23 19:45:36 +08:00
+								        ans_v, _ = embd_mdl.encode(pieces_)
-												Light GraphRAG (#4585)

### What problem does this PR solve?

#4543

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-22 19:43:14 +08:00
+								        for i in range(len(chunk_v)):
 								            if len(ans_v[0]) != len(chunk_v[i]):
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
+								                chunk_v[i] = [0.0] * len(ans_v[0])
 								                logging.warning(
 								                    "The dimension of query and chunk do not match: {} vs. {}".format(len(ans_v[0]), len(chunk_v[i])))
-												Light GraphRAG (#4585)

### What problem does this PR solve?

#4543

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-22 19:43:14 +08:00
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								        assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								            len(ans_v[0]), len(chunk_v[0]))
-												add llm API (#19)

* add llm API

* refine llm API
											
										
										
											2023-12-28 13:50:13 +08:00
-												Edit chunk shall update instead of insert it (#3709)

### What problem does this PR solve?

Editing a chunk shall update it instead of inserting a new one. Close #3679

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-11-28 13:00:38 +08:00
+								        chunks_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(ck)).split()
-												apply pep8 formalize (#155)


											
										
										
											2024-03-27 11:33:46 +08:00
+								                      for ck in chunks]
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								        cites = {}
-												refine citation (#161)


											
										
										
											2024-03-28 11:45:50 +08:00
+								        thr = 0.63
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								        while thr > 0.3 and len(cites.keys()) == 0 and pieces_ and chunks_tks:
-												refine citation (#161)


											
										
										
											2024-03-28 11:45:50 +08:00
+								            for i, a in enumerate(pieces_):
 								                sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
 								                                                                chunk_v,
-												refine code (#595)

### What problem does this PR solve?

### Type of change

- [x] Refactoring
											
										
										
											2024-04-28 19:13:33 +08:00
+								                                                                rag_tokenizer.tokenize(
-												Edit chunk shall update instead of insert it (#3709)

### What problem does this PR solve?

Edit chunk shall update instead of insert it. Close #3679 

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-11-28 13:00:38 +08:00
+								                                                                    self.qryr.rmWWW(pieces_[i])).split(),
-												refine citation (#161)


											
										
										
											2024-03-28 11:45:50 +08:00
+								                                                                chunks_tks,
 								                                                                tkweight, vtweight)
 								                mx = np.max(sim) * 0.99
-												Use consistent log file names, introduced initLogger (#3403)

### What problem does this PR solve?

Use consistent log file names, introduced initLogger

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [x] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
											
										
										
											2024-11-14 17:13:48 +08:00
+								                logging.debug("{} SIM: {}".format(pieces_[i], mx))
-												refine citation (#161)


											
										
										
											2024-03-28 11:45:50 +08:00
+								                if mx < thr:
 								                    continue
 								                cites[idx[i]] = list(
 								                    set([str(ii) for ii in range(len(chunk_v)) if sim[ii] > mx]))[:4]
 								            thr *= 0.8
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								        res = ""
-												deal with stop reason being length problem (#109)


											
										
										
											2024-03-07 16:12:01 +08:00
+								        seted = set([])
-												Test chat API and refine ppt chunker (#42)


											
										
										
											2024-01-23 19:45:36 +08:00
+								        for i, p in enumerate(pieces):
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								            res += p
-												Test chat API and refine ppt chunker (#42)


											
										
										
											2024-01-23 19:45:36 +08:00
+								            if i not in idx:
 								                continue
 								            if i not in cites:
 								                continue
-												deal with stop reason being length problem (#109)


											
										
										
											2024-03-07 16:12:01 +08:00
+								            for c in cites[i]:
-												apply pep8 formalize (#155)


											
										
										
											2024-03-27 11:33:46 +08:00
+								                assert int(c) < len(chunk_v)
 								            for c in cites[i]:
 								                if c in seted:
 								                    continue
-												Refa: change citation mark as [ID:n] (#7923)

### What problem does this PR solve?

Change citation mark as [ID:n], it's easier for LLMs to follow the
instruction :) #7904

### Type of change

- [x] Refactoring
											
										
										
											2025-05-29 10:03:51 +08:00
+								                res += f" [ID:{c}]"
-												deal with stop reason being length problem (#109)


											
										
										
											2024-03-07 16:12:01 +08:00
+								                seted.add(c)
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
-												refine presentation parser (#110)


											
										
										
											2024-03-07 17:21:38 +08:00
+								        return res, seted
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								    def _rank_feature_scores(self, query_rfea, search_res):
 								        ## For rank feature(tag_fea) scores.
 								        rank_fea = []
 								        pageranks = []
 								        for chunk_id in search_res.ids:
 								            pageranks.append(search_res.field[chunk_id].get(PAGERANK_FLD, 0))
 								        pageranks = np.array(pageranks, dtype=float)
 								        if not query_rfea:
 								            return np.array([0 for _ in range(len(search_res.ids))]) + pageranks
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
+								        q_denor = np.sqrt(np.sum([s * s for t, s in query_rfea.items() if t != PAGERANK_FLD]))
-												Fix tag_feas code injection in retrieval ranking (#13923)

## Summary
- remove eval-based parsing from retrieval rank feature scoring
- validate `tag_feas` at write time in chunk APIs and SDK routes
- add regression tests for safe parsing and malicious payload rejection

## Details
`tag_feas` is intended to be structured rank-feature data, but the
retrieval ranking path was evaluating stored values as Python
expressions. This change treats `tag_feas` strictly as data.

### What changed
- replace `eval()` in `rag/nlp/search.py` with safe parsing via
`json.loads()` and optional `ast.literal_eval()` compatibility for
legacy Python-dict strings
- strictly filter parsed values down to `dict[str, finite number]`
- reject invalid `tag_feas` payloads at write time in web chunk routes
and SDK document chunk routes
- add focused regression tests to prove executable strings are ignored
and invalid payloads are rejected

## Validation
- `python -m pytest test/unit_test/common/test_tag_feature_utils.py
test/unit_test/rag/test_rank_feature_scores.py -q`

---------

Co-authored-by: unknown <zhenglinkai@CCN.Local>
Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
											
										
										
											2026-04-15 16:31:11 +08:00
+								        if q_denor == 0:
 								            return np.array([0 for _ in range(len(search_res.ids))]) + pageranks
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								        for i in search_res.ids:
 								            nor, denor = 0, 0
-												Fix: empty tag field issue. (#6103)

### What problem does this PR solve?

#6102

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-03-14 17:35:57 +08:00
+								            if not search_res.field[i].get(TAG_FLD):
-												Fix: rank feat issue. (#6225)

### What problem does this PR solve?


### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-03-18 16:07:29 +08:00
+								                rank_fea.append(0)
-												Fix: empty tag field issue. (#6103)

### What problem does this PR solve?

#6102

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-03-14 17:35:57 +08:00
+								                continue
-												Fix tag_feas code injection in retrieval ranking (#13923)

## Summary
- remove eval-based parsing from retrieval rank feature scoring
- validate `tag_feas` at write time in chunk APIs and SDK routes
- add regression tests for safe parsing and malicious payload rejection

## Details
`tag_feas` is intended to be structured rank-feature data, but the
retrieval ranking path was evaluating stored values as Python
expressions. This change treats `tag_feas` strictly as data.

### What changed
- replace `eval()` in `rag/nlp/search.py` with safe parsing via
`json.loads()` and optional `ast.literal_eval()` compatibility for
legacy Python-dict strings
- strictly filter parsed values down to `dict[str, finite number]`
- reject invalid `tag_feas` payloads at write time in web chunk routes
and SDK document chunk routes
- add focused regression tests to prove executable strings are ignored
and invalid payloads are rejected

## Validation
- `python -m pytest test/unit_test/common/test_tag_feature_utils.py
test/unit_test/rag/test_rank_feature_scores.py -q`

---------

Co-authored-by: unknown <zhenglinkai@CCN.Local>
Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
											
										
										
											2026-04-15 16:31:11 +08:00
+								            tag_feas = parse_tag_features(search_res.field[i].get(TAG_FLD), allow_json_string=True, allow_python_literal=True)
 								            if not tag_feas:
 								                rank_fea.append(0)
 								                continue
 								            for t, sc in tag_feas.items():
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								                if t in query_rfea:
 								                    nor += query_rfea[t] * sc
 								                denor += sc * sc
 								            if denor == 0:
 								                rank_fea.append(0)
 								            else:
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
+								                rank_fea.append(nor / np.sqrt(denor) / q_denor)
 								        return np.array(rank_fea) * 10. + pageranks
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
-												Test chat API and refine ppt chunker (#42)


											
										
										
											2024-01-23 19:45:36 +08:00
+								    def rerank(self, sres, query, tkweight=0.3,
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								               vtweight=0.7, cfield="content_ltks",
 								               rank_feature: dict | None = None
 								               ):
-												refine for English corpus (#135)


											
										
										
											2024-03-20 16:56:16 +08:00
+								        _, keywords = self.qryr.question(query)
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								        vector_size = len(sres.query_vector)
 								        vector_column = f"q_{vector_size}_vec"
 								        zero_vector = [0.0] * vector_size
 								        ins_embd = []
 								        for chunk_id in sres.ids:
 								            vector = sres.field[chunk_id].get(vector_column, zero_vector)
 								            if isinstance(vector, str):
-												Fix: float transfer exception. (#6197)

### What problem does this PR solve?

#6177

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-03-18 11:13:44 +08:00
+								                vector = [get_float(v) for v in vector.split("\t")]
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								            ins_embd.append(vector)
-												add llm API (#19)

* add llm API

* refine llm API
											
										
										
											2023-12-28 13:50:13 +08:00
+								        if not ins_embd:
-												Test APIs and fix bugs (#41)


											
										
										
											2024-01-22 19:51:38 +08:00
+								            return [], [], []
-												Fit a lot of encodings for text file. (#458)

### What problem does this PR solve?

#384

### Type of change

- [x] Performance Improvement
											
										
										
											2024-04-19 18:02:53 +08:00
 								        for i in sres.ids:
 								            if isinstance(sres.field[i].get("important_kwd", []), str):
 								                sres.field[i]["important_kwd"] = [sres.field[i]["important_kwd"]]
 								        ins_tw = []
 								        for i in sres.ids:
-												Refa: token similarity calculations. (#6614)

### What problem does this PR solve?

#6507

### Type of change

- [x] Performance Improvement
											
										
										
											2025-03-28 09:33:08 +08:00
+								            content_ltks = list(OrderedDict.fromkeys(sres.field[i][cfield].split()))
-												Edit chunk shall update instead of insert it (#3709)

### What problem does this PR solve?

Edit chunk shall update instead of insert it. Close #3679 

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-11-28 13:00:38 +08:00
+								            title_tks = [t for t in sres.field[i].get("title_tks", "").split() if t]
-												Feat: Add question parameter to edit chunk modal (#3875)

### What problem does this PR solve?

Close #3873

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-12-05 14:51:19 +08:00
+								            question_tks = [t for t in sres.field[i].get("question_tks", "").split() if t]
-												Fit a lot of encodings for text file. (#458)

### What problem does this PR solve?

#384

### Type of change

- [x] Performance Improvement
											
										
										
											2024-04-19 18:02:53 +08:00
+								            important_kwd = sres.field[i].get("important_kwd", [])
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								            tks = content_ltks + title_tks * 2 + important_kwd * 5 + question_tks * 6
-												Fit a lot of encodings for text file. (#458)

### What problem does this PR solve?

#384

### Type of change

- [x] Performance Improvement
											
										
										
											2024-04-19 18:02:53 +08:00
+								            ins_tw.append(tks)
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								        ## For rank feature(tag_fea) scores.
 								        rank_fea = self._rank_feature_scores(rank_feature, sres)
-												add dialog api (#33)


											
										
										
											2024-01-17 20:20:42 +08:00
+								        sim, tksim, vtsim = self.qryr.hybrid_similarity(sres.query_vector,
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								                                                        ins_embd,
-												refine for English corpus (#135)


											
										
										
											2024-03-20 16:56:16 +08:00
+								                                                        keywords,
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								                                                        ins_tw, tkweight, vtweight)
-												Add pagerank to KB. (#3809)

### What problem does this PR solve?

#3794

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-12-03 14:30:35 +08:00
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								        return sim + rank_fea, tksim, vtsim
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
-												add rerank model (#969)

### What problem does this PR solve?

feat: add rerank models to the project #724 #162

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-05-29 16:50:02 +08:00
+								    def rerank_by_model(self, rerank_mdl, sres, query, tkweight=0.3,
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								                        vtweight=0.7, cfield="content_ltks",
 								                        rank_feature: dict | None = None):
-												add rerank model (#969)

### What problem does this PR solve?

feat: add rerank models to the project #724 #162

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-05-29 16:50:02 +08:00
+								        _, keywords = self.qryr.question(query)
 								        for i in sres.ids:
 								            if isinstance(sres.field[i].get("important_kwd", []), str):
 								                sres.field[i]["important_kwd"] = [sres.field[i]["important_kwd"]]
 								        ins_tw = []
 								        for i in sres.ids:
-												Edit chunk shall update instead of insert it (#3709)

### What problem does this PR solve?

Edit chunk shall update instead of insert it. Close #3679 

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-11-28 13:00:38 +08:00
+								            content_ltks = sres.field[i][cfield].split()
 								            title_tks = [t for t in sres.field[i].get("title_tks", "").split() if t]
-												add rerank model (#969)

### What problem does this PR solve?

feat: add rerank models to the project #724 #162

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-05-29 16:50:02 +08:00
+								            important_kwd = sres.field[i].get("important_kwd", [])
 								            tks = content_ltks + title_tks + important_kwd
 								            ins_tw.append(tks)
 								        tksim = self.qryr.token_similarity(keywords, ins_tw)
-												Refactor: rename rmSpace to remove_redundant_spaces (#10796)

### What problem does this PR solve?

- rename rmSpace to remove_redundant_spaces
- move clean_markdown_block to common module
- add unit tests for remove_redundant_spaces and clean_markdown_block

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-10-28 09:46:32 +08:00
+								        vtsim, _ = rerank_mdl.similarity(query, [remove_redundant_spaces(" ".join(tks)) for tks in ins_tw])
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								        ## For rank feature(tag_fea) scores.
 								        rank_fea = self._rank_feature_scores(rank_feature, sres)
-												add rerank model (#969)

### What problem does this PR solve?

feat: add rerank models to the project #724 #162

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-05-29 16:50:02 +08:00
-												Fix: rerank algorithm (#11266)

### What problem does this PR solve?

Fix: rerank algorithm #11234

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-11-14 13:59:54 +08:00
+								        return tkweight * np.array(tksim) + vtweight * vtsim + rank_fea, tksim, vtsim
-												add rerank model (#969)

### What problem does this PR solve?

feat: add rerank models to the project #724 #162

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-05-29 16:50:02 +08:00
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								    def hybrid_similarity(self, ans_embd, ins_embd, ans, inst):
 								        return self.qryr.hybrid_similarity(ans_embd,
 								                                           ins_embd,
-												Edit chunk shall update instead of insert it (#3709)

### What problem does this PR solve?

Edit chunk shall update instead of insert it. Close #3679 

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2024-11-28 13:00:38 +08:00
+								                                           rag_tokenizer.tokenize(ans).split(),
 								                                           rag_tokenizer.tokenize(inst).split())
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
-												Refa: async retrieval process. (#12629)

### Type of change

- [x] Refactoring
- [x] Performance Improvement
											
										
										
											2026-01-15 12:28:49 +08:00
+								    async def retrieval(
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
+								            self,
 								            question,
 								            embd_mdl,
 								            tenant_ids,
 								            kb_ids,
 								            page,
 								            page_size,
 								            similarity_threshold=0.2,
 								            vector_similarity_weight=0.3,
 								            top=1024,
 								            doc_ids=None,
 								            aggs=True,
 								            rerank_mdl=None,
 								            highlight=False,
 								            rank_feature: dict | None = {PAGERANK_FLD: 10},
-												Fix: incorrect retrieval total count with pagination enabled (#11400)

### What problem does this PR solve?

Incorrect retrieval total count with pagination enabled.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-11-20 15:35:09 +08:00
+								    ):
-												Test APIs and fix bugs (#41)


											
										
										
											2024-01-22 19:51:38 +08:00
+								        ranks = {"total": 0, "chunks": [], "doc_aggs": {}}
-												Test chat API and refine ppt chunker (#42)


											
										
										
											2024-01-23 19:45:36 +08:00
+								        if not question:
 								            return ranks
-												search between multiple indiices for team function (#3079)

### What problem does this PR solve?

#2834 
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-10-29 13:19:01 +08:00
-												Fix: rerank overflow by enforcing top_k and 64 cap (#14084)

### What problem does this PR solve?

This fixes rerank overflow where retrieval could send more documents
than allowed (for example 66 when `page_size=6`), causing provider 400
errors and bypassing the user’s `top_k` intent in rerank-enabled paths.
this pr fixes #14081

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-04-14 10:47:25 +08:00
+								        # Keep the historical windowing strategy by default, but when an external
 								        # reranker is enabled cap candidate count by both top_k and provider-safe 64.
-												Fix: incorrect retrieval total count with pagination enabled (#11400)

### What problem does this PR solve?

Incorrect retrieval total count with pagination enabled.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-11-20 15:35:09 +08:00
+								        RERANK_LIMIT = math.ceil(64 / page_size) * page_size if page_size > 1 else 1
-												Feat: support tree structured deep-research policy. (#12559)

### What problem does this PR solve?

#12558
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2026-01-13 09:41:35 +08:00
+								        RERANK_LIMIT = max(30, RERANK_LIMIT)
-												Fix: rerank overflow by enforcing top_k and 64 cap (#14084)

### What problem does this PR solve?

This fixes rerank overflow where retrieval could send more documents
than allowed (for example 66 when `page_size=6`), causing provider 400
errors and bypassing the user’s `top_k` intent in rerank-enabled paths.
this pr fixes #14081

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-04-14 10:47:25 +08:00
+								        if rerank_mdl and top > 0:
 								            RERANK_LIMIT = min(RERANK_LIMIT, top, 64)
 								        page = max(page, 1)
 								        global_offset = (page - 1) * page_size
-												Fix: incorrect retrieval total count with pagination enabled (#11400)

### What problem does this PR solve?

Incorrect retrieval total count with pagination enabled.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-11-20 15:35:09 +08:00
+								        req = {
 								            "kb_ids": kb_ids,
 								            "doc_ids": doc_ids,
-												Fix: rerank overflow by enforcing top_k and 64 cap (#14084)

### What problem does this PR solve?

This fixes rerank overflow where retrieval could send more documents
than allowed (for example 66 when `page_size=6`), causing provider 400
errors and bypassing the user’s `top_k` intent in rerank-enabled paths.
this pr fixes #14081

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-04-14 10:47:25 +08:00
+								            "page": global_offset // RERANK_LIMIT + 1,
-												Fix: incorrect retrieval total count with pagination enabled (#11400)

### What problem does this PR solve?

Incorrect retrieval total count with pagination enabled.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-11-20 15:35:09 +08:00
+								            "size": RERANK_LIMIT,
 								            "question": question,
 								            "vector": True,
 								            "topk": top,
 								            "similarity": similarity_threshold,
 								            "available_int": 1,
 								        }
-												Fix: fix retrieval tesing wrong pagination (#7174)

### What problem does this PR solve?

Fix retrieval testing wrong pagination. #7171 

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
											
										
										
											2025-04-22 15:16:04 +08:00
-												search between multiple indiices for team function (#3079)

### What problem does this PR solve?

#2834 
### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-10-29 13:19:01 +08:00
+								        if isinstance(tenant_ids, str):
 								            tenant_ids = tenant_ids.split(",")
-												Refa: async retrieval process. (#12629)

### Type of change

- [x] Refactoring
- [x] Performance Improvement
											
										
										
											2026-01-15 12:28:49 +08:00
+								        sres = await self.search(req, [index_name(tid) for tid in tenant_ids], kb_ids, embd_mdl, highlight,
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
+								                           rank_feature=rank_feature)
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
-												Refa: fix re-rank scope. (#6152)

### What problem does this PR solve?

#6140

### Type of change


- [x] Refactoring
											
										
										
											2025-03-17 13:26:29 +08:00
+								        if rerank_mdl and sres.total > 0:
-												Fix: incorrect retrieval total count with pagination enabled (#11400)

### What problem does this PR solve?

Incorrect retrieval total count with pagination enabled.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-11-20 15:35:09 +08:00
+								            sim, tsim, vsim = self.rerank_by_model(
 								                rerank_mdl,
 								                sres,
 								                question,
 - vector_similarity_weight,
 								                vector_similarity_weight,
 								                rank_feature=rank_feature,
 								            )
-												add rerank model (#969)

### What problem does this PR solve?

feat: add rerank models to the project #724 #162

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-05-29 16:50:02 +08:00
+								        else:
-												Use Infinity single-field-multi-index (#11444)

### What problem does this PR solve?

Use Infinity single-field-multi-index

### Type of change

- [x] Refactoring
- [x] Performance Improvement
											
										
										
											2025-11-26 11:06:37 +08:00
+								            if settings.DOC_ENGINE_INFINITY:
 								                # Don't need rerank here since Infinity normalizes each way score before fusion.
 								                sim = [sres.field[id].get("_score", 0.0) for id in sres.ids]
 								                sim = [s if s is not None else 0.0 for s in sim]
 								                tsim = sim
 								                vsim = sim
 								            else:
-												Don't rerank for infinity (#10579)

### What problem does this PR solve?

Don't need rerank for infinity since Infinity normalizes each way score
before fusion.

### Type of change

- [x] Refactoring
											
										
										
											2025-10-15 20:15:49 +08:00
+								                # ElasticSearch doesn't normalize each way score before fusion.
 								                sim, tsim, vsim = self.rerank(
-												Fix: incorrect retrieval total count with pagination enabled (#11400)

### What problem does this PR solve?

Incorrect retrieval total count with pagination enabled.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-11-20 15:35:09 +08:00
+								                    sres,
 								                    question,
 - vector_similarity_weight,
 								                    vector_similarity_weight,
 								                    rank_feature=rank_feature,
 								                )
-												Fix: opensearch retrieval error (#10891)

### What problem does this PR solve?

Fix: opensearch retrieval error #10828

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-10-30 17:30:54 +08:00
+								        sim_np = np.array(sim, dtype=np.float64)
-												Fix: incorrect retrieval total count with pagination enabled (#11400)

### What problem does this PR solve?

Incorrect retrieval total count with pagination enabled.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-11-20 15:35:09 +08:00
+								        if sim_np.size == 0:
-												Fix: doc_aggs not correctly returned when no chunks retrieved. (#11578)

### What problem does this PR solve?

Fix: doc_aggs not correctly returned when no chunks retrieved.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-11-28 13:09:05 +08:00
+								            ranks["doc_aggs"] = []
-												Fix: incorrect retrieval total count with pagination enabled (#11400)

### What problem does this PR solve?

Incorrect retrieval total count with pagination enabled.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-11-20 15:35:09 +08:00
+								            return ranks
 								        sorted_idx = np.argsort(sim_np * -1)
-												Fix: failing p3 test for SDK/HTTP APIs (#13062)

### What problem does this PR solve?

Adjust highlight parsing, add row-count SQL override, tweak retrieval
thresholding, and update tests with engine-aware skips/utilities.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-02-09 14:56:10 +08:00
+								        # When vector_similarity_weight is 0, similarity_threshold is not meaningful for term-only scores.
 								        post_threshold = 0.0 if vector_similarity_weight <= 0 else similarity_threshold
-												Fix retrieval function when metadata_condtion is specified in retrieval API (#13473)

### What problem does this PR solve?

Fix https://github.com/infiniflow/ragflow/issues/13388

The following command returns empty when there is doc with the meta data
```
curl --request POST \
     --url http://localhost:9222/api/v1/retrieval \
     --header 'Content-Type: application/json' \
     --header 'Authorization: Bearer ragflow-fO3mPFePfLgUYg8-9gjBVVXbvHqrvMPLGaW0P86PvAk' \
     --data '{
          "question": "any question",
          "dataset_ids": ["9bb4f0591b8811f18a4a84ba59049aa3"],
           "metadata_condition": {
            "logic": "and",
            "conditions": [
              {
                "name": "character",
                "comparison_operator": "is",
                "value": "刘备"
              }
            ]
          }
     }'
```

When metadata_condition is specified in the retrieval API, it is
converted to doc_ids, and doc_ids is passed to the retrieval function.
In the retrieval function, when doc_ids is explicitly provided, we should
bypass the threshold.


### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-03-10 11:57:32 +08:00
 								        # When doc_ids is explicitly provided (metadata or document filtering), bypass threshold
 								        # User wants those specific documents regardless of their relevance score
 								        if doc_ids:
 								            post_threshold = 0.0
-												Fix: failing p3 test for SDK/HTTP APIs (#13062)

### What problem does this PR solve?

Adjust highlight parsing, add row-count SQL override, tweak retrieval
thresholding, and update tests with engine-aware skips/utilities.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-02-09 14:56:10 +08:00
+								        valid_idx = [int(i) for i in sorted_idx if sim_np[i] >= post_threshold]
-												Fix: incorrect retrieval total count with pagination enabled (#11400)

### What problem does this PR solve?

Incorrect retrieval total count with pagination enabled.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-11-20 15:35:09 +08:00
+								        filtered_count = len(valid_idx)
 								        ranks["total"] = int(filtered_count)
 								        if filtered_count == 0:
-												Fix: doc_aggs not correctly returned when no chunks retrieved. (#11578)

### What problem does this PR solve?

Fix: doc_aggs not correctly returned when no chunks retrieved.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-11-28 13:09:05 +08:00
+								            ranks["doc_aggs"] = []
-												Fix: incorrect retrieval total count with pagination enabled (#11400)

### What problem does this PR solve?

Incorrect retrieval total count with pagination enabled.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-11-20 15:35:09 +08:00
+								            return ranks
-												Fix: rerank overflow by enforcing top_k and 64 cap (#14084)

### What problem does this PR solve?

This fixes rerank overflow where retrieval could send more documents
than allowed (for example 66 when `page_size=6`), causing provider 400
errors and bypassing the user’s `top_k` intent in rerank-enabled paths.
this pr fixes #14081

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-04-14 10:47:25 +08:00
+								        begin = global_offset % RERANK_LIMIT
-												Fix: incorrect retrieval total count with pagination enabled (#11400)

### What problem does this PR solve?

Incorrect retrieval total count with pagination enabled.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-11-20 15:35:09 +08:00
+								        end = begin + page_size
 								        page_idx = valid_idx[begin:end]
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								        dim = len(sres.query_vector)
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								        vector_column = f"q_{dim}_vec"
 								        zero_vector = [0.0] * dim
-												Fix: doc_aggs issue. (#8418)

### What problem does this PR solve?

#8406

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-06-23 14:54:01 +08:00
-												Fix: incorrect retrieval total count with pagination enabled (#11400)

### What problem does this PR solve?

Incorrect retrieval total count with pagination enabled.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-11-20 15:35:09 +08:00
+								        for i in page_idx:
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								            id = sres.ids[i]
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								            chunk = sres.field[id]
-												Robust for abnormal response from LLMs. (#4747)

### What problem does this PR solve?


### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-02-06 17:34:53 +08:00
+								            dnm = chunk.get("docnm_kwd", "")
 								            did = chunk.get("doc_id", "")
-												Fix: doc_aggs issue. (#8418)

### What problem does this PR solve?

#8406

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-06-23 14:54:01 +08:00
-												Rename page_num_list, top_list, position_list (#3940)

### What problem does this PR solve?

Rename page_num_list, top_list, position_list to page_num_int, top_int,
position_int

### Type of change

- [x] Refactoring
											
										
										
											2024-12-10 16:32:58 +08:00
+								            position_int = chunk.get("position_int", [])
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								            d = {
 								                "chunk_id": id,
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								                "content_ltks": chunk["content_ltks"],
 								                "content_with_weight": chunk["content_with_weight"],
-												Robust for abnormal response from LLMs. (#4747)

### What problem does this PR solve?


### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-02-06 17:34:53 +08:00
+								                "doc_id": did,
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								                "docnm_kwd": dnm,
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								                "kb_id": chunk["kb_id"],
 								                "important_kwd": chunk.get("important_kwd", []),
-												feat: support reading tags via API (#12891) (#13732)

### What problem does this PR solve?

Enable reading Tag Set tags via API (expose tag_kwd field). The result
of the queried list chunks is as shown below:

<img width="1422" height="818" alt="image"
src="https://github.com/user-attachments/assets/abd1960a-fe34-489e-9d72-525f8e574938"
/>


### Type of change

- [x] New Feature (non-breaking change which adds functionality)

Co-authored-by: heyang.why <heyang.why@alibaba-inc.com>
											
										
										
											2026-03-29 20:17:01 +08:00
+								                "tag_kwd": chunk.get("tag_kwd", []),
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								                "image_id": chunk.get("img_id", ""),
-												Fix: incorrect retrieval total count with pagination enabled (#11400)

### What problem does this PR solve?

Incorrect retrieval total count with pagination enabled.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-11-20 15:35:09 +08:00
+								                "similarity": float(sim_np[i]),
 								                "vector_similarity": float(vsim[i]),
 								                "term_similarity": float(tsim[i]),
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								                "vector": chunk.get(vector_column, zero_vector),
-												Rename page_num_list, top_list, position_list (#3940)

### What problem does this PR solve?

Rename page_num_list, top_list, position_list to page_num_int, top_int,
position_int

### Type of change

- [x] Refactoring
											
										
										
											2024-12-10 16:32:58 +08:00
+								                "positions": position_int,
-												Fix: incorrect retrieval total count with pagination enabled (#11400)

### What problem does this PR solve?

Incorrect retrieval total count with pagination enabled.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-11-20 15:35:09 +08:00
+								                "doc_type_kwd": chunk.get("doc_type_kwd", ""),
-												Fix: parent-child chunking method (#11810)

### What problem does this PR solve?

change:
parent-child chunking method

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-09 09:34:01 +08:00
+								                "mom_id": chunk.get("mom_id", ""),
-												Get ROW_ID from search() in Infinity (#13901)

### What problem does this PR solve?

1. Search() in Infinity can return row_id now

2.  To Get ROW_ID from search(), refer to handling of retrieval_test.

example
```
$ curl -s -X POST "http://localhost:$PORT/v1/chunk/retrieval_test" -H "Authorization: $TOKEN" -H "Content-Type: application/json" -d '{"kb_id": "4fcd01582ca911f1954184ba59049aa3", "question": "曹操"}'
```


### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-04-02 18:56:43 +08:00
+								                "row_id": chunk.get("row_id()"),
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								            }
-												Add pagerank to KB. (#3809)

### What problem does this PR solve?

#3794

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-12-03 14:30:35 +08:00
+								            if highlight and sres.highlight:
-												debug backend API for TAB 'search' (#2389)

### What problem does this PR solve?
#2247

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-09-12 17:51:20 +08:00
+								                if id in sres.highlight:
-												Refactor: rename rmSpace to remove_redundant_spaces (#10796)

### What problem does this PR solve?

- rename rmSpace to remove_redundant_spaces
- move clean_markdown_block to common module
- add unit tests for remove_redundant_spaces and clean_markdown_block

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-10-28 09:46:32 +08:00
+								                    d["highlight"] = remove_redundant_spaces(sres.highlight[id])
-												debug backend API for TAB 'search' (#2389)

### What problem does this PR solve?
#2247

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2024-09-12 17:51:20 +08:00
+								                else:
 								                    d["highlight"] = d["content_with_weight"]
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								            ranks["chunks"].append(d)
-												Fix: incorrect retrieval total count with pagination enabled (#11400)

### What problem does this PR solve?

Incorrect retrieval total count with pagination enabled.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-11-20 15:35:09 +08:00
 								        if aggs:
 								            for i in valid_idx:
 								                id = sres.ids[i]
 								                chunk = sres.field[id]
 								                dnm = chunk.get("docnm_kwd", "")
 								                did = chunk.get("doc_id", "")
 								                if dnm not in ranks["doc_aggs"]:
 								                    ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
 								                ranks["doc_aggs"][dnm]["count"] += 1
 								            ranks["doc_aggs"] = [
 								                {
 								                    "doc_name": k,
 								                    "doc_id": v["doc_id"],
 								                    "count": v["count"],
 								                }
 								                for k, v in sorted(
 								                    ranks["doc_aggs"].items(),
 								                    key=lambda x: x[1]["count"] * -1,
 								                )
 								            ]
 								        else:
 								            ranks["doc_aggs"] = []
-												change licence (#28)

* add front end code

* change licence
											
										
										
											2024-01-17 09:39:50 +08:00
-												add conversation API (#35)


											
										
										
											2024-01-18 19:28:37 +08:00
+								        return ranks
-												Add resume parser and fix bugs (#59)

* Update .gitignore

* Update .gitignore

* Add resume parser and fix bugs
											
										
										
											2024-02-07 19:27:23 +08:00
-												Refine resume parts and fix bugs in retrival using sql (#66)


											
										
										
											2024-02-19 19:22:17 +08:00
+								    def sql_retrieval(self, sql, fetch_size=128, format="json"):
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								        tbl = self.dataStore.sql(sql, fetch_size, format)
 								        return tbl
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								    def chunk_list(self, doc_id: str, tenant_id: str,
 								                   kb_ids: list[str], max_count=1024,
 								                   offset=0,
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
+								                   fields=["docnm_kwd", "content_with_weight", "img_id"],
 								                   sort_by_position: bool = False):
-												Integration with Infinity (#2894)

### What problem does this PR solve?

Integration with Infinity

- Replaced ELASTICSEARCH with dataStoreConn
- Renamed deleteByQuery with delete
- Renamed bulk to upsertBulk
- getHighlight, getAggregation
- Fix KGSearch.search
- Moved Dealer.sql_retrieval to es_conn.py


### Type of change

- [x] Refactoring
											
										
										
											2024-11-12 14:59:41 +08:00
+								        condition = {"doc_id": doc_id}
-												Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
											
										
										
											2025-10-09 12:36:19 +08:00
 								        fields_set = set(fields or [])
 								        if sort_by_position:
 								            for need in ("page_num_int", "position_int", "top_int"):
 								                if need not in fields_set:
 								                    fields_set.add(need)
 								        fields = list(fields_set)
 								        orderBy = OrderByExpr()
 								        if sort_by_position:
 								            orderBy.asc("page_num_int")
 								            orderBy.asc("position_int")
 								            orderBy.asc("top_int")
-												Fetch chunk by batches. (#4177)

### What problem does this PR solve?

#4173

### Type of change

- [x] Performance Improvement
											
										
										
											2024-12-23 12:12:15 +08:00
+								        res = []
 								        bs = 128
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								        for p in range(offset, max_count, bs):
-												Fix: add soft limit for graph rag size (#13252)

### What problem does this PR solve?

Fix: add soft limit for graph rag size #13258 Q2

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

---------

Co-authored-by: Yingfeng <yingfeng.zhang@gmail.com>
											
										
										
											2026-03-02 14:02:36 +08:00
+								            limit = min(bs, max_count - p)
 								            if limit <= 0:
 								                break
 								            es_res = self.dataStore.search(fields, [], condition, [], orderBy, p, limit, index_name(tenant_id),
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								                                           kb_ids)
-												Refactor function name (#11210)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-12 19:00:15 +08:00
+								            dict_chunks = self.dataStore.get_fields(es_res, fields)
-												Light GraphRAG (#4585)

### What problem does this PR solve?

#4543

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-22 19:43:14 +08:00
+								            for id, doc in dict_chunks.items():
 								                doc["id"] = id
-												Fetch chunk by batches. (#4177)

### What problem does this PR solve?

#4173

### Type of change

- [x] Performance Improvement
											
										
										
											2024-12-23 12:12:15 +08:00
+								            if dict_chunks:
 								                res.extend(dict_chunks.values())
-												Fix: add soft limit for graph rag size (#13252)

### What problem does this PR solve?

Fix: add soft limit for graph rag size #13258 Q2

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

---------

Co-authored-by: Yingfeng <yingfeng.zhang@gmail.com>
											
										
										
											2026-03-02 14:02:36 +08:00
+								            chunk_count = len(dict_chunks)
 								            if chunk_count == 0 or chunk_count < limit:
-												Fetch chunk by batches. (#4177)

### What problem does this PR solve?

#4173

### Type of change

- [x] Performance Improvement
											
										
										
											2024-12-23 12:12:15 +08:00
+								                break
 								        return res
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
 								    def all_tags(self, tenant_id: str, kb_ids: list[str], S=1000):
-												Feat: message manage (#12196)

### What problem does this PR solve?

Manage message and use in agent.

Issue #4213 

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-25 21:18:13 +08:00
+								        if not self.dataStore.index_exist(index_name(tenant_id), kb_ids[0]):
-												Ignore exceptions when no index ahead. (#5047)

### What problem does this PR solve?

### Type of change

- [x] Refactoring
											
										
										
											2025-02-18 09:09:22 +08:00
+								            return []
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								        res = self.dataStore.search([], [], {}, [], OrderByExpr(), 0, 0, index_name(tenant_id), kb_ids, ["tag_kwd"])
-												Refactor function name (#11210)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-12 19:00:15 +08:00
+								        return self.dataStore.get_aggregation(res, "tag_kwd")
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
 								    def all_tags_in_portion(self, tenant_id: str, kb_ids: list[str], S=1000):
 								        res = self.dataStore.search([], [], {}, [], OrderByExpr(), 0, 0, index_name(tenant_id), kb_ids, ["tag_kwd"])
-												Refactor function name (#11210)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-12 19:00:15 +08:00
+								        res = self.dataStore.get_aggregation(res, "tag_kwd")
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								        total = np.sum([c for _, c in res])
 								        return {t: (c + 1) / (total + S) for t, c in res}
 								    def tag_content(self, tenant_id: str, kb_ids: list[str], doc, all_tags, topn_tags=3, keywords_topn=30, S=1000):
 								        idx_nm = index_name(tenant_id)
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
+								        match_txt = self.qryr.paragraph(doc["title_tks"] + " " + doc["content_ltks"], doc.get("important_kwd", []),
 								                                        keywords_topn)
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								        res = self.dataStore.search([], [], {}, [match_txt], OrderByExpr(), 0, 0, idx_nm, kb_ids, ["tag_kwd"])
-												Refactor function name (#11210)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-12 19:00:15 +08:00
+								        aggs = self.dataStore.get_aggregation(res, "tag_kwd")
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								        if not aggs:
 								            return False
 								        cnt = np.sum([c for _, c in aggs])
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
+								        tag_fea = sorted([(a, round(0.1 * (c + 1) / (cnt + S) / max(1e-6, all_tags.get(a, 0.0001)))) for a, c in aggs],
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								                         key=lambda x: x[1] * -1)[:topn_tags]
-												Fix: point in tag issue. (#6436)

### What problem does this PR solve?

#6414

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-03-24 10:45:29 +08:00
+								        doc[TAG_FLD] = {a.replace(".", "_"): c for a, c in tag_fea if c > 0}
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								        return True
 								    def tag_query(self, question: str, tenant_ids: str | list[str], kb_ids: list[str], all_tags, topn_tags=3, S=1000):
 								        if isinstance(tenant_ids, str):
 								            idx_nms = index_name(tenant_ids)
 								        else:
 								            idx_nms = [index_name(tid) for tid in tenant_ids]
 								        match_txt, _ = self.qryr.question(question, min_match=0.0)
 								        res = self.dataStore.search([], [], {}, [match_txt], OrderByExpr(), 0, 0, idx_nms, kb_ids, ["tag_kwd"])
-												Refactor function name (#11210)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-12 19:00:15 +08:00
+								        aggs = self.dataStore.get_aggregation(res, "tag_kwd")
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								        if not aggs:
 								            return {}
 								        cnt = np.sum([c for _, c in aggs])
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
+								        tag_fea = sorted([(a, round(0.1 * (c + 1) / (cnt + S) / max(1e-6, all_tags.get(a, 0.0001)))) for a, c in aggs],
-												Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-01-09 17:07:21 +08:00
+								                         key=lambda x: x[1] * -1)[:topn_tags]
-												Fix: point in tag issue. (#6436)

### What problem does this PR solve?

#6414

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-03-24 10:45:29 +08:00
+								        return {a.replace(".", "_"): max(1, c) for a, c in tag_fea}
-												Feat: TOC retrieval (#10456)

### What problem does this PR solve?

#10436

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-10-10 17:07:55 +08:00
-												Fix: toc async issue. (#12485)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-01-07 15:35:30 +08:00
+								    async def retrieval_by_toc(self, query: str, chunks: list[dict], tenant_ids: list[str], chat_mdl, topn: int = 6):
-												Support operator constraints in semi-automatic metadata filtering (#12956)

### What problem does this PR solve?

#### Summary
This PR enhances the Semi-automatic metadata filtering mode by allowing
users to explicitly pre-define operators (e.g., contains, =, >, etc.)
for selected metadata keys. While the LLM still dynamically extracts the
filter value from the user's query, it is now strictly constrained to
use the operator specified in the UI configuration.

Using this feature is optional. By default the operator selection is set
to "automatic" resulting in the LLM choosing the operator (as
presently).

#### Rationale & Use Case
This enhancement was driven by a concrete challenge I encountered while
working with technical documentation.
In my specific use case, I was trying to filter for software versions
within a technical manual. In this dataset, a single document chunk
often applies to multiple software versions. These versions are stored
as a combined string within the metadata for each chunk.

When using the standard semi-automatic filter, the LLM would
inconsistently choose between the contains and equals operators. When it
chose equals, it would exclude every chunk that applied to more than one
version, even if the version I was searching for was clearly included in
that metadata string. This led to incomplete and frustrating retrieval
results.

By extending the semi-automatic filter to allow pre-defining the
operator for a specific key, I was able to force the use of contains for
the version field. This change immediately led to significantly improved
and more reliable results in my case.

I believe this functionality will be equally useful for others dealing
with "tagged" or multi-value metadata where the relationship between the
query and the field is known, but the specific value needs to remain
dynamic.

#### Key Changes
##### Backend & Core Logic
- `common/metadata_utils.py`: Updated apply_meta_data_filter to support
a mixed data structure for semi_auto (handling both legacy string arrays
and the new object-based format {"key": "...", "op": "..."}).
- `rag/prompts/generator.py`: Extended gen_meta_filter to accept and
pass operator constraints to the LLM.
- `rag/prompts/meta_filter.md`: Updated the system prompt to instruct
the LLM to strictly respect provided operator constraints.

##### Frontend
- `web/src/components/metadata-filter/metadata-semi-auto-fields.tsx`:
Enhanced the UI to include an operator dropdown for each selected
metadata key, utilizing existing operator constants.
- `web/src/components/metadata-filter/index.tsx`: Updated the validation
schema to accommodate the new state structure.

#### Test Plan
- Backward Compatibility: Verified that existing semi-auto filters
stored as simple strings still function correctly.
- Prompt Verification: Confirmed that constraints are correctly rendered
in the LLM system prompt when specified.
- Added unit tests as
`test/unit_test/common/test_apply_semi_auto_meta_data_filter.py`
 - Manual End-to-End:
- Configured a "Semi-automatic" filter for a "Version" key with the
"contains" operator.
   - Asked a version-specific query.
   - Result
   
<img width="1173" height="704" alt="Screenshot 2026-02-02 145359"
src="https://github.com/user-attachments/assets/510a6a61-a231-4dc2-a7fe-cdfc07219132"
/>




### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

---------

Co-authored-by: Philipp Heyken Soares <philipp.heyken-soares@am.ai>
											
										
										
											2026-02-03 04:11:34 +01:00
+								        from rag.prompts.generator import relevant_chunks_with_toc # moved from the top of the file to avoid circular import
-												Feat: TOC retrieval (#10456)

### What problem does this PR solve?

#10436

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-10-10 17:07:55 +08:00
+								        if not chunks:
 								            return []
 								        idx_nms = [index_name(tid) for tid in tenant_ids]
 								        ranks, doc_id2kb_id = {}, {}
 								        for ck in chunks:
 								            if ck["doc_id"] not in ranks:
 								                ranks[ck["doc_id"]] = 0
 								            ranks[ck["doc_id"]] += ck["similarity"]
 								            doc_id2kb_id[ck["doc_id"]] = ck["kb_id"]
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
+								        doc_id = sorted(ranks.items(), key=lambda x: x[1] * -1.)[0][0]
-												Feat: TOC retrieval (#10456)

### What problem does this PR solve?

#10436

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-10-10 17:07:55 +08:00
+								        kb_ids = [doc_id2kb_id[doc_id]]
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
+								        es_res = self.dataStore.search(["content_with_weight"], [], {"doc_id": doc_id, "toc_kwd": "toc"}, [],
 								                                       OrderByExpr(), 0, 128, idx_nms,
-												Feat: TOC retrieval (#10456)

### What problem does this PR solve?

#10436

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-10-10 17:07:55 +08:00
+								                                       kb_ids)
 								        toc = []
-												Refactor function name (#11210)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-11-12 19:00:15 +08:00
+								        dict_chunks = self.dataStore.get_fields(es_res, ["content_with_weight"])
-												Feat: TOC retrieval (#10456)

### What problem does this PR solve?

#10436

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-10-10 17:07:55 +08:00
+								        for _, doc in dict_chunks.items():
 								            try:
 								                toc.extend(json.loads(doc["content_with_weight"]))
 								            except Exception as e:
 								                logging.exception(e)
 								        if not toc:
 								            return chunks
-												Fix: toc async issue. (#12485)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-01-07 15:35:30 +08:00
+								        ids = await relevant_chunks_with_toc(query, toc, chat_mdl, topn * 2)
-												Feat: TOC retrieval (#10456)

### What problem does this PR solve?

#10436

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-10-10 17:07:55 +08:00
+								        if not ids:
 								            return chunks
-												Fix: incorrect retrieval total count with pagination enabled (#11400)

### What problem does this PR solve?

Incorrect retrieval total count with pagination enabled.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-11-20 15:35:09 +08:00
-												Feat: TOC retrieval (#10456)

### What problem does this PR solve?

#10436

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-10-10 17:07:55 +08:00
+								        vector_size = 1024
 								        id2idx = {ck["chunk_id"]: i for i, ck in enumerate(chunks)}
 								        for cid, sim in ids:
 								            if cid in id2idx:
 								                chunks[id2idx[cid]]["similarity"] += sim
 								                continue
-												Fix parameter of calling self.dataStore.get() and warning info during parser (#13068)

### What problem does this PR solve?

Fix parameter of calling self.dataStore.get() and warning info during
parser

https://github.com/infiniflow/ragflow/issues/13036

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-02-09 17:56:59 +08:00
+								            chunk = self.dataStore.get(cid, idx_nms[0], kb_ids)
-												Fix: toc no chunk found issue. (#12197)

### What problem does this PR solve?

#12170

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-25 14:06:20 +08:00
+								            if not chunk:
 								                continue
-												Feat: TOC retrieval (#10456)

### What problem does this PR solve?

#10436

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-10-10 17:07:55 +08:00
+								            d = {
 								                "chunk_id": cid,
 								                "content_ltks": chunk["content_ltks"],
 								                "content_with_weight": chunk["content_with_weight"],
 								                "doc_id": doc_id,
 								                "docnm_kwd": chunk.get("docnm_kwd", ""),
 								                "kb_id": chunk["kb_id"],
 								                "important_kwd": chunk.get("important_kwd", []),
 								                "image_id": chunk.get("img_id", ""),
 								                "similarity": sim,
 								                "vector_similarity": sim,
 								                "term_similarity": sim,
 								                "vector": [0.0] * vector_size,
 								                "positions": chunk.get("position_int", []),
 								                "doc_type_kwd": chunk.get("doc_type_kwd", "")
 								            }
 								            for k in chunk.keys():
 								                if k[-4:] == "_vec":
 								                    d["vector"] = chunk[k]
 								                    vector_size = len(chunk[k])
 								                    break
 								            chunks.append(d)
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
+								        return sorted(chunks, key=lambda x: x["similarity"] * -1)[:topn]
-												Feat: support parent-child in search procedure. (#11629)

### What problem does this PR solve?

#7996

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-01 14:03:09 +08:00
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
+								    def retrieval_by_children(self, chunks: list[dict], tenant_ids: list[str]):
-												Feat: support parent-child in search procedure. (#11629)

### What problem does this PR solve?

#7996

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-01 14:03:09 +08:00
+								        if not chunks:
 								            return []
 								        idx_nms = [index_name(tid) for tid in tenant_ids]
-												Feat: support uploading in dialog. (#11634)

### What problem does this PR solve?

#9590

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-01 16:54:57 +08:00
+								        mom_chunks = defaultdict(list)
-												Feat: support parent-child in search procedure. (#11629)

### What problem does this PR solve?

#7996

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-01 14:03:09 +08:00
+								        i = 0
 								        while i < len(chunks):
 								            ck = chunks[i]
-												Fix: parent-child chunking method (#11810)

### What problem does this PR solve?

change:
parent-child chunking method

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2025-12-09 09:34:01 +08:00
+								            mom_id = ck.get("mom_id")
 								            if not isinstance(mom_id, str) or not mom_id.strip():
-												Feat: support parent-child in search procedure. (#11629)

### What problem does this PR solve?

#7996

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-01 14:03:09 +08:00
+								                i += 1
 								                continue
 								            mom_chunks[ck["mom_id"]].append(chunks.pop(i))
 								        if not mom_chunks:
 								            return chunks
 								        if not chunks:
 								            chunks = []
 								        vector_size = 1024
 								        for id, cks in mom_chunks.items():
-												Fix parameter of calling self.dataStore.get() and warning info during parser (#13068)

### What problem does this PR solve?

Fix parameter of calling self.dataStore.get() and warning info during
parser

https://github.com/infiniflow/ragflow/issues/13036

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-02-09 17:56:59 +08:00
+								            chunk = self.dataStore.get(id, idx_nms[0], [ck["kb_id"] for ck in cks])
-												Feat: support parent-child in search procedure. (#11629)

### What problem does this PR solve?

#7996

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
											
										
										
											2025-12-01 14:03:09 +08:00
+								            d = {
 								                "chunk_id": id,
 								                "content_ltks": " ".join([ck["content_ltks"] for ck in cks]),
 								                "content_with_weight": chunk["content_with_weight"],
 								                "doc_id": chunk["doc_id"],
 								                "docnm_kwd": chunk.get("docnm_kwd", ""),
 								                "kb_id": chunk["kb_id"],
 								                "important_kwd": [kwd for ck in cks for kwd in ck.get("important_kwd", [])],
 								                "image_id": chunk.get("img_id", ""),
 								                "similarity": np.mean([ck["similarity"] for ck in cks]),
 								                "vector_similarity": np.mean([ck["similarity"] for ck in cks]),
 								                "term_similarity": np.mean([ck["similarity"] for ck in cks]),
 								                "vector": [0.0] * vector_size,
 								                "positions": chunk.get("position_int", []),
 								                "doc_type_kwd": chunk.get("doc_type_kwd", "")
 								            }
 								            for k in cks[0].keys():
 								                if k[-4:] == "_vec":
 								                    d["vector"] = cks[0][k]
 								                    vector_size = len(cks[0][k])
 								                    break
 								            chunks.append(d)
-												Fix IDE warnings (#12281)

### What problem does this PR solve?

As title

### Type of change

- [x] Refactoring

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
											
										
										
											2025-12-29 12:01:18 +08:00
+								        return sorted(chunks, key=lambda x: x["similarity"] * -1)