diff --git a/memory/utils/highlight_utils.py b/memory/utils/highlight_utils.py new file mode 100644 index 0000000000..977fbe3a0f --- /dev/null +++ b/memory/utils/highlight_utils.py @@ -0,0 +1,89 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use it except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Highlight helpers for search results (wraps keywords in ).""" + +import re +from collections.abc import Callable + + +def highlight_text( + txt: str, + keywords: list[str], + is_english_fn: Callable[[str], bool] | None = None, +) -> str: + """Wrap keyword matches in text with , by sentence. + + - If is_english_fn(sentence) is True: use word-boundary regex. + - Otherwise: literal replace (longest keywords first). + Only sentences that contain a match are included. 
+ """ + if not txt or not keywords: + return "" + + txt = re.sub(r"[\r\n]", " ", txt, flags=re.IGNORECASE | re.MULTILINE) + txt_list = [] + + for t in re.split(r"[.?!;\n]", txt): + t = t.strip() + if not t: + continue + + if is_english_fn is None or is_english_fn(t): + for w in keywords: + t = re.sub( + r"(^|[ .?/'\"\(\)!,:;-])(%s)([ .?/'\"\(\)!,:;-]|$)" % re.escape(w), + r"\1\2\3", + t, + flags=re.IGNORECASE | re.MULTILINE, + ) + else: + for w in sorted(keywords, key=len, reverse=True): + t = re.sub( + re.escape(w), + f"{w}", + t, + flags=re.IGNORECASE | re.MULTILINE, + ) + + if re.search(r"[^<>]+", t, flags=re.IGNORECASE | re.MULTILINE): + txt_list.append(t) + + return "...".join(txt_list) if txt_list else txt + + +def get_highlight_from_messages( + messages: list[dict] | None, + keywords: list[str], + field_name: str, + is_english_fn: Callable[[str], bool] | None = None, +) -> dict[str, str]: + """Build id -> highlighted text from a list of message dicts.""" + if not messages or not keywords: + return {} + + ans = {} + for doc in messages: + doc_id = doc.get("id") + if not doc_id: + continue + txt = doc.get(field_name) + if not txt or not isinstance(txt, str): + continue + highlighted = highlight_text(txt, keywords, is_english_fn) + if highlighted and re.search(r"[^<>]+", highlighted, flags=re.IGNORECASE | re.MULTILINE): + ans[doc_id] = highlighted + return ans diff --git a/memory/utils/ob_conn.py b/memory/utils/ob_conn.py index 09c976e2ca..f179992373 100644 --- a/memory/utils/ob_conn.py +++ b/memory/utils/ob_conn.py @@ -25,9 +25,11 @@ from sqlalchemy.dialects.mysql import LONGTEXT from common.decorator import singleton from memory.utils.aggregation_utils import aggregate_by_field +from memory.utils.highlight_utils import get_highlight_from_messages from common.doc_store.doc_store_base import MatchExpr, OrderByExpr, FusionExpr, MatchTextExpr, MatchDenseExpr from common.doc_store.ob_conn_base import OBConnectionBase, get_value_str, vector_search_template from 
# Method of OBConnection (memory/utils/ob_conn.py).  Relies on these
# module-level imports of that file:
#   from memory.utils.highlight_utils import get_highlight_from_messages
#   from rag.nlp import is_english
def get_highlight(self, res, keywords: list[str], field_name: str):
    """Return highlighted text for the messages of a search result.

    Args:
        res: Search result; when it is a tuple, its first element is used.
            The result's ``messages`` attribute (``None`` if absent) supplies
            the message dicts.
        keywords: Keywords to highlight.
        field_name: Key of the text field to highlight in each message.

    Returns:
        Whatever ``get_highlight_from_messages`` returns for those messages.
    """
    result = res[0] if isinstance(res, tuple) else res

    def _sentence_is_english(sentence):
        # is_english is called with a one-element list per sentence.
        return is_english([sentence])

    return get_highlight_from_messages(
        getattr(result, "messages", None),
        keywords,
        field_name,
        is_english_fn=_sentence_is_english,
    )
"""Unit tests for the highlight helpers behind OBConnection.get_highlight.

Exercises the pure highlight logic only, without requiring a real OceanBase
instance or heavy dependencies.
"""

from memory.utils.highlight_utils import get_highlight_from_messages, highlight_text


class TestHighlightText:
    """Tests for highlight_text."""

    def test_empty_text_returns_empty(self):
        assert highlight_text("", ["foo"]) == ""
        assert highlight_text("hello", []) == ""

    def test_wraps_keyword_with_em(self):
        # Pin the markup itself: a bare substring check would also pass when
        # nothing is wrapped at all.
        out = highlight_text("The quick brown fox.", ["quick"], None)
        assert "<em>quick</em>" in out
        assert "The" in out and "brown fox" in out

    def test_only_sentences_with_match_included(self):
        out = highlight_text(
            "First sentence. Second has keyword. Third none.",
            ["keyword"],
            None,
        )
        assert "Second has <em>keyword</em>" in out
        assert "First sentence" not in out and "Third none" not in out

    def test_multiple_keywords(self):
        out = highlight_text("Alpha and beta here.", ["Alpha", "beta"], None)
        assert "<em>Alpha</em>" in out and "<em>beta</em>" in out

    def test_matching_sentences_joined_with_ellipsis(self):
        out = highlight_text("A cat. A dog. A cat again.", ["cat"], None)
        assert out == "A <em>cat</em>...A <em>cat</em> again"

    def test_no_match_returns_text_without_markup(self):
        out = highlight_text("nothing here", ["zzz"], None)
        assert out == "nothing here"

    def test_word_boundary_mode_ignores_substrings(self):
        out = highlight_text("category cat", ["cat"], None)
        assert out == "category <em>cat</em>"

    def test_literal_mode_matches_substrings(self):
        out = highlight_text("foobar baz", ["bar"], lambda s: False)
        assert out == "foo<em>bar</em> baz"


class TestGetHighlightFromMessages:
    """Tests for get_highlight_from_messages (used by get_highlight)."""

    def test_empty_messages_returns_empty_dict(self):
        assert get_highlight_from_messages([], ["k"], "content_ltks") == {}
        assert get_highlight_from_messages(None, ["k"], "content_ltks") == {}

    def test_empty_keywords_returns_empty_dict(self):
        assert get_highlight_from_messages(
            [{"id": "m1", "content_ltks": "hello"}], [], "content_ltks"
        ) == {}

    def test_returns_id_to_highlighted_text(self):
        messages = [
            {"id": "msg1", "content_ltks": "The cat sat."},
            {"id": "msg2", "content_ltks": "The dog ran."},
        ]
        out = get_highlight_from_messages(messages, ["cat"], "content_ltks")
        assert list(out.keys()) == ["msg1"]
        assert "<em>cat</em>" in out["msg1"]
        out2 = get_highlight_from_messages(messages, ["dog"], "content_ltks")
        assert list(out2.keys()) == ["msg2"]
        assert "<em>dog</em>" in out2["msg2"]

    def test_skips_docs_without_field(self):
        messages = [{"id": "m1"}, {"id": "m2", "content_ltks": "hello world."}]
        out = get_highlight_from_messages(messages, ["hello"], "content_ltks")
        assert "m1" not in out
        assert "m2" in out and "<em>hello</em>" in out["m2"]

    def test_skips_docs_without_id(self):
        messages = [{"content_ltks": "hello world."}]
        assert get_highlight_from_messages(messages, ["hello"], "content_ltks") == {}