Fix: allow one data source to be synced to multiple datasets (#16023)

Fix: allow one data source to be synced to multiple datasets. Tested add/delete - worked.
2026-06-29 15:31:05 +08:00 · 2026-06-15 16:54:25 +08:00
parent fa6d29603a
commit f6a2075ad0
4 changed files with 106 additions and 5 deletions
--- a/rag/svr/sync_data_source.py
+++ b/rag/svr/sync_data_source.py
@@ -232,8 +232,10 @@ class SyncBase:

            docs = []
            for doc in document_batch:
+                legacy_doc_id = hash128(f"{task['connector_id']}:{doc.id}")
+                new_doc_id = hash128(f"{task['kb_id']}:{task['connector_id']}:{doc.id}")
                d = {
-                    "id": hash128(f"{task['connector_id']}:{doc.id}"),
+                    "id": legacy_doc_id if legacy_doc_id in existing_doc_ids else new_doc_id,
                    "connector_id": task["connector_id"],
                    "source": self.SOURCE_NAME,
                    "semantic_identifier": doc.semantic_identifier,
@@ -401,8 +403,9 @@ class _BlobLikeBase(SyncBase):
            if key_record.deleted:
                continue

-            doc_id = hash128(key_record.key)
-            stored = existing_fingerprints.get(doc_id, "")
+            legacy_doc_id = hash128(f"{task['connector_id']}:{key_record.key}")
+            new_doc_id = hash128(f"{task['kb_id']}:{task['connector_id']}:{key_record.key}")
+            stored = existing_fingerprints.get(legacy_doc_id, "") or existing_fingerprints.get(new_doc_id, "")
            if key_record.fingerprint and stored and key_record.fingerprint == stored:
                bypass_count += 1
                continue