mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
fix(jira): prevent missed incremental updates after issue edits (#13674)
### What problem does this PR solve? Fixes [#13505](https://github.com/infiniflow/ragflow/issues/13505): Jira incremental sync could miss updated issues after initial sync, especially near time boundaries. Root cause: - Jira JQL uses minute-level precision for `updated` filters. - Incremental windows had no overlap buffer, so boundary updates could be skipped. - Sync log cursor tracking used a backward-facing update for `poll_range_start`. - Existing-doc updates in `upload_document` lacked a KB ownership guard for doc-id collisions. What changed: - Added Jira incremental overlap buffer (`time_buffer_seconds`, defaulting to `JIRA_SYNC_TIME_BUFFER_SECONDS`) when building JQL lower-bound time. - Preserved second-level post-filtering to avoid duplicate reprocessing while still catching boundary updates. - Improved Jira sync logging to include start/end window and overlap configuration. - Updated sync cursor tracking in `increase_docs` to keep `poll_range_start` moving forward with max update time. - Added KB ID safety check before updating existing document records in `upload_document`. Verification performed: - Python syntax compile checks passed for modified files. - Manual verification flow: 1. Run full Jira sync. 2. Edit an already-indexed Jira issue. 3. Run next incremental sync. 4. Confirm updated content is re-ingested into KB. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --------- Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -183,10 +183,11 @@ class SyncLogsService(CommonService):
|
||||
ConnectorService.update_by_id(connector_id, {"status": TaskStatus.SCHEDULE})
|
||||
|
||||
@classmethod
|
||||
def increase_docs(cls, id, min_update, max_update, doc_num, err_msg="", error_count=0):
|
||||
def increase_docs(cls, id, max_update, doc_num, err_msg="", error_count=0):
|
||||
# Keep sync monotonic.
|
||||
cls.model.update(new_docs_indexed=cls.model.new_docs_indexed + doc_num,
|
||||
total_docs_indexed=cls.model.total_docs_indexed + doc_num,
|
||||
poll_range_start=fn.COALESCE(fn.LEAST(cls.model.poll_range_start,min_update), min_update),
|
||||
poll_range_start=fn.COALESCE(fn.GREATEST(cls.model.poll_range_start, max_update), max_update),
|
||||
poll_range_end=fn.COALESCE(fn.GREATEST(cls.model.poll_range_end, max_update), max_update),
|
||||
error_msg=cls.model.error_msg + err_msg,
|
||||
error_count=cls.model.error_count + error_count,
|
||||
|
||||
@@ -444,6 +444,17 @@ class FileService(CommonService):
|
||||
e, doc = DocumentService.get_by_id(doc_id)
|
||||
if e:
|
||||
try:
|
||||
if str(doc.kb_id) != str(kb.id):
|
||||
logging.warning(
|
||||
"Existing document id collision detected for %s: belongs to kb_id=%s, incoming kb_id=%s. "
|
||||
"Skipping update to avoid cross-KB overwrite.",
|
||||
doc_id,
|
||||
doc.kb_id,
|
||||
kb.id,
|
||||
)
|
||||
user_msg = "Existing document id collision with another knowledge base; skipping update."
|
||||
err.append(file.filename + ": " + user_msg)
|
||||
continue
|
||||
blob = file.read()
|
||||
new_hash = xxhash.xxh128(blob).hexdigest()
|
||||
old_hash = doc.content_hash or ""
|
||||
|
||||
@@ -20,6 +20,7 @@ from common.data_source.config import (
|
||||
INDEX_BATCH_SIZE,
|
||||
JIRA_CONNECTOR_LABELS_TO_SKIP,
|
||||
JIRA_CONNECTOR_MAX_TICKET_SIZE,
|
||||
JIRA_SYNC_TIME_BUFFER_SECONDS,
|
||||
JIRA_TIMEZONE_OFFSET,
|
||||
ONE_HOUR,
|
||||
DocumentSource,
|
||||
@@ -95,6 +96,7 @@ class JiraConnector(CheckpointedConnectorWithPermSync, SlimConnectorWithPermSync
|
||||
scoped_token: bool = False,
|
||||
attachment_size_limit: int | None = None,
|
||||
timezone_offset: float | None = None,
|
||||
time_buffer_seconds: int | None = JIRA_SYNC_TIME_BUFFER_SECONDS,
|
||||
) -> None:
|
||||
if not jira_base_url:
|
||||
raise ConnectorValidationError("Jira base URL must be provided.")
|
||||
@@ -120,6 +122,16 @@ class JiraConnector(CheckpointedConnectorWithPermSync, SlimConnectorWithPermSync
|
||||
self.timezone_offset = tz_offset_value
|
||||
self.timezone = timezone(offset=timedelta(hours=tz_offset_value))
|
||||
self._timezone_overridden = timezone_offset is not None
|
||||
if time_buffer_seconds is None:
|
||||
buffer_value = JIRA_SYNC_TIME_BUFFER_SECONDS
|
||||
else:
|
||||
try:
|
||||
buffer_value = int(time_buffer_seconds)
|
||||
except (TypeError, ValueError) as exc:
|
||||
raise ConnectorValidationError(
|
||||
f"Invalid time_buffer_seconds value ({time_buffer_seconds!r}); expected an integer."
|
||||
) from exc
|
||||
self.time_buffer_seconds = max(0, buffer_value)
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# Connector lifecycle helpers
|
||||
@@ -245,7 +257,16 @@ class JiraConnector(CheckpointedConnectorWithPermSync, SlimConnectorWithPermSync
|
||||
while True:
|
||||
attempt += 1
|
||||
jql = self._build_jql(attempt_start, end)
|
||||
logger.info(f"[Jira] Executing Jira JQL attempt {attempt} (buffered_retry={retried_with_buffer})[start and end parameters redacted]")
|
||||
adjusted_start = self._adjust_start_for_query(attempt_start)
|
||||
logger.info(
|
||||
"[Jira] Executing Jira JQL attempt %s (buffered_retry=%s, start=%s, adjusted_start=%s, end=%s, overlap_buffer_s=%s)",
|
||||
attempt,
|
||||
retried_with_buffer,
|
||||
attempt_start,
|
||||
adjusted_start,
|
||||
end,
|
||||
self.time_buffer_seconds,
|
||||
)
|
||||
try:
|
||||
return (yield from self._load_from_checkpoint_internal(jql, checkpoint, start_filter=start))
|
||||
except Exception as exc:
|
||||
@@ -424,8 +445,9 @@ class JiraConnector(CheckpointedConnectorWithPermSync, SlimConnectorWithPermSync
|
||||
labels = ", ".join(f'"{label}"' for label in self.labels_to_skip)
|
||||
clauses.append(f"labels NOT IN ({labels})")
|
||||
|
||||
if start is not None:
|
||||
clauses.append(f'updated >= "{self._format_jql_time(start)}"')
|
||||
adjusted_start = self._adjust_start_for_query(start)
|
||||
if adjusted_start is not None:
|
||||
clauses.append(f'updated >= "{self._format_jql_time(adjusted_start)}"')
|
||||
if end is not None:
|
||||
clauses.append(f'updated <= "{self._format_jql_time(end)}"')
|
||||
|
||||
@@ -437,6 +459,17 @@ class JiraConnector(CheckpointedConnectorWithPermSync, SlimConnectorWithPermSync
|
||||
jql = f"{jql} ORDER BY updated ASC"
|
||||
return jql
|
||||
|
||||
def _adjust_start_for_query(self, start: SecondsSinceUnixEpoch | None) -> SecondsSinceUnixEpoch | None:
|
||||
"""Apply a small overlap buffer to protect against minute-precision JQL boundaries."""
|
||||
if start is None:
|
||||
return None
|
||||
start_value = float(start)
|
||||
if start_value <= 0:
|
||||
return start_value
|
||||
if self.time_buffer_seconds <= 0:
|
||||
return start_value
|
||||
return max(0.0, start_value - float(self.time_buffer_seconds))
|
||||
|
||||
def _format_jql_time(self, timestamp: SecondsSinceUnixEpoch) -> str:
|
||||
dt_utc = datetime.fromtimestamp(float(timestamp), tz=timezone.utc)
|
||||
dt_local = dt_utc.astimezone(self.timezone)
|
||||
|
||||
@@ -123,7 +123,6 @@ class SyncBase:
|
||||
if not document_batch:
|
||||
continue
|
||||
|
||||
min_update = min(doc.doc_updated_at for doc in document_batch)
|
||||
max_update = max(doc.doc_updated_at for doc in document_batch)
|
||||
next_update = max(next_update, max_update)
|
||||
|
||||
@@ -151,7 +150,7 @@ class SyncBase:
|
||||
task["auto_parse"]
|
||||
)
|
||||
SyncLogsService.increase_docs(
|
||||
task["id"], min_update, max_update,
|
||||
task["id"], max_update,
|
||||
len(docs), "\n".join(err), len(err)
|
||||
)
|
||||
|
||||
@@ -577,6 +576,7 @@ class Jira(SyncBase):
|
||||
"scoped_token": self.conf.get("scoped_token", False),
|
||||
"attachment_size_limit": self.conf.get("attachment_size_limit"),
|
||||
"timezone_offset": self.conf.get("timezone_offset"),
|
||||
"time_buffer_seconds": self.conf.get("time_buffer_seconds"),
|
||||
}
|
||||
|
||||
self.connector = JiraConnector(**connector_kwargs)
|
||||
@@ -642,7 +642,15 @@ class Jira(SyncBase):
|
||||
if pending_docs:
|
||||
yield pending_docs
|
||||
|
||||
logging.info(f"[Jira] Connect to Jira {connector_kwargs['jira_base_url']} {begin_info}")
|
||||
logging.info(
|
||||
"[Jira] Connect to Jira %s %s (start=%s, end=%s, sync_batch_size=%s, overlap_buffer_s=%s)",
|
||||
connector_kwargs["jira_base_url"],
|
||||
begin_info,
|
||||
start_time,
|
||||
end_time,
|
||||
batch_size,
|
||||
getattr(self.connector, "time_buffer_seconds", connector_kwargs.get("time_buffer_seconds")),
|
||||
)
|
||||
return document_batches()
|
||||
|
||||
@staticmethod
|
||||
|
||||
@@ -0,0 +1,122 @@
|
||||
#
|
||||
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import importlib.util
|
||||
import sys
|
||||
import types
|
||||
import warnings
|
||||
from types import SimpleNamespace
|
||||
|
||||
import pytest
|
||||
|
||||
warnings.filterwarnings(
|
||||
"ignore",
|
||||
message="pkg_resources is deprecated as an API.*",
|
||||
category=UserWarning,
|
||||
)
|
||||
|
||||
|
||||
def _install_cv2_stub_if_unavailable():
|
||||
try:
|
||||
importlib.import_module("cv2")
|
||||
return
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
stub = types.ModuleType("cv2")
|
||||
stub.INTER_LINEAR = 1
|
||||
stub.INTER_CUBIC = 2
|
||||
stub.BORDER_CONSTANT = 0
|
||||
stub.BORDER_REPLICATE = 1
|
||||
|
||||
def _missing(*_args, **_kwargs):
|
||||
raise RuntimeError("cv2 runtime call is unavailable in this test environment")
|
||||
|
||||
def _module_getattr(name):
|
||||
if name.isupper():
|
||||
return 0
|
||||
return _missing
|
||||
|
||||
stub.__getattr__ = _module_getattr
|
||||
sys.modules["cv2"] = stub
|
||||
|
||||
|
||||
def _install_xgboost_stub_if_unavailable():
|
||||
if "xgboost" in sys.modules:
|
||||
return
|
||||
if importlib.util.find_spec("xgboost") is not None:
|
||||
return
|
||||
sys.modules["xgboost"] = types.ModuleType("xgboost")
|
||||
|
||||
|
||||
_install_cv2_stub_if_unavailable()
|
||||
_install_xgboost_stub_if_unavailable()
|
||||
|
||||
from api.db.services import file_service as file_service_module # noqa: E402
|
||||
from api.db.services.file_service import FileService # noqa: E402
|
||||
|
||||
|
||||
class _DummyUploadFile:
|
||||
def __init__(self, filename, doc_id):
|
||||
self.filename = filename
|
||||
self.id = doc_id
|
||||
|
||||
def read(self):
|
||||
raise AssertionError("read() should not be called for cross-KB collision path")
|
||||
|
||||
|
||||
def _unwrapped_upload_document():
|
||||
return FileService.upload_document.__func__.__wrapped__
|
||||
|
||||
|
||||
@pytest.mark.p2
|
||||
def test_upload_document_skips_cross_kb_document_id_collision(monkeypatch):
|
||||
kb = SimpleNamespace(
|
||||
id="kb-target",
|
||||
tenant_id="tenant-1",
|
||||
name="Target KB",
|
||||
parser_id="default",
|
||||
pipeline_id=None,
|
||||
parser_config={},
|
||||
)
|
||||
existing_doc = SimpleNamespace(
|
||||
id="doc-1",
|
||||
kb_id="kb-other",
|
||||
location="old-location.txt",
|
||||
content_hash="old-hash",
|
||||
to_dict=lambda: {"id": "doc-1"},
|
||||
)
|
||||
|
||||
monkeypatch.setattr(FileService, "get_root_folder", classmethod(lambda cls, _uid: {"id": "root"}))
|
||||
monkeypatch.setattr(FileService, "init_knowledgebase_docs", classmethod(lambda cls, _pf_id, _uid: None))
|
||||
monkeypatch.setattr(FileService, "get_kb_folder", classmethod(lambda cls, _uid: {"id": "kb-root"}))
|
||||
monkeypatch.setattr(
|
||||
FileService,
|
||||
"new_a_file_from_kb",
|
||||
classmethod(lambda cls, _tenant_id, _name, _parent_id: {"id": "kb-folder"}),
|
||||
)
|
||||
monkeypatch.setattr(file_service_module.DocumentService, "get_by_id", lambda _doc_id: (True, existing_doc))
|
||||
|
||||
err, files = _unwrapped_upload_document()(
|
||||
FileService,
|
||||
kb,
|
||||
[_DummyUploadFile(filename="collision.txt", doc_id="doc-1")],
|
||||
"user-1",
|
||||
)
|
||||
|
||||
assert files == []
|
||||
assert len(err) == 1
|
||||
assert err[0].startswith("collision.txt: ")
|
||||
assert "Existing document id collision with another knowledge base; skipping update." in err[0]
|
||||
Reference in New Issue
Block a user