mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 23:41:12 +08:00
Feat: enable sync deleted files for connector (#14000)
### What problem does this PR solve? Feat: enable sync deleted files for connector 1. first comes with github ### Type of change - [x] New Feature (non-breaking change which adds functionality) <!-- This is an auto-generated comment: release notes by coderabbit.ai --> ## Summary by CodeRabbit * **New Features** * Added "sync deleted files" feature for data sources, enabling automatic removal of files deleted from the source system. * Added multilingual support for the new sync deleted files setting across multiple languages. * **UI Improvements** * Improved checkbox form field rendering and layout. * Enhanced full-width display for authentication token input fields.
This commit is contained in:
@@ -28,14 +28,20 @@ from common.data_source.exceptions import (
|
||||
InsufficientPermissionsError,
|
||||
UnexpectedValidationError,
|
||||
)
|
||||
from common.data_source.interfaces import CheckpointedConnectorWithPermSyncGH, CheckpointOutput
|
||||
from common.data_source.interfaces import (
|
||||
CheckpointedConnectorWithPermSyncGH,
|
||||
CheckpointOutput,
|
||||
CheckpointOutputWrapper,
|
||||
)
|
||||
from common.data_source.models import (
|
||||
ConnectorCheckpoint,
|
||||
ConnectorFailure,
|
||||
Document,
|
||||
DocumentFailure,
|
||||
ExternalAccess,
|
||||
GenerateSlimDocumentOutput,
|
||||
SecondsSinceUnixEpoch,
|
||||
SlimDocument,
|
||||
)
|
||||
from common.data_source.connector_runner import ConnectorRunner
|
||||
from .models import SerializedRepository
|
||||
@@ -594,14 +600,8 @@ class GithubConnector(CheckpointedConnectorWithPermSyncGH[GithubConnectorCheckpo
|
||||
done_with_prs = False
|
||||
num_prs = 0
|
||||
pr = None
|
||||
print("start: ", start)
|
||||
for pr in pr_batch:
|
||||
num_prs += 1
|
||||
print("-"*40)
|
||||
print("PR name", pr.title)
|
||||
print("updated at", pr.updated_at)
|
||||
print("-"*40)
|
||||
print("\n")
|
||||
# we iterate backwards in time, so at this point we stop processing prs
|
||||
if (
|
||||
start is not None
|
||||
@@ -732,10 +732,10 @@ class GithubConnector(CheckpointedConnectorWithPermSyncGH[GithubConnectorCheckpo
|
||||
|
||||
if checkpoint.cached_repo_ids:
|
||||
logging.info(
|
||||
f"{len(checkpoint.cached_repo_ids)} repos remaining (IDs: {checkpoint.cached_repo_ids})"
|
||||
f"{len(checkpoint.cached_repo_ids)} checkpoint repos remaining (IDs: {checkpoint.cached_repo_ids})"
|
||||
)
|
||||
else:
|
||||
logging.info("No more repos remaining")
|
||||
logging.info("There are no more checkpoint repos left.")
|
||||
|
||||
return checkpoint
|
||||
|
||||
@@ -923,6 +923,53 @@ class GithubConnector(CheckpointedConnectorWithPermSyncGH[GithubConnectorCheckpo
|
||||
) -> GithubConnectorCheckpoint:
|
||||
return GithubConnectorCheckpoint.model_validate_json(checkpoint_json)
|
||||
|
||||
def retrieve_slim_document(
|
||||
self,
|
||||
start: SecondsSinceUnixEpoch | None = None,
|
||||
end: SecondsSinceUnixEpoch | None = None,
|
||||
callback: Any = None,
|
||||
) -> GenerateSlimDocumentOutput:
|
||||
start_value = 0.0 if start is None else start
|
||||
end_value = (
|
||||
datetime.now(timezone.utc).timestamp() if end is None else end
|
||||
)
|
||||
checkpoint = self.build_dummy_checkpoint()
|
||||
slim_batch: list[SlimDocument] = []
|
||||
|
||||
while checkpoint.has_more:
|
||||
wrapper = CheckpointOutputWrapper[GithubConnectorCheckpoint]()
|
||||
for document, failure, next_checkpoint in wrapper(
|
||||
self.load_from_checkpoint(start_value, end_value, checkpoint)
|
||||
):
|
||||
if failure is not None:
|
||||
logging.warning(
|
||||
"GitHub connector failure during slim retrieval: %s",
|
||||
getattr(failure, "failure_message", failure),
|
||||
)
|
||||
continue
|
||||
|
||||
if document is not None:
|
||||
slim_batch.append(SlimDocument(id=document.id))
|
||||
if len(slim_batch) >= SLIM_BATCH_SIZE:
|
||||
yield slim_batch
|
||||
slim_batch = []
|
||||
if callback:
|
||||
callback.progress("github_slim_document", 1)
|
||||
|
||||
if next_checkpoint is not None:
|
||||
checkpoint = next_checkpoint
|
||||
|
||||
if slim_batch:
|
||||
yield slim_batch
|
||||
|
||||
def retrieve_all_slim_docs_perm_sync(
|
||||
self,
|
||||
start: SecondsSinceUnixEpoch | None = None,
|
||||
end: SecondsSinceUnixEpoch | None = None,
|
||||
callback: Any = None,
|
||||
) -> GenerateSlimDocumentOutput:
|
||||
yield from self.retrieve_slim_document(start=start, end=end, callback=callback)
|
||||
|
||||
def build_dummy_checkpoint(self) -> GithubConnectorCheckpoint:
|
||||
return GithubConnectorCheckpoint(
|
||||
stage=GithubConnectorStage.PRS, curr_page=0, has_more=True, num_retrieved=0
|
||||
@@ -970,4 +1017,4 @@ if __name__ == "__main__":
|
||||
if failure:
|
||||
print(f"Failure: {failure.failure_message}")
|
||||
if next_checkpoint:
|
||||
checkpoint = next_checkpoint
|
||||
checkpoint = next_checkpoint
|
||||
|
||||
Reference in New Issue
Block a user