mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-06-29 15:31:05 +08:00
Feat: enable sync deleted files in gitlab (#14481)
### What problem does this PR solve? Feat: enable sync deleted files in gitlab ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@@ -20,8 +20,11 @@ from common.data_source.interfaces import GenerateDocumentsOutput
|
||||
from common.data_source.interfaces import LoadConnector
|
||||
from common.data_source.interfaces import PollConnector
|
||||
from common.data_source.interfaces import SecondsSinceUnixEpoch
|
||||
from common.data_source.interfaces import SlimConnectorWithPermSync
|
||||
from common.data_source.models import BasicExpertInfo
|
||||
from common.data_source.models import Document
|
||||
from common.data_source.models import GenerateSlimDocumentOutput
|
||||
from common.data_source.models import SlimDocument
|
||||
from common.data_source.utils import get_file_ext
|
||||
|
||||
T = TypeVar("T")
|
||||
@@ -158,7 +161,7 @@ def _should_exclude(path: str) -> bool:
|
||||
return any(fnmatch.fnmatch(path, pattern) for pattern in exclude_patterns)
|
||||
|
||||
|
||||
class GitlabConnector(LoadConnector, PollConnector):
|
||||
class GitlabConnector(LoadConnector, PollConnector, SlimConnectorWithPermSync):
|
||||
def __init__(
|
||||
self,
|
||||
project_owner: str,
|
||||
@@ -313,6 +316,67 @@ class GitlabConnector(LoadConnector, PollConnector):
|
||||
end_datetime = datetime.fromtimestamp(end, tz=timezone.utc)
|
||||
return self._fetch_from_gitlab(start_datetime, end_datetime)
|
||||
|
||||
def retrieve_all_slim_docs_perm_sync(self, callback: Any = None) -> GenerateSlimDocumentOutput:
|
||||
if self.gitlab_client is None:
|
||||
raise ConnectorMissingCredentialError("Gitlab")
|
||||
|
||||
project: Project = self.gitlab_client.projects.get(
|
||||
f"{self.project_owner}/{self.project_name}"
|
||||
)
|
||||
|
||||
slim_batch: list[SlimDocument] = []
|
||||
|
||||
def append_doc(doc_id: str):
|
||||
slim_batch.append(SlimDocument(id=doc_id))
|
||||
if len(slim_batch) >= self.batch_size:
|
||||
batch = slim_batch[:]
|
||||
slim_batch.clear()
|
||||
return batch
|
||||
return None
|
||||
|
||||
if self.include_code_files:
|
||||
default_branch = project.default_branch
|
||||
queue = deque([""])
|
||||
while queue:
|
||||
current_path = queue.popleft()
|
||||
files = project.repository_tree(path=current_path, all=True)
|
||||
for file in files:
|
||||
if _should_exclude(file["path"]):
|
||||
continue
|
||||
if file["type"] == "tree":
|
||||
queue.append(file["path"])
|
||||
continue
|
||||
if file["type"] != "blob":
|
||||
continue
|
||||
|
||||
file_url = f"{self.gitlab_client.url}/{self.project_owner}/{self.project_name}/-/blob/{default_branch}/{file['path']}"
|
||||
batch = append_doc(file_url)
|
||||
if batch:
|
||||
yield batch
|
||||
|
||||
if self.include_mrs:
|
||||
merge_requests = project.mergerequests.list(
|
||||
state=self.state_filter,
|
||||
iterator=True,
|
||||
)
|
||||
for mr in merge_requests:
|
||||
batch = append_doc(mr.web_url)
|
||||
if batch:
|
||||
yield batch
|
||||
|
||||
if self.include_issues:
|
||||
issues = project.issues.list(
|
||||
state=self.state_filter,
|
||||
iterator=True,
|
||||
)
|
||||
for issue in issues:
|
||||
batch = append_doc(issue.web_url)
|
||||
if batch:
|
||||
yield batch
|
||||
|
||||
if slim_batch:
|
||||
yield slim_batch
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import os
|
||||
@@ -337,4 +401,4 @@ if __name__ == "__main__":
|
||||
document_batches = connector.load_from_state()
|
||||
for f in document_batches:
|
||||
print("Batch:", f)
|
||||
print("Finished loading from state.")
|
||||
print("Finished loading from state.")
|
||||
|
||||
@@ -1305,6 +1305,7 @@ class Gitlab(SyncBase):
|
||||
}
|
||||
)
|
||||
|
||||
file_list = None
|
||||
if task["reindex"] == "1" or not task["poll_range_start"]:
|
||||
document_generator = self.connector.load_from_state()
|
||||
_begin_info = "totally"
|
||||
@@ -1318,9 +1319,13 @@ class Gitlab(SyncBase):
|
||||
poll_start.timestamp(),
|
||||
datetime.now(timezone.utc).timestamp()
|
||||
)
|
||||
if self.conf.get("sync_deleted_files"):
|
||||
file_list = []
|
||||
for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync():
|
||||
file_list.extend(slim_batch)
|
||||
_begin_info = "from {}".format(poll_start)
|
||||
self.log_connection("Gitlab", f"({self.conf['project_name']})", task)
|
||||
return document_generator
|
||||
return document_generator, file_list
|
||||
|
||||
|
||||
class Bitbucket(SyncBase):
|
||||
|
||||
@@ -58,6 +58,9 @@ export const DataSourceFeatureVisibilityMap = {
|
||||
[DataSourceKey.GITHUB]: {
|
||||
syncDeletedFiles: true,
|
||||
},
|
||||
[DataSourceKey.GITLAB]: {
|
||||
syncDeletedFiles: true,
|
||||
},
|
||||
[DataSourceKey.GOOGLE_DRIVE]: {
|
||||
syncDeletedFiles: true,
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user