diff --git a/common/data_source/gitlab_connector.py b/common/data_source/gitlab_connector.py index 0d2c0dab77..dae24992b4 100644 --- a/common/data_source/gitlab_connector.py +++ b/common/data_source/gitlab_connector.py @@ -20,8 +20,11 @@ from common.data_source.interfaces import GenerateDocumentsOutput from common.data_source.interfaces import LoadConnector from common.data_source.interfaces import PollConnector from common.data_source.interfaces import SecondsSinceUnixEpoch +from common.data_source.interfaces import SlimConnectorWithPermSync from common.data_source.models import BasicExpertInfo from common.data_source.models import Document +from common.data_source.models import GenerateSlimDocumentOutput +from common.data_source.models import SlimDocument from common.data_source.utils import get_file_ext T = TypeVar("T") @@ -158,7 +161,7 @@ def _should_exclude(path: str) -> bool: return any(fnmatch.fnmatch(path, pattern) for pattern in exclude_patterns) -class GitlabConnector(LoadConnector, PollConnector): +class GitlabConnector(LoadConnector, PollConnector, SlimConnectorWithPermSync): def __init__( self, project_owner: str, @@ -313,6 +316,67 @@ class GitlabConnector(LoadConnector, PollConnector): end_datetime = datetime.fromtimestamp(end, tz=timezone.utc) return self._fetch_from_gitlab(start_datetime, end_datetime) + def retrieve_all_slim_docs_perm_sync(self, callback: Any = None) -> GenerateSlimDocumentOutput: + if self.gitlab_client is None: + raise ConnectorMissingCredentialError("Gitlab") + + project: Project = self.gitlab_client.projects.get( + f"{self.project_owner}/{self.project_name}" + ) + + slim_batch: list[SlimDocument] = [] + + def append_doc(doc_id: str): + slim_batch.append(SlimDocument(id=doc_id)) + if len(slim_batch) >= self.batch_size: + batch = slim_batch[:] + slim_batch.clear() + return batch + return None + + if self.include_code_files: + default_branch = project.default_branch + queue = deque([""]) + while queue: + current_path = queue.popleft() + files = project.repository_tree(path=current_path, all=True) + for file in files: + if _should_exclude(file["path"]): + continue + if file["type"] == "tree": + queue.append(file["path"]) + continue + if file["type"] != "blob": + continue + + file_url = f"{self.gitlab_client.url}/{self.project_owner}/{self.project_name}/-/blob/{default_branch}/{file['path']}" + batch = append_doc(file_url) + if batch: + yield batch + + if self.include_mrs: + merge_requests = project.mergerequests.list( + state=self.state_filter, + iterator=True, + ) + for mr in merge_requests: + batch = append_doc(mr.web_url) + if batch: + yield batch + + if self.include_issues: + issues = project.issues.list( + state=self.state_filter, + iterator=True, + ) + for issue in issues: + batch = append_doc(issue.web_url) + if batch: + yield batch + + if slim_batch: + yield slim_batch + if __name__ == "__main__": import os @@ -337,4 +401,4 @@ if __name__ == "__main__": document_batches = connector.load_from_state() for f in document_batches: print("Batch:", f) - print("Finished loading from state.") \ No newline at end of file + print("Finished loading from state.") diff --git a/rag/svr/sync_data_source.py b/rag/svr/sync_data_source.py index 6f160a3564..81ab42e7be 100644 --- a/rag/svr/sync_data_source.py +++ b/rag/svr/sync_data_source.py @@ -1305,6 +1305,7 @@ class Gitlab(SyncBase): } ) + file_list = None if task["reindex"] == "1" or not task["poll_range_start"]: document_generator = self.connector.load_from_state() _begin_info = "totally" @@ -1318,9 +1319,13 @@ class Gitlab(SyncBase): poll_start.timestamp(), datetime.now(timezone.utc).timestamp() ) + if self.conf.get("sync_deleted_files"): + file_list = [] + for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync(): + file_list.extend(slim_batch) _begin_info = "from {}".format(poll_start) self.log_connection("Gitlab", f"({self.conf['project_name']})", task) - return document_generator + return document_generator, file_list class Bitbucket(SyncBase): diff --git a/web/src/pages/user-setting/data-source/constant/index.tsx b/web/src/pages/user-setting/data-source/constant/index.tsx index 5816193e54..78e0f59906 100644 --- a/web/src/pages/user-setting/data-source/constant/index.tsx +++ b/web/src/pages/user-setting/data-source/constant/index.tsx @@ -58,6 +58,9 @@ export const DataSourceFeatureVisibilityMap = { [DataSourceKey.GITHUB]: { syncDeletedFiles: true, }, + [DataSourceKey.GITLAB]: { + syncDeletedFiles: true, + }, [DataSourceKey.GOOGLE_DRIVE]: { syncDeletedFiles: true, },