diff --git a/api/db/services/connector_service.py b/api/db/services/connector_service.py index 10d04c79b1..85d495d9d6 100644 --- a/api/db/services/connector_service.py +++ b/api/db/services/connector_service.py @@ -26,6 +26,7 @@ from api.db.db_models import Connector, SyncLogs, Connector2Kb, Knowledgebase from api.db.services.common_service import CommonService from api.db.services.document_service import DocumentService from api.db.services.document_service import DocMetadataService +from api.utils.common import hash128 from common.misc_utils import get_uuid from common.constants import TaskStatus from common.time_utils import current_timestamp, timestamp_to_date @@ -78,6 +79,64 @@ class ConnectorService(CommonService): SyncLogsService.schedule(connector_id, kb_id, reindex=True) return err + @classmethod + def cleanup_stale_documents_for_task( + cls, + task_id: str, + connector_id: str, + kb_id: str, + tenant_id: str, + file_list, + delete_batch_size: int = 100, + ): + from api.db.services.file_service import FileService + + if not Connector2KbService.query(connector_id=connector_id, kb_id=kb_id): + return 0, [] + + e, conn = cls.get_by_id(connector_id) + if not e: + return 0, [] + + source_type = f"{conn.source}/{conn.id}" + retain_doc_ids = {hash128(file.id) for file in file_list} + existing_docs = DocumentService.list_doc_headers_by_kb_and_source_type( + kb_id, + source_type, + ) + stale_doc_ids = [ + doc["id"] for doc in existing_docs if doc["id"] not in retain_doc_ids + ] + if not stale_doc_ids: + return 0, [] + + stale_doc_id_set = set(stale_doc_ids) + errors = [] + for offset in range(0, len(stale_doc_ids), delete_batch_size): + err = FileService.delete_docs( + stale_doc_ids[offset : offset + delete_batch_size], + tenant_id, + ) + if err: + errors.append(err) + + remaining_doc_ids = { + doc["id"] + for doc in DocumentService.list_doc_headers_by_kb_and_source_type( + kb_id, + source_type, + ) + if doc["id"] in stale_doc_id_set + } + removed_count = len(stale_doc_id_set) - len(remaining_doc_ids) + SyncLogsService.increase_removed_docs( + task_id, + removed_count, + "\n".join(errors), + len(errors), + ) + return removed_count, errors + class SyncLogsService(CommonService): model = SyncLogs @@ -196,6 +255,16 @@ class SyncLogsService(CommonService): )\ .where(cls.model.id == id).execute() + @classmethod + def increase_removed_docs(cls, id, removed_count, err_msg="", error_count=0): + cls.model.update( + docs_removed_from_index=cls.model.docs_removed_from_index + removed_count, + error_msg=cls.model.error_msg + err_msg, + error_count=cls.model.error_count + error_count, + update_time=current_timestamp(), + update_date=timestamp_to_date(current_timestamp()), + ).where(cls.model.id == id).execute() + @classmethod def duplicate_and_parse(cls, kb, docs, tenant_id, src, auto_parse=True): from api.db.services.file_service import FileService @@ -300,5 +369,3 @@ class Connector2KbService(CommonService): ).dicts() ) - - diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py index 4782bf85de..c31d415189 100644 --- a/api/db/services/document_service.py +++ b/api/db/services/document_service.py @@ -373,6 +373,25 @@ class DocumentService(CommonService): offset += limit return res + @classmethod + @DB.connection_context() + def list_doc_headers_by_kb_and_source_type(cls, kb_id, source_type, page_size=500): + fields = [cls.model.id, cls.model.kb_id, cls.model.source_type, cls.model.name] + docs = cls.model.select(*fields).where( + cls.model.kb_id == kb_id, + cls.model.source_type == source_type, + ).order_by(cls.model.create_time.asc()) + offset = 0 + res = [] + while True: + doc_batch = docs.offset(offset).limit(page_size) + _temp = list(doc_batch.dicts()) + if not _temp: + break + res.extend(_temp) + offset += page_size + return res + @classmethod @DB.connection_context() def get_all_docs_by_creator_id(cls, creator_id): diff --git a/common/data_source/github/connector.py b/common/data_source/github/connector.py index 6a9b96740b..258e2cf8b4 100644 --- a/common/data_source/github/connector.py +++ b/common/data_source/github/connector.py @@ -28,14 +28,20 @@ from common.data_source.exceptions import ( InsufficientPermissionsError, UnexpectedValidationError, ) -from common.data_source.interfaces import CheckpointedConnectorWithPermSyncGH, CheckpointOutput +from common.data_source.interfaces import ( + CheckpointedConnectorWithPermSyncGH, + CheckpointOutput, + CheckpointOutputWrapper, +) from common.data_source.models import ( ConnectorCheckpoint, ConnectorFailure, Document, DocumentFailure, ExternalAccess, + GenerateSlimDocumentOutput, SecondsSinceUnixEpoch, + SlimDocument, ) from common.data_source.connector_runner import ConnectorRunner from .models import SerializedRepository @@ -594,14 +600,8 @@ class GithubConnector(CheckpointedConnectorWithPermSyncGH[GithubConnectorCheckpo done_with_prs = False num_prs = 0 pr = None - print("start: ", start) for pr in pr_batch: num_prs += 1 - print("-"*40) - print("PR name", pr.title) - print("updated at", pr.updated_at) - print("-"*40) - print("\n") # we iterate backwards in time, so at this point we stop processing prs if ( start is not None @@ -732,10 +732,10 @@ class GithubConnector(CheckpointedConnectorWithPermSyncGH[GithubConnectorCheckpo if checkpoint.cached_repo_ids: logging.info( - f"{len(checkpoint.cached_repo_ids)} repos remaining (IDs: {checkpoint.cached_repo_ids})" + f"{len(checkpoint.cached_repo_ids)} checkpoint repos remaining (IDs: {checkpoint.cached_repo_ids})" ) else: - logging.info("No more repos remaining") + logging.info("There are no more checkpoint repos left.") return checkpoint @@ -923,6 +923,53 @@ class GithubConnector(CheckpointedConnectorWithPermSyncGH[GithubConnectorCheckpo ) -> GithubConnectorCheckpoint: return GithubConnectorCheckpoint.model_validate_json(checkpoint_json) + def retrieve_slim_document( + self, + start: SecondsSinceUnixEpoch | None = None, + end: SecondsSinceUnixEpoch | None = None, + callback: Any = None, + ) -> GenerateSlimDocumentOutput: + start_value = 0.0 if start is None else start + end_value = ( + datetime.now(timezone.utc).timestamp() if end is None else end + ) + checkpoint = self.build_dummy_checkpoint() + slim_batch: list[SlimDocument] = [] + + while checkpoint.has_more: + wrapper = CheckpointOutputWrapper[GithubConnectorCheckpoint]() + for document, failure, next_checkpoint in wrapper( + self.load_from_checkpoint(start_value, end_value, checkpoint) + ): + if failure is not None: + logging.warning( + "GitHub connector failure during slim retrieval: %s", + getattr(failure, "failure_message", failure), + ) + continue + + if document is not None: + slim_batch.append(SlimDocument(id=document.id)) + if len(slim_batch) >= SLIM_BATCH_SIZE: + yield slim_batch + slim_batch = [] + if callback: + callback.progress("github_slim_document", 1) + + if next_checkpoint is not None: + checkpoint = next_checkpoint + + if slim_batch: + yield slim_batch + + def retrieve_all_slim_docs_perm_sync( + self, + start: SecondsSinceUnixEpoch | None = None, + end: SecondsSinceUnixEpoch | None = None, + callback: Any = None, + ) -> GenerateSlimDocumentOutput: + yield from self.retrieve_slim_document(start=start, end=end, callback=callback) + def build_dummy_checkpoint(self) -> GithubConnectorCheckpoint: return GithubConnectorCheckpoint( stage=GithubConnectorStage.PRS, curr_page=0, has_more=True, num_retrieved=0 @@ -970,4 +1017,4 @@ if __name__ == "__main__": if failure: print(f"Failure: {failure.failure_message}") if next_checkpoint: - checkpoint = next_checkpoint \ No newline at end of file + checkpoint = next_checkpoint diff --git a/rag/svr/sync_data_source.py b/rag/svr/sync_data_source.py index 4b60780190..697940d714 100644 --- a/rag/svr/sync_data_source.py +++ b/rag/svr/sync_data_source.py @@ -20,7 +20,7 @@ import time -start_ts = time.time() +start_ts = time.perf_counter() import asyncio import copy @@ -38,6 +38,7 @@ from flask import json from api.utils.common import hash128 from api.db.services.connector_service import ConnectorService, SyncLogsService +from api.db.services.document_service import DocumentService from api.db.services.knowledgebase_service import KnowledgebaseService from common import settings from common.config_utils import show_configs @@ -84,6 +85,38 @@ class SyncBase: def __init__(self, conf: dict) -> None: self.conf = conf + @staticmethod + def _format_window_boundary(value: datetime | None) -> str: + if value is None: + return "beginning" + return value.astimezone().strftime("%Y-%m-%d %H:%M:%S %Z") + + @classmethod + def window_info(cls, task: dict) -> str: + window_start = None + if task.get("reindex") != "1" and task.get("poll_range_start"): + window_start = task["poll_range_start"] + window_end = datetime.now(timezone.utc) + return ( + f"sync window: {cls._format_window_boundary(window_start)}" + f" -> {cls._format_window_boundary(window_end)}" + ) + + @classmethod + def log_connection( + cls, + name: str, + details: str, + task: dict, + extra: str = "", + ): + if task.get("skip_connection_log"): + return + if extra: + logging.info("Connect to %s: %s, %s, %s", name, details, cls.window_info(task), extra) + return + logging.info("Connect to %s: %s, %s", name, details, cls.window_info(task)) + async def __call__(self, task: dict): SyncLogsService.start(task["id"], task["connector_id"]) @@ -111,11 +144,29 @@ class SyncBase: SyncLogsService.schedule(task["connector_id"], task["kb_id"], task["poll_range_start"]) async def _run_task_logic(self, task: dict): - document_batch_generator = await self._generate(task) + generate_output = await self._generate(task) + # `_generate()` currently supports two outputs: + # 1. `document_batch_generator` + # 2. `(document_batch_generator, file_list)` + if isinstance(generate_output, tuple): + document_batch_generator, file_list = generate_output + else: + document_batch_generator = generate_output + file_list = None - doc_num = 0 failed_docs = 0 + added_docs = 0 + updated_docs = 0 + removed_docs = 0 next_update = datetime(1970, 1, 1, tzinfo=timezone.utc) + source_type = f"{self.SOURCE_NAME}/{task['connector_id']}" + existing_doc_ids = { + doc["id"] + for doc in DocumentService.list_doc_headers_by_kb_and_source_type( + task["kb_id"], + source_type, + ) + } if task["poll_range_start"]: next_update = task["poll_range_start"] @@ -154,8 +205,12 @@ class SyncBase: task["id"], max_update, len(docs), "\n".join(err), len(err) ) - - doc_num += len(docs) + changed_doc_ids = set(dids) + updated_in_batch = len(changed_doc_ids & existing_doc_ids) + added_in_batch = len(changed_doc_ids) - updated_in_batch + added_docs += added_in_batch + updated_docs += updated_in_batch + existing_doc_ids.update(changed_doc_ids) except Exception as batch_ex: msg = str(batch_ex) @@ -170,10 +225,26 @@ class SyncBase: continue prefix = self._get_source_prefix() + prefix = f"{prefix} " if prefix else "" + next_update_info = self._format_window_boundary(next_update) + if file_list is not None: + removed_docs, _ = ConnectorService.cleanup_stale_documents_for_task( + task["id"], + task["connector_id"], + task["kb_id"], + task["tenant_id"], + file_list, + ) + + total_changed_docs = added_docs + updated_docs + removed_docs + summary = ( + f"{prefix}sync summary till {next_update_info}: " + f"total={total_changed_docs}, added={added_docs}, " + f"updated={updated_docs}, deleted={removed_docs}" + ) if failed_docs > 0: - logging.info(f"{prefix}{doc_num} docs synchronized till {next_update} ({failed_docs} skipped)") - else: - logging.info(f"{prefix}{doc_num} docs synchronized till {next_update}") + summary = f"{summary}, skipped={failed_docs}" + logging.info(summary) SyncLogsService.done(task["id"], task["connector_id"]) task["poll_range_start"] = next_update @@ -354,7 +425,7 @@ class Confluence(SyncBase): for batch in document_batches(): yield batch - logging.info("Connect to Confluence: {} {}".format(self.conf["wiki_base"], begin_info)) + self.log_connection("Confluence", self.conf["wiki_base"], task) return wrapper() @@ -373,7 +444,7 @@ class Notion(SyncBase): begin_info = "totally" if task["reindex"] == "1" or not task["poll_range_start"] else "from {}".format( task["poll_range_start"]) - logging.info("Connect to Notion: root({}) {}".format(self.conf["root_page_id"], begin_info)) + self.log_connection("Notion", f"root({self.conf['root_page_id']})", task) return document_generator @@ -401,7 +472,7 @@ class Discord(SyncBase): begin_info = "totally" if task["reindex"] == "1" or not task["poll_range_start"] else "from {}".format( task["poll_range_start"]) - logging.info("Connect to Discord: servers({}), channel({}) {}".format(server_ids, channel_names, begin_info)) + self.log_connection("Discord", f"servers({server_ids}), channel({channel_names})", task) return document_generator @@ -465,7 +536,7 @@ class Gmail(SyncBase): admin_email = self.connector.primary_admin_email except RuntimeError: admin_email = "unknown" - logging.info(f"Connect to Gmail as {admin_email} {begin_info}") + self.log_connection("Gmail", f"as {admin_email}", task) return document_generator @@ -486,7 +557,7 @@ class Dropbox(SyncBase): ) begin_info = f"from {poll_start}" - logging.info(f"[Dropbox] Connect to Dropbox {begin_info}") + self.log_connection("Dropbox", "workspace", task) return document_generator @@ -564,7 +635,7 @@ class GoogleDrive(SyncBase): admin_email = self.connector.primary_admin_email except RuntimeError: admin_email = "unknown" - logging.info(f"Connect to Google Drive as {admin_email} {begin_info}") + self.log_connection("Google Drive", f"as {admin_email}", task) return document_batches() def _persist_rotated_credentials(self, connector_id: str, credentials: dict[str, Any]) -> None: @@ -663,14 +734,14 @@ class Jira(SyncBase): if pending_docs: yield pending_docs - logging.info( - "[Jira] Connect to Jira %s %s (start=%s, end=%s, sync_batch_size=%s, overlap_buffer_s=%s)", + self.log_connection( + "Jira", connector_kwargs["jira_base_url"], - begin_info, - start_time, - end_time, - batch_size, - getattr(self.connector, "time_buffer_seconds", connector_kwargs.get("time_buffer_seconds")), + task, + ( + f"sync_batch_size={batch_size}, " + f"overlap_buffer_s={getattr(self.connector, 'time_buffer_seconds', connector_kwargs.get('time_buffer_seconds'))}" + ), ) return document_batches() @@ -715,24 +786,16 @@ class WebDAV(SyncBase): self.connector.set_allow_images(self.conf.get("allow_images", False)) self.connector.load_credentials(self.conf["credentials"]) - logging.info(f"Task info: reindex={task['reindex']}, poll_range_start={task['poll_range_start']}") - if task["reindex"] == "1" or not task["poll_range_start"]: - logging.info("Using load_from_state (full sync)") document_batch_generator = self.connector.load_from_state() begin_info = "totally" else: start_ts = task["poll_range_start"].timestamp() end_ts = datetime.now(timezone.utc).timestamp() - logging.info(f"Polling WebDAV from {task['poll_range_start']} (ts: {start_ts}) to now (ts: {end_ts})") document_batch_generator = self.connector.poll_source(start_ts, end_ts) begin_info = "from {}".format(task["poll_range_start"]) - logging.info("Connect to WebDAV: {}(path: {}) {}".format( - self.conf["base_url"], - self.conf.get("remote_path", "/"), - begin_info - )) + self.log_connection("WebDAV", f"{self.conf['base_url']}(path: {self.conf.get('remote_path', '/')})", task) def wrapper(): for document_batch in document_batch_generator: @@ -765,7 +828,7 @@ class Moodle(SyncBase): ) begin_info = f"from {poll_start}" - logging.info("Connect to Moodle: {} {}".format(self.conf["moodle_url"], begin_info)) + self.log_connection("Moodle", self.conf["moodle_url"], task) return document_generator @@ -804,7 +867,7 @@ class BOX(SyncBase): datetime.now(timezone.utc).timestamp(), ) begin_info = f"from {poll_start}" - logging.info("Connect to Box: folder_id({}) {}".format(self.conf["folder_id"], begin_info)) + self.log_connection("Box", f"folder_id({self.conf['folder_id']})", task) return document_generator @@ -841,11 +904,10 @@ class Airtable(SyncBase): ) begin_info = f"from {poll_start}" - logging.info( - "Connect to Airtable: base_id(%s), table(%s) %s", - self.conf.get("base_id"), - self.conf.get("table_name_or_id"), - begin_info, + self.log_connection( + "Airtable", + f"base_id({self.conf.get('base_id')}), table({self.conf.get('table_name_or_id')})", + task, ) return document_generator @@ -882,12 +944,10 @@ class Asana(SyncBase): ) begin_info = f"from {poll_start}" - logging.info( - "Connect to Asana: workspace_id(%s), project_ids(%s), team_id(%s) %s", - self.conf.get("asana_workspace_id"), - self.conf.get("asana_project_ids"), - self.conf.get("asana_team_id"), - begin_info, + self.log_connection( + "Asana", + f"workspace_id({self.conf.get('asana_workspace_id')}), project_ids({self.conf.get('asana_project_ids')}), team_id({self.conf.get('asana_team_id')})", + task, ) return document_generator @@ -916,12 +976,17 @@ class Github(SyncBase): {"github_access_token": credentials["github_access_token"]} ) + file_list = None if task.get("reindex") == "1" or not task.get("poll_range_start"): start_time = datetime.fromtimestamp(0, tz=timezone.utc) begin_info = "totally" else: start_time = task.get("poll_range_start") begin_info = f"from {start_time}" + if self.conf.get("sync_deleted_files"): + file_list = [] + for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync(): + file_list.extend(slim_batch) end_time = datetime.now(timezone.utc) @@ -952,14 +1017,13 @@ class Github(SyncBase): for batch in document_batches(): yield batch - logging.info( - "Connect to Github: org_name(%s), repo_names(%s) for %s", - self.conf.get("repository_owner"), - self.conf.get("repository_name"), - begin_info, + self.log_connection( + "Github", + f"org_name({self.conf.get('repository_owner')}), repo_names({self.conf.get('repository_name')})", + task, ) - return wrapper() + return wrapper(), file_list class IMAP(SyncBase): SOURCE_NAME: str = FileSource.IMAP @@ -1020,13 +1084,10 @@ class IMAP(SyncBase): for batch in document_batches(): yield batch - logging.info( - "Connect to IMAP: host(%s) port(%s) user(%s) folder(%s) %s", - self.conf["imap_host"], - self.conf["imap_port"], - self.conf["credentials"]["imap_username"], - self.conf["imap_mailbox"], - begin_info + self.log_connection( + "IMAP", + f"host({self.conf['imap_host']}) port({self.conf['imap_port']}) user({self.conf['credentials']['imap_username']}) folder({self.conf['imap_mailbox']})", + task, ) return wrapper() @@ -1102,11 +1163,7 @@ class Zendesk(SyncBase): for batch in document_batches(): yield batch - logging.info( - "Connect to Zendesk: subdomain(%s) %s", - self.conf['credentials'].get("zendesk_subdomain"), - begin_info, - ) + self.log_connection("Zendesk", f"subdomain({self.conf['credentials'].get('zendesk_subdomain')})", task) return wrapper() @@ -1148,7 +1205,7 @@ class Gitlab(SyncBase): datetime.now(timezone.utc).timestamp() ) begin_info = "from {}".format(poll_start) - logging.info("Connect to Gitlab: ({}) {}".format(self.conf["project_name"], begin_info)) + self.log_connection("Gitlab", f"({self.conf['project_name']})", task) return document_generator @@ -1204,11 +1261,7 @@ class Bitbucket(SyncBase): for batch in document_batches(): yield batch - logging.info( - "Connect to Bitbucket: workspace(%s), %s", - self.conf.get("workspace"), - begin_info, - ) + self.log_connection("Bitbucket", f"workspace({self.conf.get('workspace')})", task) return wrapper() @@ -1246,10 +1299,7 @@ class SeaFile(SyncBase): if scope == "directory": extra += f" path={conf.get('sync_path')}" - logging.info( - "Connect to SeaFile: %s (scope=%s%s) %s", - conf["seafile_url"], scope, extra, begin_info, - ) + self.log_connection("SeaFile", f"{conf['seafile_url']} (scope={scope}{extra})", task) return document_generator @@ -1286,11 +1336,10 @@ class DingTalkAITable(SyncBase): ) begin_info = f"from {poll_start}" - logging.info( - "Connect to DingTalk AI Table: table_id(%s), operator_id(%s) %s", - self.conf.get("table_id"), - self.conf.get("operator_id"), - begin_info, + self.log_connection( + "DingTalk AI Table", + f"table_id({self.conf.get('table_id')}), operator_id({self.conf.get('operator_id')})", + task, ) return document_generator @@ -1331,7 +1380,7 @@ class MySQL(SyncBase): ) begin_info = f"from {poll_start}" - logging.info(f"[MySQL] Connect to {self.conf.get('host')}:{self.conf.get('database')} {begin_info}") + self.log_connection("MySQL", f"{self.conf.get('host')}:{self.conf.get('database')}", task) return document_generator @@ -1370,7 +1419,7 @@ class PostgreSQL(SyncBase): ) begin_info = f"from {poll_start}" - logging.info(f"[PostgreSQL] Connect to {self.conf.get('host')}:{self.conf.get('database')} {begin_info}") + self.log_connection("PostgreSQL", f"{self.conf.get('host')}:{self.conf.get('database')}", task) return document_generator @@ -1470,7 +1519,7 @@ async def main(): signal.signal(signal.SIGINT, signal_handler) signal.signal(signal.SIGTERM, signal_handler) - logging.info(f"RAGFlow data sync is ready after {time.time() - start_ts}s initialization.") + logging.info(f"RAGFlow data sync is ready after {time.perf_counter() - start_ts}s initialization.") while not stop_event.is_set(): await dispatch_tasks() logging.error("BUG!!! You should not reach here!!!") diff --git a/web/src/components/dynamic-form.tsx b/web/src/components/dynamic-form.tsx index 864eefbd02..5c9fff5eaf 100644 --- a/web/src/components/dynamic-form.tsx +++ b/web/src/components/dynamic-form.tsx @@ -22,14 +22,7 @@ import EditTag from '@/components/edit-tag'; import { SelectWithSearch } from '@/components/originui/select-with-search'; import { RAGFlowFormItem } from '@/components/ragflow-form'; import { Checkbox } from '@/components/ui/checkbox'; -import { - Form, - FormControl, - FormField, - FormItem, - FormLabel, - FormMessage, -} from '@/components/ui/form'; +import { Form } from '@/components/ui/form'; import { Input } from '@/components/ui/input'; import { Textarea } from '@/components/ui/textarea'; import { cn } from '@/lib/utils'; @@ -374,7 +367,9 @@ export const RenderField = ({ }, } : fieldProps; - return field.render?.(finalFieldProps); + return ( +
{field.render?.(finalFieldProps)}
+ ); }} ); @@ -503,64 +498,38 @@ export const RenderField = ({ case FormFieldType.Checkbox: return ( - ( - - {field.label && !field.horizontal && ( -
- - {field.label}{' '} - {field.required && ( - * - )} - -
- )} - {field.label && field.horizontal && ( -
- - {field.label}{' '} - {field.required && ( - * - )} - -
- )} - -
- { - formField.onChange(checked); - field.onChange?.(checked); - }} - disabled={field.disabled} - /> -
-
- - -
- )} - /> + + {(fieldProps) => { + const finalFieldProps = field.onChange + ? { + ...fieldProps, + onChange: (checked: boolean) => { + fieldProps.onChange(checked); + field.onChange?.(checked); + }, + } + : fieldProps; + return ( +
+ + finalFieldProps.onChange(Boolean(checked)) + } + disabled={field.disabled} + /> +
+ ); + }} +
); case FormFieldType.Switch: return ( diff --git a/web/src/locales/ar.ts b/web/src/locales/ar.ts index 5ad61a7b82..9680863aba 100644 --- a/web/src/locales/ar.ts +++ b/web/src/locales/ar.ts @@ -857,6 +857,7 @@ export default { selectModelPlaceholder: 'حدد النموذج', configureModelTitle: 'تكوين النموذج', connectorNameTip: 'اسم وصفي للموصل', + syncDeletedFiles: 'مزامنة الملفات المحذوفة', confluenceIsCloudTip: 'تحقق مما إذا كان هذا هو مثيل Confluence Cloud، قم بإلغاء تحديد Confluence Server/Data Center', confluenceWikiBaseUrlTip: diff --git a/web/src/locales/bg.ts b/web/src/locales/bg.ts index b976eeeb44..c3a9f86982 100644 --- a/web/src/locales/bg.ts +++ b/web/src/locales/bg.ts @@ -925,6 +925,7 @@ The above is the content you need to summarize.`, selectModelPlaceholder: 'Изберете модел', configureModelTitle: 'Конфигуриране на модел', connectorNameTip: 'Описателно име за конектора', + syncDeletedFiles: 'Синхронизирай изтритите файлове', confluenceIsCloudTip: 'Отметнете, ако това е Confluence Cloud инстанция, махнете за Confluence Server/Data Center', confluenceWikiBaseUrlTip: diff --git a/web/src/locales/de.ts b/web/src/locales/de.ts index 3770f18629..256d21869a 100644 --- a/web/src/locales/de.ts +++ b/web/src/locales/de.ts @@ -1011,6 +1011,7 @@ Beispiel: Virtual Hosted Style`, 'Kommagetrennte Repository-Slugs, z. B.: repo-one,repo-two', connectorNameTip: 'Geben Sie einen aussagekräftigen Namen für den Connector an', + syncDeletedFiles: 'Gelöschte Dateien synchronisieren', boxDescription: 'Verbinden Sie Ihr Box-Laufwerk, um Dateien und Ordner zu synchronisieren.', githubDescription: diff --git a/web/src/locales/en.ts b/web/src/locales/en.ts index f720a54f59..6582078d6a 100644 --- a/web/src/locales/en.ts +++ b/web/src/locales/en.ts @@ -979,6 +979,7 @@ This auto-tagging feature enhances retrieval by adding another layer of domain-s selectModelPlaceholder: 'Select model', configureModelTitle: 'Configure model', connectorNameTip: 'A descriptive name for the connector', + syncDeletedFiles: 'Sync deleted files', confluenceIsCloudTip: 'Check if this is a Confluence Cloud instance, uncheck for Confluence Server/Data Center', confluenceWikiBaseUrlTip: diff --git a/web/src/locales/ru.ts b/web/src/locales/ru.ts index e460204d6c..9ed1af9f9a 100644 --- a/web/src/locales/ru.ts +++ b/web/src/locales/ru.ts @@ -121,8 +121,7 @@ export default { 'Анализирует содержание диалога, извлекает ключевую информацию и формирует структурированные сводки памяти.', embeddingModelTooltip: 'Преобразует текст в числовые векторы для семантического поиска и извлечения памяти.', - embeddingModelError: - 'Тип памяти обязателен; тип «raw» нельзя удалить.', + embeddingModelError: 'Тип памяти обязателен; тип «raw» нельзя удалить.', memoryTypeTooltip: `Raw: исходный диалог пользователя и агента (обязателен по умолчанию). Семантическая память: обобщённые знания и факты о пользователе и мире. Эпизодическая память: события с привязкой ко времени. @@ -235,7 +234,8 @@ export default { 'Имя поля уже есть. Подтвердите объединение дубликатов.', fieldExists: 'Поле уже существует.', fieldSetting: 'Настройки поля', - changesAffectNewParses: 'Изменения применяются только к новым парсингам.', + changesAffectNewParses: + 'Изменения применяются только к новым парсингам.', // editMetadataForDataset: 'View and edit metadata for ', restrictDefinedValues: 'Ограничить заданными значениями', metadataGenerationSettings: 'Настройки генерации метаданных', @@ -250,8 +250,7 @@ export default { fieldName: 'Имя поля', editMetadata: 'Редактировать метаданные', addMetadata: 'Добавить метаданные', - deleteWarn: - 'Поле «{{field}}» будет удалено из всех связанных файлов', + deleteWarn: 'Поле «{{field}}» будет удалено из всех связанных файлов', deleteManageFieldAllWarn: 'Это поле и все его значения будут удалены из всех связанных файлов.', deleteManageValueAllWarn: @@ -264,7 +263,8 @@ export default { deleteSettingValueWarn: `Значение будет удалено; существующие метаданные не затронуты.`, }, redoAll: 'Очистите существующие фрагменты', - applyAutoMetadataSettings: 'Применить глобальные настройки авто-метаданных', + applyAutoMetadataSettings: + 'Применить глобальные настройки авто-метаданных', parseFileTip: 'Запустить парсинг?', parseFile: 'Парсить файл', emptyMetadata: 'Нет метаданных', @@ -539,7 +539,7 @@ export default { embeddingModelTip: 'Модель эмбеддингов по умолчанию для базы знаний. После появления чанков при смене модели система случайно выбирает несколько чанков для проверки совместимости, перекодирует их новой моделью и вычисляет косинусное сходство. Переключение возможно только если среднее сходство ≥ 0.9. Иначе удалите все чанки в датасете.', permissionsTip: - "Если установлено значение «Команда», все члены команды смогут управлять базой знаний.", + 'Если установлено значение «Команда», все члены команды смогут управлять базой знаний.', chunkTokenNumberTip: 'Это своего рода устанавливает порог токенов для создания чанка. Сегмент с меньшим количеством токенов, чем этот порог, будет объединен со следующими сегментами до тех пор, пока количество токенов не превысит порог, после чего будет создан чанк. Новый чанк не создается, если не встречен разделитель, даже если порог превышен.', chunkMethod: 'Метод чанкинга', @@ -858,7 +858,8 @@ export default { maxTokensTip: `Максимальный размер контекста модели; неверное значение вызовет ошибку. По умолчанию 512.`, maxTokensInvalidMessage: 'Пожалуйста, введите действительное число для макс. токенов.', - maxTokensMinMessage: 'Максимальное количество токенов не может быть меньше 0.', + maxTokensMinMessage: + 'Максимальное количество токенов не может быть меньше 0.', quote: 'Показать цитату', quoteTip: 'Отображать ли исходный текст как ссылку.', selfRag: 'Self-RAG', @@ -984,6 +985,7 @@ export default { selectModelPlaceholder: 'Выберите модель', configureModelTitle: 'Настроить модель', connectorNameTip: 'Укажите понятное имя для коннектора', + syncDeletedFiles: 'Синхронизировать удалённые файлы', confluenceIsCloudTip: 'Отметьте, если это экземпляр Confluence Cloud, снимите для Confluence Server/Data Center', confluenceWikiBaseUrlTip: @@ -1085,8 +1087,7 @@ export default { seafileAccountScopeTip: 'Синхронизирует все библиотеки, видимые для указанного ниже токена API аккаунта.', seafileTokenPanelHeading: 'Укажите один из способов аутентификации:', - seafileTokenPanelAccountBullet: - '— доступ ко всем вашим библиотекам.', + seafileTokenPanelAccountBullet: '— доступ ко всем вашим библиотекам.', seafileTokenPanelLibraryBullet: '— только к одной библиотеке (безопаснее).', seafileValidationAccountTokenRequired: @@ -1195,7 +1196,8 @@ export default { maxTokensTip: `Максимальный размер контекста mодель; недопустимое или неверное значение приведёт к ошибке. По умолчанию 512.`, maxTokensInvalidMessage: 'Пожалуйста, введите действительное число для максимального количества токенов.', - maxTokensMinMessage: 'Максимальное количество токенов не может быть меньше 0.', + maxTokensMinMessage: + 'Максимальное количество токенов не может быть меньше 0.', password: 'Пароль', passwordDescription: 'Пожалуйста, введите ваш текущий пароль, чтобы изменить ваш пароль.', @@ -2792,8 +2794,7 @@ export default { noSessionsFound: 'Сессии не найдены', createFirstSession: 'Создайте первую сессию', noCanvasFound: 'Канвасы не найдены', - deleteSelectedConfirm: - 'Удалить выбранные сеансы ({{count}})?', + deleteSelectedConfirm: 'Удалить выбранные сеансы ({{count}})?', batchDeleteSessions: 'Удалить сессии', }, }, diff --git a/web/src/locales/tr.ts b/web/src/locales/tr.ts index 1bd067cafe..fa3eb96a78 100644 --- a/web/src/locales/tr.ts +++ b/web/src/locales/tr.ts @@ -970,6 +970,7 @@ Bu otomatik etiketleme özelliği, mevcut datasete alanına özgü bilgi katman selectModelPlaceholder: 'Model seçin', configureModelTitle: 'Modeli yapılandır', connectorNameTip: 'Bağlayıcı için açıklayıcı bir ad', + syncDeletedFiles: 'Silinen dosyaları eşitle', confluenceIsCloudTip: 'Bu bir Confluence Cloud örneği ise işaretleyin, Confluence Server/Data Center için işareti kaldırın', confluenceWikiBaseUrlTip: diff --git a/web/src/locales/zh-traditional.ts b/web/src/locales/zh-traditional.ts index 74dd30826f..f65599cff5 100644 --- a/web/src/locales/zh-traditional.ts +++ b/web/src/locales/zh-traditional.ts @@ -762,6 +762,7 @@ export default { '以英文逗號分隔的倉庫 slug,例如:repo-one,repo-two', bitbucketProjectsTip: '以英文逗號分隔的項目鍵,例如:PROJ1,PROJ2', connectorNameTip: '為連接器填寫一個有意義的名稱', + syncDeletedFiles: '同步刪除文件', }, message: { registered: '註冊成功', diff --git a/web/src/locales/zh.ts b/web/src/locales/zh.ts index 09cbeb8483..e24178eea2 100644 --- a/web/src/locales/zh.ts +++ b/web/src/locales/zh.ts @@ -929,6 +929,7 @@ General:实体和关系提取提示来自 GitHub - microsoft/graphrag:基于 bitbucketRepositorySlugsTip: '用英文逗号分隔的仓库 slug,例如:repo-one,repo-two', connectorNameTip: '为连接器命名', + syncDeletedFiles: '同步删除文件', githubDescription: '连接 GitHub,可同步 Pull Request 与 Issue 内容用于检索。', airtableDescription: '连接 Airtable,同步指定工作区下指定表格中的文件。', diff --git a/web/src/pages/user-setting/data-source/add-datasource-modal.tsx b/web/src/pages/user-setting/data-source/add-datasource-modal.tsx index 5196ef17b7..64824b8f9e 100644 --- a/web/src/pages/user-setting/data-source/add-datasource-modal.tsx +++ b/web/src/pages/user-setting/data-source/add-datasource-modal.tsx @@ -8,6 +8,9 @@ import { DataSourceFormBaseFields, DataSourceFormDefaultValues, DataSourceFormFields, + getCommonExtraDefaultValues, + getCommonExtraFields, + mergeDataSourceFormValues, } from './constant'; import { IDataSorceInfo } from './interface'; @@ -28,6 +31,7 @@ const AddDataSourceModal = ({ ...DataSourceFormFields[ sourceData.id as keyof typeof DataSourceFormFields ], + ...getCommonExtraFields(sourceData.id), ] as FormFieldConfig[]); } }, [sourceData]); @@ -59,9 +63,12 @@ const AddDataSourceModal = ({ console.log(data); }} defaultValues={ - DataSourceFormDefaultValues[ - sourceData?.id as keyof typeof DataSourceFormDefaultValues - ] as FieldValues + mergeDataSourceFormValues( + DataSourceFormDefaultValues[ + sourceData?.id as keyof typeof DataSourceFormDefaultValues + ] as FieldValues, + getCommonExtraDefaultValues(), + ) as FieldValues } labelClassName="font-normal" > diff --git a/web/src/pages/user-setting/data-source/component/box-token-field.tsx b/web/src/pages/user-setting/data-source/component/box-token-field.tsx index 40603551a7..eccdf2dda5 100644 --- a/web/src/pages/user-setting/data-source/component/box-token-field.tsx +++ b/web/src/pages/user-setting/data-source/component/box-token-field.tsx @@ -319,7 +319,7 @@ const BoxTokenField = ({ value, onChange }: BoxTokenFieldProps) => { ]); return ( -
+
{(hasConfigured || hasAuthorized) && (
@@ -342,7 +342,11 @@ const BoxTokenField = ({ value, onChange }: BoxTokenFieldProps) => {
)} - diff --git a/web/src/pages/user-setting/data-source/component/gmail-token-field.tsx b/web/src/pages/user-setting/data-source/component/gmail-token-field.tsx index 7df7ec6d55..186281d918 100644 --- a/web/src/pages/user-setting/data-source/component/gmail-token-field.tsx +++ b/web/src/pages/user-setting/data-source/component/gmail-token-field.tsx @@ -340,7 +340,7 @@ const GmailTokenField = ({ }, [resetDialog]); return ( -
+
{(credentialSummary || hasVerifiedTokens || hasUploadedButUnverified || diff --git a/web/src/pages/user-setting/data-source/component/google-drive-token-field.tsx b/web/src/pages/user-setting/data-source/component/google-drive-token-field.tsx index 1077a349c1..8d182fdbba 100644 --- a/web/src/pages/user-setting/data-source/component/google-drive-token-field.tsx +++ b/web/src/pages/user-setting/data-source/component/google-drive-token-field.tsx @@ -337,7 +337,7 @@ const GoogleDriveTokenField = ({ }, [resetDialog]); return ( -
+
{(credentialSummary || hasVerifiedTokens || hasUploadedButUnverified || diff --git a/web/src/pages/user-setting/data-source/constant/index.tsx b/web/src/pages/user-setting/data-source/constant/index.tsx index aad84d5777..80022cbc94 100644 --- a/web/src/pages/user-setting/data-source/constant/index.tsx +++ b/web/src/pages/user-setting/data-source/constant/index.tsx @@ -1,4 +1,4 @@ -import { FormFieldType } from '@/components/dynamic-form'; +import { FormFieldConfig, FormFieldType } from '@/components/dynamic-form'; import { IconFontFill } from '@/components/icon-font'; import SvgIcon from '@/components/svg-icon'; import { t, TFunction } from 'i18next'; @@ -46,6 +46,29 @@ export enum DataSourceKey { // TEAMS = 'teams', } +type DataSourceFeatureVisibility = { + syncDeletedFiles?: boolean; +}; + +type DataSourceFormValues = Record; + +export const DataSourceFeatureVisibilityMap = { + [DataSourceKey.GITHUB]: { + syncDeletedFiles: true, + }, +}; + +const isDataSourceFeatureVisible = ( + source?: DataSourceKey, + feature?: keyof DataSourceFeatureVisibility, +) => { + if (!source || !feature) { + return false; + } + + return Boolean(DataSourceFeatureVisibilityMap[source]?.[feature]); +}; + export const generateDataSourceInfo = (t: TFunction) => { return { [DataSourceKey.RSS]: { @@ -200,6 +223,30 @@ export const useDataSourceInfo = () => { return { dataSourceInfo }; }; +const isPlainObject = (value: unknown): value is DataSourceFormValues => + typeof value === 'object' && value !== null && !Array.isArray(value); + +export const mergeDataSourceFormValues = ( + ...values: Array +): DataSourceFormValues => + values.reduce((result, current) => { + if (!current) { + return result; + } + + const next = { ...result }; + + Object.entries(current).forEach(([key, value]) => { + if (isPlainObject(value) && isPlainObject(next[key])) { + next[key] = mergeDataSourceFormValues(next[key], value); + } else { + next[key] = value; + } + }); + + return next; + }, {}); + export const DataSourceFormBaseFields = [ { id: 'Id', @@ -227,6 +274,26 @@ export const DataSourceFormBaseFields = [ })), }, ]; + +export const getCommonExtraFields = ( + source?: DataSourceKey, +): FormFieldConfig[] => [ + { + label: t('setting.syncDeletedFiles'), + name: 'config.sync_deleted_files', + type: FormFieldType.Checkbox, + required: false, + defaultValue: false, + shouldRender: () => isDataSourceFeatureVisible(source, 'syncDeletedFiles'), + }, +]; + +export const getCommonExtraDefaultValues = () => ({ + config: { + sync_deleted_files: false, + }, +}); + export const DataSourceFormFields = { [DataSourceKey.RSS]: [ { diff --git a/web/src/pages/user-setting/data-source/data-source-detail-page/index.tsx b/web/src/pages/user-setting/data-source/data-source-detail-page/index.tsx index 89a54093d0..63ea3ff4d0 100644 --- a/web/src/pages/user-setting/data-source/data-source-detail-page/index.tsx +++ b/web/src/pages/user-setting/data-source/data-source-detail-page/index.tsx @@ -18,6 +18,9 @@ import { DataSourceFormBaseFields, DataSourceFormDefaultValues, DataSourceFormFields, + getCommonExtraDefaultValues, + getCommonExtraFields, + mergeDataSourceFormValues, useDataSourceInfo, } from '../constant'; import { @@ -166,6 +169,7 @@ const SourceDetailPage = () => { ...DataSourceFormFields[ detail.source as keyof typeof DataSourceFormFields ], + ...getCommonExtraFields(detail.source), ...customFields, ] as FormFieldConfig[]; @@ -179,10 +183,13 @@ const SourceDetailPage = () => { setFields(newFields); const defaultValueTemp = { - ...(DataSourceFormDefaultValues[ - detail?.source as keyof typeof DataSourceFormDefaultValues - ] as FieldValues), - ...detail, + ...mergeDataSourceFormValues( + DataSourceFormDefaultValues[ + detail?.source as keyof typeof DataSourceFormDefaultValues + ] as FieldValues, + getCommonExtraDefaultValues(), + detail as FieldValues, + ), }; console.log('defaultValue', defaultValueTemp); setDefaultValues(defaultValueTemp);