From 59bb184e63e47b7cb706a104f8cc3096bca02821 Mon Sep 17 00:00:00 2001 From: Jack Storment <88656337+jack-stormentswe@users.noreply.github.com> Date: Thu, 7 May 2026 11:44:46 +0200 Subject: [PATCH] feat(moodle): support deleted-file sync (#14548) Fixes #14551 ### What problem does this PR solve? The Moodle connector did not let the sync runner clean up indexed documents that were deleted from the source. Other connectors such as dropbox, seafile, webdav, and rss already do this through a slim snapshot pass. This PR adds the same support for Moodle. When `sync_deleted_files` is on, the runner now asks the Moodle connector for a lightweight list of every module id that could be indexed. The runner then compares this list with the index and removes any indexed document whose id is not in the list. The slim pass does not download files. It only goes through courses and modules and yields ids. The id format matches the ids that the loader produces, so the match is exact. ### Type of change - [x] New Feature (non-breaking change which adds functionality) ### Notes - `MoodleConnector` now also implements `SlimConnectorWithPermSync`. - New `retrieve_all_slim_docs_perm_sync` yields slim docs with the same ids the loader uses (`moodle_resource_`, `moodle_forum_`, `moodle_page_`, `moodle_book_`, `moodle_assign_`, `moodle_quiz_`). - The `Moodle` sync class now returns `(document_generator, file_list)` so the runner can do the cleanup. If the slim snapshot fails, `file_list` is set back to `None` and the run continues without cleanup. - The web data source map exposes `syncDeletedFiles` for Moodle so the option shows up in the UI. ### How was this tested? - `ruff check` passes on the changed Python files. - Manual review of the produced slim ids against the ids the loader builds in `_process_resource`, `_process_forum`, `_process_page`, `_process_book`, and `_process_activity`. - Behavior parity with the merged dropbox (#14476), seafile (#14499), webdav (#14491), and rss (#14493) PRs. --- common/data_source/moodle_connector.py | 81 ++++++++++++++++++- rag/svr/sync_data_source.py | 25 +++++- .../chat/app-settings/chat-basic-settings.tsx | 17 +++- web/src/pages/next-search/search-setting.tsx | 31 ++++--- .../data-source/constant/index.tsx | 3 + web/src/utils/tests/chat.test.ts | 2 +- 6 files changed, 140 insertions(+), 19 deletions(-) diff --git a/common/data_source/moodle_connector.py b/common/data_source/moodle_connector.py index 39efcf07be..850ce5815d 100644 --- a/common/data_source/moodle_connector.py +++ b/common/data_source/moodle_connector.py @@ -21,14 +21,19 @@ from common.data_source.interfaces import ( LoadConnector, PollConnector, SecondsSinceUnixEpoch, + SlimConnectorWithPermSync, +) +from common.data_source.models import ( + Document, + GenerateSlimDocumentOutput, + SlimDocument, ) -from common.data_source.models import Document from common.data_source.utils import batch_generator, rl_requests logger = logging.getLogger(__name__) -class MoodleConnector(LoadConnector, PollConnector): +class MoodleConnector(LoadConnector, PollConnector, SlimConnectorWithPermSync): """Moodle LMS connector for accessing course content""" def __init__(self, moodle_url: str, batch_size: int = INDEX_BATCH_SIZE) -> None: @@ -137,6 +142,78 @@ class MoodleConnector(LoadConnector, PollConnector): self._get_updated_content(courses, start, end) ) + @staticmethod + def _slim_doc_id_for_module(module) -> Optional[str]: + """Return the indexed document id for a Moodle module, or None. + + The id format must match the ones produced by the _process_* + helpers below. Module types that we never ingest (label, url) and + modules with no id return None. + """ + mtype = getattr(module, "modname", None) + mid = getattr(module, "id", None) + if not mtype or mid is None: + return None + if mtype in ("label", "url"): + return None + if mtype == "resource": + return f"moodle_resource_{mid}" + if mtype == "forum": + return f"moodle_forum_{mid}" + if mtype == "page": + return f"moodle_page_{mid}" + if mtype == "book": + return f"moodle_book_{mid}" + if mtype in ("assign", "quiz"): + return f"moodle_{mtype}_{mid}" + return None + + def retrieve_all_slim_docs_perm_sync( + self, + callback: Any = None, + ) -> GenerateSlimDocumentOutput: + """List the ids of every Moodle module that could be indexed. + + This is a lightweight pass over courses and modules with no file + downloads. The caller compares the returned ids against the index + and removes any indexed document whose id is not in this list. + """ + del callback + if not self.moodle_client: + raise ConnectorMissingCredentialError("Moodle client not initialized") + + logger.info("Starting Moodle slim snapshot for stale-document cleanup") + courses = self._get_enrolled_courses() + if not courses: + logger.warning("No courses found for slim snapshot") + return + + batch: list[SlimDocument] = [] + total = 0 + for course in courses: + try: + contents = self._get_course_contents(course.id) + for section in contents: + for module in section.modules: + slim_id = self._slim_doc_id_for_module(module) + if slim_id is None: + continue + batch.append(SlimDocument(id=slim_id)) + total += 1 + if len(batch) >= self.batch_size: + yield batch + batch = [] + except Exception as e: + self._log_error( + f"slim snapshot for course {getattr(course, 'fullname', '?')}", + e, + ) + + if batch: + yield batch + + logger.info(f"Moodle slim snapshot completed: {total} documents listed") + @retry(tries=3, delay=1, backoff=2) def _get_enrolled_courses(self) -> list: if not self.moodle_client: diff --git a/rag/svr/sync_data_source.py b/rag/svr/sync_data_source.py index 697e3d5dee..b5801905db 100644 --- a/rag/svr/sync_data_source.py +++ b/rag/svr/sync_data_source.py @@ -976,19 +976,40 @@ class Moodle(SyncBase): # Determine the time range for synchronization based on reindex or poll_range_start poll_start = task.get("poll_range_start") + file_list = None if task["reindex"] == "1" or poll_start is None: document_generator = self.connector.load_from_state() _begin_info = "totally" else: + # Freeze the poll end time BEFORE the slim snapshot so that the + # snapshot and the poll cover the same point in time. Without + # this, a module created between the snapshot and the poll + # could be polled as new and at the same time be missing from + # the slim list, which would mark it as stale and delete it. + end_ts = datetime.now(timezone.utc).timestamp() + + if self.conf.get("sync_deleted_files"): + file_list = [] + try: + for slim_batch in self.connector.retrieve_all_slim_docs_perm_sync(): + file_list.extend(slim_batch) + except Exception: + logging.exception( + "Moodle slim snapshot failed; skipping stale-document cleanup " + "(connector_id=%s, kb_id=%s)", + task.get("connector_id"), + task.get("kb_id"), + ) + file_list = None document_generator = self.connector.poll_source( poll_start.timestamp(), - datetime.now(timezone.utc).timestamp(), + end_ts, ) _begin_info = f"from {poll_start}" self.log_connection("Moodle", self.conf["moodle_url"], task) - return document_generator + return document_generator, file_list class BOX(SyncBase): diff --git a/web/src/pages/next-chats/chat/app-settings/chat-basic-settings.tsx b/web/src/pages/next-chats/chat/app-settings/chat-basic-settings.tsx index 5794787d9a..c0715c7846 100644 --- a/web/src/pages/next-chats/chat/app-settings/chat-basic-settings.tsx +++ b/web/src/pages/next-chats/chat/app-settings/chat-basic-settings.tsx @@ -42,14 +42,23 @@ export default function ChatBasicSetting() { }, [metadataKeys]); useEffect(() => { - const currentFields = form.getValues('prompt_config.reference_metadata.fields'); - if (metadataInclude && Array.isArray(currentFields) && currentFields.length > 0 && metadataKeys) { - const validFields = currentFields.filter((field) => metadataKeys.includes(field)); + const currentFields = form.getValues( + 'prompt_config.reference_metadata.fields', + ); + if ( + metadataInclude && + Array.isArray(currentFields) && + currentFields.length > 0 && + metadataKeys + ) { + const validFields = currentFields.filter((field) => + metadataKeys.includes(field), + ); if (validFields.length !== currentFields.length) { form.setValue('prompt_config.reference_metadata.fields', validFields); } } else if (!metadataInclude) { - form.setValue('prompt_config.reference_metadata.fields', undefined); + form.setValue('prompt_config.reference_metadata.fields', undefined); } }, [kbIds, metadataKeys, metadataInclude, form]); diff --git a/web/src/pages/next-search/search-setting.tsx b/web/src/pages/next-search/search-setting.tsx index d9a381782d..c3c812306d 100644 --- a/web/src/pages/next-search/search-setting.tsx +++ b/web/src/pages/next-search/search-setting.tsx @@ -26,11 +26,7 @@ import { MultiSelect } from '@/components/ui/multi-select'; import { RAGFlowSelect } from '@/components/ui/select'; import { Spin } from '@/components/ui/spin'; import { Switch } from '@/components/ui/switch'; -import { Textarea } from '@/components/ui/textarea'; -import { - useFetchKnowledgeList, - useFetchKnowledgeMetadataKeys, -} from '@/hooks/use-knowledge-request'; +import { useFetchKnowledgeMetadataKeys } from '@/hooks/use-knowledge-request'; import { useComposeLlmOptionsByModelTypes, useSelectLlmOptionsByModelType, @@ -232,14 +228,29 @@ const SearchSetting: React.FC = ({ }, [metadataKeys]); useEffect(() => { - const currentFields = formMethods.getValues('search_config.reference_metadata.fields'); - if (referenceMetadataEnabled && Array.isArray(currentFields) && currentFields.length > 0 && metadataKeys) { - const validFields = currentFields.filter((field) => metadataKeys.includes(field)); + const currentFields = formMethods.getValues( + 'search_config.reference_metadata.fields', + ); + if ( + referenceMetadataEnabled && + Array.isArray(currentFields) && + currentFields.length > 0 && + metadataKeys + ) { + const validFields = currentFields.filter((field) => + metadataKeys.includes(field), + ); if (validFields.length !== currentFields.length) { - formMethods.setValue('search_config.reference_metadata.fields', validFields); + formMethods.setValue( + 'search_config.reference_metadata.fields', + validFields, + ); } } else if (!referenceMetadataEnabled) { - formMethods.setValue('search_config.reference_metadata.fields', undefined); + formMethods.setValue( + 'search_config.reference_metadata.fields', + undefined, + ); } }, [selectedKbIds, metadataKeys, referenceMetadataEnabled, formMethods]); diff --git a/web/src/pages/user-setting/data-source/constant/index.tsx b/web/src/pages/user-setting/data-source/constant/index.tsx index 0aae8868c5..0a5eb8c429 100644 --- a/web/src/pages/user-setting/data-source/constant/index.tsx +++ b/web/src/pages/user-setting/data-source/constant/index.tsx @@ -126,6 +126,9 @@ export const DataSourceFeatureVisibilityMap: Partial< [DataSourceKey.RSS]: { syncDeletedFiles: true, }, + [DataSourceKey.MOODLE]: { + syncDeletedFiles: true, + }, [DataSourceKey.MYSQL]: { syncDeletedFiles: true, }, diff --git a/web/src/utils/tests/chat.test.ts b/web/src/utils/tests/chat.test.ts index 82b4780719..a55ace5cb6 100644 --- a/web/src/utils/tests/chat.test.ts +++ b/web/src/utils/tests/chat.test.ts @@ -23,4 +23,4 @@ test('handles mixed double-escaped delimiters with HTML entities', () => { test('passes through already correct single-escaped delimiters unchanged', () => { const result = preprocessLaTeX('\\(x = 1\\)'); expect(result).toBe('$x = 1$'); -}); \ No newline at end of file +});