# ragflow/api/db/services/file_commit_service.py

#
#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

import datetime
import difflib
import hashlib
import json
import logging
from typing import Optional

from api.db.db_models import DB, FileCommit, FileCommitItem, File, User
from api.db.services.common_service import CommonService
from api.db.services.file_service import FileService
from common import settings
from common.misc_utils import get_uuid
from common.time_utils import current_timestamp, datetime_format

# Module-level logger named after this module's import path.
logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------
# Artifact-commit extension
# ---------------------------------------------------------------------
# Artifact-page saves used to land in the retired ``ArtifactCommit`` table.
# They now flow through :class:`FileCommitService.record_page_edit`, which
# writes one FileCommit + one FileCommitItem per save with the artifact
# columns populated (title/comments on FileCommit; diff/content_after_*/
# slug_kwd/page_type_kwd on FileCommitItem).
#
# ``file_id`` for these commits is a stable content-hash of ``(kb_id, slug)``
# so per-page history queries can filter on it without a real File row —
# no pseudo-File / virtual-folder machinery is created, so the workspace
# UI stays free of ghost entries.
#
# ``folder_id`` is set to ``kb_id`` directly. The datasets URL prefix
# (``/datasets/<kb_id>/commits``) resolves the entity id to itself for
# this scope; workspace file-commit browsing still uses ``/folders/*`` or
# ``/workspace/*`` with the real folder id.
#
# Content storage for ``content_after`` is switched by a module-level
# constant so ops can move blobs between MinIO and the doc-store index
# without touching the schema.
# Backend selector consulted by ``_store_content_after`` on every save.
ARTIFACT_CONTENT_STORAGE = "minio"  # one of {"minio", "es"}
# Prefix of the object location used for the "minio" backend; the stored
# location is ``<prefix>/<sha256 of content>``.
_ARTIFACT_COMMIT_BUCKET_PREFIX = ".artifact_commits"
# Value written to ``compile_kwd`` on the doc-store row for the "es" backend.
_ARTIFACT_ES_KWD = "artifact_commit_content"


def _artifact_file_id(kb_id: str, slug: str) -> str:
    """Derive the synthetic ``file_id`` shared by all commits of one page.

    The value is the MD5 hex digest of ``"<kb_id>:<slug>"`` (UTF-8), so it
    is always 32 characters long no matter how long the slug is, and the
    same ``(kb_id, slug)`` pair always maps to the same id. It is only a
    grouping key for history queries, not the id of a real File row.
    """
    identity = "{}:{}".format(kb_id, slug)
    digest = hashlib.md5(identity.encode("utf-8"))
    return digest.hexdigest()


def _unified_diff(before: str, after: str, slug: str) -> str:
    """Return a unified diff between two markdown strings, or '' if equal.

    ``None``/empty inputs are treated as the empty string. The diff uses
    three lines of context and ``a/<slug>`` / ``b/<slug>`` as file labels.

    ``difflib.unified_diff`` emits content lines exactly as given, so a
    final line without a trailing newline would otherwise be glued to the
    next diff line when the pieces are joined (e.g. ``-a+b``). Such lines
    are terminated here and followed by the conventional
    ``\\ No newline at end of file`` marker so the result stays a
    well-formed, line-oriented diff.
    """
    old_text = before or ""
    new_text = after or ""
    if old_text == new_text:
        return ""

    pieces = []
    for line in difflib.unified_diff(
        old_text.splitlines(keepends=True),
        new_text.splitlines(keepends=True),
        fromfile=f"a/{slug}",
        tofile=f"b/{slug}",
        n=3,
    ):
        pieces.append(line)
        # Only the last line of either input can lack a line terminator.
        if not line.endswith(("\n", "\r")):
            pieces.append("\n\\ No newline at end of file\n")
    return "".join(pieces)


def _store_content_after(kb_id: str, content: str) -> tuple[str, str]:
    """Write ``content`` to the backend named by :data:`ARTIFACT_CONTENT_STORAGE`.

    The blob is keyed by the SHA-256 of its UTF-8 bytes, so identical
    bodies resolve to the same key.

    Returns:
        ``(storage_kind, location)`` for the commit item's persistence
        columns:

        * ``("minio", "<prefix>/<sha256>")`` for the "minio" backend;
        * ``("es", "<sha256>")`` for the "es" backend;
        * ``("", "")`` when the configured backend is not recognised.

    A failing put/insert is logged and does not raise; the location is
    still returned in that case. For "minio", nothing is written when
    ``settings.STORAGE_IMPL`` is ``None``.
    """
    text = content or ""
    blob = text.encode("utf-8")
    digest = hashlib.sha256(blob).hexdigest()

    if ARTIFACT_CONTENT_STORAGE == "minio":
        location = "{}/{}".format(_ARTIFACT_COMMIT_BUCKET_PREFIX, digest)
        try:
            backend = settings.STORAGE_IMPL
            if backend is not None:
                backend.put(kb_id, location, blob)
        except Exception:
            logging.exception(
                "record_page_edit: MinIO put failed for kb=%s hash=%s",
                kb_id,
                digest,
            )
        return "minio", location

    if ARTIFACT_CONTENT_STORAGE != "es":
        # Unrecognised backend: report empty persistence columns so the
        # read path sees "no location".
        logging.warning(
            "record_page_edit: unknown ARTIFACT_CONTENT_STORAGE=%r; content not persisted",
            ARTIFACT_CONTENT_STORAGE,
        )
        return "", ""

    # "es": one doc-store row in the kb-scoped index, flagged
    # available_int=0.
    from rag.nlp import search as _rag_search

    index = _rag_search.index_name(kb_id)
    row = {
        "id": digest,
        "kb_id": kb_id,
        "doc_id": kb_id,
        "compile_kwd": _ARTIFACT_ES_KWD,
        "content_with_weight": text,
        "available_int": 0,
    }
    try:
        settings.docStoreConn.insert([row], index, kb_id)
    except Exception:
        logging.exception(
            "record_page_edit: ES insert failed for kb=%s hash=%s",
            kb_id,
            digest,
        )
    return "es", digest


def _read_content_after(kb_id: str, storage_kind: str, location: str) -> str:
    """Load an artifact ``content_after`` blob written by the store path.

    Returns ``""`` when ``location`` is empty, when ``storage_kind`` is
    neither "minio" nor "es", when the backend is unavailable or returns
    nothing usable, or when the read raises (the error is logged).
    Bytes from the "minio" backend are decoded as UTF-8 with replacement
    of undecodable sequences.
    """
    if not location:
        return ""

    try:
        if storage_kind == "es":
            from rag.nlp import search as _rag_search

            index = _rag_search.index_name(kb_id)
            doc = settings.docStoreConn.get(location, index, [kb_id])
            if not isinstance(doc, dict):
                return ""
            return doc.get("content_with_weight") or ""

        if storage_kind == "minio":
            backend = settings.STORAGE_IMPL
            if backend is None:
                return ""
            payload = backend.get(kb_id, location)
            if not isinstance(payload, (bytes, bytearray)):
                return str(payload or "")
            return payload.decode("utf-8", errors="replace")
    except Exception:
        logging.exception(
            "get_page_commit: content read failed kb=%s storage=%s loc=%s",
            kb_id,
            storage_kind,
            location,
        )
    return ""


def _get_file_parent_id(file_id):
    """Look up a file's ``parent_id`` from the File table.

    Returns the row's ``parent_id``, or ``None`` when no row matches or
    the lookup raises. Lookup errors are logged instead of being dropped
    silently, so a failing database is visible; the best-effort ``None``
    result is unchanged.
    """
    try:
        row = File.get_or_none(File.id == file_id)
    except Exception:
        logging.exception("_get_file_parent_id: lookup failed for file_id=%s", file_id)
        return None
    return row.parent_id if row else None


def _collect_all_files_under(folder_id):
    """Collect all non-folder files under a folder, including sub-folders.

    Walks the folder tree depth-first (a folder's direct files first, then
    each sub-folder in query order) and returns ``{file_id: File instance}``.

    The walk is iterative and remembers visited folder ids. The previous
    recursive version excluded a self-parented row (``parent_id == id``)
    from the direct-file query but not from the sub-folder query, so a
    self-parented folder recursed into itself until ``RecursionError``,
    which the blanket ``except`` then swallowed. Self-parented rows and
    any other parent cycle are now skipped.

    A query failure for one folder is logged and that folder is skipped;
    files already collected are still returned.
    """
    results = {}
    visited = set()
    pending = [folder_id]

    while pending:
        current = pending.pop()
        if current in visited:
            continue
        visited.add(current)
        try:
            # Direct file children (non-folder) of this folder.
            for f in File.select().where(
                File.parent_id == current,
                File.id != current,
                File.type != "folder",
            ):
                results[f.id] = f
            # Sub-folders to descend into; a folder that is its own parent
            # is excluded so it cannot re-queue itself.
            sub_ids = [
                sub.id
                for sub in File.select().where(
                    File.parent_id == current,
                    File.id != current,
                    File.type == "folder",
                )
            ]
        except Exception:
            logging.exception("_collect_all_files_under: scan failed for folder_id=%s", current)
            continue
        # Reverse so the first sub-folder is popped (and fully walked) first.
        pending.extend(reversed(sub_ids))
    return results


class FileCommitService(CommonService):
    model = FileCommit

    @classmethod
    def create_commit(cls, folder_id, author_id, message, file_changes):
        """Create a new commit for a workspace folder.

        Inserts one FileCommit row chained to the folder's latest commit,
        one FileCommitItem row per entry of ``file_changes``, applies each
        change to the File table, and finally stores the resulting
        ``tree_state`` snapshot (a JSON map keyed by file id) on the commit.
        All database writes happen inside a single ``DB.atomic()`` block.

        Args:
            folder_id: The workspace folder ID
            author_id: The user ID
            message: Commit message
            file_changes: List of dicts:
                [{"file_id": str, "file_name": str, "operation": "add"|"modify"|"delete"|"rename",
                  "content": str (optional, for add/modify), "content_hash": str (optional),
                  "old_name": str, "new_name": str (for rename)}]
                A missing "operation" is treated as "modify". An operation
                outside the four listed values still produces a
                FileCommitItem row but changes neither the File table nor
                the tree state.

        Returns:
            The created FileCommit instance
        """
        commit_id = get_uuid()
        now_ts = current_timestamp()
        now_dt = datetime_format(date_time=datetime.datetime.now())

        with DB.atomic():
            # 1. Get the latest (chain head) commit for this folder
            latest_commit = cls._get_latest_commit(folder_id)

            # 2. Begin creating the commit record
            commit_data = {
                "id": commit_id,
                "folder_id": folder_id,
                "parent_id": latest_commit.id if latest_commit else None,
                "message": message,
                "author_id": author_id,
                "file_count": len(file_changes),
                "create_time": now_ts,
                "create_date": now_dt,
                "update_time": now_ts,
                "update_date": now_dt,
            }

            # 3. Insert commit record (tree_state is filled in at step 5)
            FileCommit(**commit_data).save(force_insert=True)

            # 4. Build new tree state and process each file change.
            #    Start from the parent's snapshot; an unparsable snapshot
            #    is treated as empty.
            tree_state = {}
            if latest_commit and latest_commit.tree_state:
                try:
                    tree_state = json.loads(latest_commit.tree_state)
                except (json.JSONDecodeError, TypeError):
                    tree_state = {}

            # 4a. Backfill parent_id for existing entries that lack it
            for fid, entry in tree_state.items():
                if isinstance(entry, dict) and "parent_id" not in entry:
                    pid = _get_file_parent_id(fid)
                    if pid:
                        entry["parent_id"] = pid

            storage_impl = settings.STORAGE_IMPL

            for change in file_changes:
                op = change.get("operation", "modify")
                file_id = change.get("file_id", "")
                commit_item_id = get_uuid()

                # Fields common to every operation; the branches below add
                # the hash/location or name columns that apply.
                item = {
                    "id": commit_item_id,
                    "commit_id": commit_id,
                    "file_id": file_id,
                    "operation": op,
                    "create_time": now_ts,
                    "create_date": now_dt,
                    "update_time": now_ts,
                    "update_date": now_dt,
                }

                if op == "add":
                    content = change.get("content", "")
                    # str content is UTF-8 encoded; anything else is hashed
                    # and stored as given.
                    # NOTE(review): an explicit ``"content": None`` would
                    # make sha256() raise here — confirm callers never send it.
                    content_bytes = content.encode("utf-8") if isinstance(content, str) else content
                    content_hash = hashlib.sha256(content_bytes).hexdigest()
                    obj_key = f".objects/{content_hash}"

                    # Store blob via content-addressable storage
                    # (skipped entirely when no storage backend is configured)
                    if storage_impl:
                        storage_impl.put(folder_id, obj_key, content_bytes)

                    item["new_hash"] = content_hash
                    item["new_location"] = obj_key

                    # Update file record in DB
                    File.update(
                        {
                            "location": obj_key,
                            "size": len(content_bytes),
                            "update_time": current_timestamp(),
                        }
                    ).where(File.id == file_id).execute()

                    # Update tree state
                    file_parent = _get_file_parent_id(file_id)
                    tree_state[file_id] = {
                        "hash": content_hash,
                        "location": obj_key,
                        "name": change.get("file_name", ""),
                        "size": len(content_bytes),
                        "status": "1",
                        "parent_id": file_parent,
                    }

                elif op == "modify":
                    content = change.get("content", "")
                    content_bytes = content.encode("utf-8") if isinstance(content, str) else content
                    content_hash = hashlib.sha256(content_bytes).hexdigest()
                    obj_key = f".objects/{content_hash}"

                    # Record old hash (only when the file already has an
                    # entry with a hash in the parent snapshot)
                    old_entry = tree_state.get(file_id, {})
                    old_hash = old_entry.get("hash", "")
                    old_location = old_entry.get("location", "")

                    if old_hash:
                        item["old_hash"] = old_hash
                        item["old_location"] = old_location

                    # Store new blob
                    if storage_impl:
                        storage_impl.put(folder_id, obj_key, content_bytes)

                    item["new_hash"] = content_hash
                    item["new_location"] = obj_key

                    # Update file record
                    File.update(
                        {
                            "location": obj_key,
                            "size": len(content_bytes),
                            "update_time": current_timestamp(),
                        }
                    ).where(File.id == file_id).execute()

                    # Update tree state; the name falls back to the one in
                    # the previous snapshot when the change omits file_name.
                    file_parent = _get_file_parent_id(file_id)
                    tree_state[file_id] = {
                        "hash": content_hash,
                        "location": obj_key,
                        "name": change.get("file_name", tree_state.get(file_id, {}).get("name", "")),
                        "size": len(content_bytes),
                        "status": "1",
                        "parent_id": file_parent,
                    }

                elif op == "delete":
                    old_entry = tree_state.get(file_id, {})
                    old_hash = old_entry.get("hash", "")
                    old_location = old_entry.get("location", "")
                    if old_hash:
                        item["old_hash"] = old_hash
                        item["old_location"] = old_location

                    # Soft-delete the file record
                    File.update(status="0", update_time=current_timestamp()).where(File.id == file_id).execute()

                    # The entry is kept in the tree state and only flagged
                    # as deleted (status "0"); it is not removed.
                    if file_id in tree_state:
                        tree_state[file_id]["status"] = "0"

                elif op == "rename":
                    old_name = change.get("old_name", "")
                    new_name = change.get("new_name", "")
                    item["old_name"] = old_name
                    item["new_name"] = new_name

                    # Update the file record name
                    File.update(name=new_name, update_time=current_timestamp()).where(File.id == file_id).execute()

                    # Update tree state
                    if file_id in tree_state:
                        tree_state[file_id]["name"] = new_name

                # Insert commit item
                FileCommitItem(**item).save(force_insert=True)

            # 5. Save the tree state snapshot
            tree_json = json.dumps(tree_state, ensure_ascii=False)
            cls.model.update(tree_state=tree_json).where(cls.model.id == commit_id).execute()

        # Re-read the row so the returned instance includes tree_state.
        _, commit = cls.get_by_id(commit_id)
        return commit

    @classmethod
    def _get_latest_commit(cls, folder_id):
        """Get the latest (chain head) commit for a folder.

        "Latest" is the row with the greatest ``create_time`` among commits
        whose ``folder_id`` matches. Returns ``None`` when the folder has
        no commits or when the query raises.

        A query failure is logged rather than dropped silently: callers
        treat ``None`` as "no previous commit", so an unnoticed failure
        would make them start from an empty history.
        """
        try:
            return (
                cls.model.select()
                .where(cls.model.folder_id == folder_id)
                .order_by(cls.model.create_time.desc())
                .first()
            )
        except Exception:
            logging.exception("_get_latest_commit: query failed for folder_id=%s", folder_id)
            return None

    @classmethod
    @DB.connection_context()
    def list_commits(cls, folder_id, page=1, page_size=15, order_by="create_time", desc=True):
        """Return ``(commits, total)`` for one workspace folder.

        ``total`` counts every commit of the folder. ``commits`` is sorted
        by the model field named ``order_by`` (descending when ``desc`` is
        truthy) and limited to the requested page; when ``page`` or
        ``page_size`` is falsy the full sorted list is returned.
        """
        scoped = cls.model.select().where(cls.model.folder_id == folder_id)
        total = scoped.count()

        sort_field = getattr(cls.model, order_by)
        ordering = sort_field.desc() if desc else sort_field.asc()
        ordered = scoped.order_by(ordering)

        if not (page and page_size):
            return list(ordered), total

        skip = (page - 1) * page_size
        return list(ordered.offset(skip).limit(page_size)), total

    @classmethod
    @DB.connection_context()
    def get_commit(cls, commit_id):
        """Fetch one commit by id; ``None`` when the lookup does not succeed."""
        found, row = cls.get_by_id(commit_id)
        if not found:
            return None
        return row

    @classmethod
    @DB.connection_context()
    def list_commit_files(cls, commit_id):
        """Return every FileCommitItem row belonging to ``commit_id`` as a list."""
        return list(FileCommitItem.select().where(FileCommitItem.commit_id == commit_id))

    @classmethod
    @DB.connection_context()
    def diff_commits(cls, from_id, to_id):
        """Describe how the file inventory changed between two commits.

        The comparison is made on the commits' ``tree_state`` snapshots.
        A missing commit or an unparsable snapshot counts as an empty tree.
        FileCommitItem rows of the two commits are consulted only to fill
        in a hash or location that the snapshot entry lacks.

        For a file present on both sides the reported operation is
        "modify" by default, "delete"/"add" when the status flag changed,
        and "rename" whenever the name changed (a rename takes precedence
        over a status change).

        Returns a list of dicts, ordered by file id, with the keys
        ``file_id, file_name, operation, old_hash, old_location,
        new_hash, new_location``.
        """

        def _parse_tree(commit):
            # Snapshot as a mapping, or {} when absent/unparsable.
            if commit and commit.tree_state:
                try:
                    return json.loads(commit.tree_state)
                except Exception:
                    pass
            return {}

        def _items_by_file(commit_id):
            # file_id -> FileCommitItem for one commit; best effort.
            by_file = {}
            try:
                for row in FileCommitItem.select().where(FileCommitItem.commit_id == commit_id):
                    by_file[row.file_id] = row
            except Exception:
                pass
            return by_file

        def _field(entry, key, default):
            return entry.get(key, default) if isinstance(entry, dict) else default

        def _location(entry):
            return entry.get("location", "") if isinstance(entry, dict) else None

        _, from_commit = cls.get_by_id(from_id)
        _, to_commit = cls.get_by_id(to_id)
        from_tree = _parse_tree(from_commit)
        to_tree = _parse_tree(to_commit)

        from_items = _items_by_file(from_id)
        to_items = _items_by_file(to_id)

        changes = []
        for fid in sorted(set(from_tree.keys()) | set(to_tree.keys())):
            old = from_tree.get(fid)
            new = to_tree.get(fid)
            old_item = from_items.get(fid)
            new_item = to_items.get(fid)

            old_hash = _field(old, "hash", "")
            new_hash = _field(new, "hash", "")
            old_status = _field(old, "status", "1")
            new_status = _field(new, "status", "1")
            old_name = _field(old, "name", "")
            new_name = _field(new, "name", "")

            if old is not None and new is None:
                # Only in the "from" snapshot -> deleted.
                changes.append(
                    {
                        "file_id": fid,
                        "file_name": old_name,
                        "operation": "delete",
                        "old_hash": old_hash or (old_item.new_hash if old_item else None),
                        "old_location": _location(old),
                        "new_hash": None,
                        "new_location": None,
                    }
                )
                continue

            if old is None and new is not None:
                # Only in the "to" snapshot -> added.
                changes.append(
                    {
                        "file_id": fid,
                        "file_name": new_name,
                        "operation": "add",
                        "old_hash": None,
                        "old_location": None,
                        "new_hash": new_hash or (new_item.new_hash if new_item else None),
                        "new_location": _location(new),
                    }
                )
                continue

            # Present on both sides: report only if something differs.
            if not (old_hash != new_hash or old_status != new_status or old_name != new_name):
                continue

            operation = "modify"
            if old_status != new_status:
                operation = "delete" if new_status == "0" else "add"
            if old_name != new_name:
                operation = "rename"

            changes.append(
                {
                    "file_id": fid,
                    "file_name": new_name or old_name,
                    "operation": operation,
                    "old_hash": old_hash or (old_item.new_hash if old_item else None),
                    "old_location": _location(old) or (old_item.new_location if old_item else None),
                    "new_hash": new_hash or (new_item.new_hash if new_item else None),
                    "new_location": _location(new) or (new_item.new_location if new_item else None),
                }
            )

        return changes

    @classmethod
    @DB.connection_context()
    def get_uncommitted_changes(cls, folder_id):
        """Get uncommitted changes by comparing current File table with latest commit.

        Recursively scans all sub-folders under folder_id.

        * "modify": the file is committed and live, and its current hash
          is non-empty and differs from the committed hash.
        * "delete": the file is committed (not flagged deleted), is not
          under the folder any more, and no File row with that id exists.
        * "add": the file is live but has no usable committed entry.

        A snapshot that is unparsable or not a JSON object is treated as
        empty, and snapshot entries that are not dicts are ignored (the
        live file, if any, is then reported as "add"). Previously either
        case raised ``AttributeError``, although the other readers of
        ``tree_state`` in this module guard with ``isinstance(entry, dict)``.

        Returns list of dicts: [{"file_id", "file_name", "operation": "add"|"modify"|"delete"}]
        """
        # Get latest commit's tree state
        latest = cls._get_latest_commit(folder_id)
        committed_files = {}
        if latest and latest.tree_state:
            try:
                committed_files = json.loads(latest.tree_state)
            except Exception:
                logging.exception(
                    "get_uncommitted_changes: unparsable tree_state for folder_id=%s",
                    folder_id,
                )
        if not isinstance(committed_files, dict):
            committed_files = {}

        # Get all current (live) files recursively under this folder
        current_files = _collect_all_files_under(folder_id)

        changes = []
        processed = set()

        # Check for modified and deleted files
        for fid, committed_entry in committed_files.items():
            if not isinstance(committed_entry, dict):
                # Malformed entry: not counted as committed.
                continue
            processed.add(fid)
            if committed_entry.get("status") == "0":
                continue

            if fid in current_files:
                live_hash = _compute_file_hash(folder_id, fid)
                committed_hash = committed_entry.get("hash", "")
                if live_hash and live_hash != committed_hash:
                    changes.append(
                        {
                            "file_id": fid,
                            "file_name": committed_entry.get("name", ""),
                            "operation": "modify",
                        }
                    )
            elif FileService.get_or_none(id=fid) is None:
                changes.append(
                    {
                        "file_id": fid,
                        "file_name": committed_entry.get("name", ""),
                        "operation": "delete",
                    }
                )

        # Check for newly added files
        for fid, live_file in current_files.items():
            if fid not in processed:
                changes.append(
                    {
                        "file_id": fid,
                        "file_name": live_file.name,
                        "operation": "add",
                    }
                )

        return changes

    @classmethod
    @DB.connection_context()
    def get_commit_tree(cls, commit_id):
        """Return a commit's ``tree_state`` snapshot as a hierarchical tree.

        Yields ``{}`` when the commit is not found, has no snapshot, or
        the snapshot cannot be parsed as JSON.
        """
        found, commit = cls.get_by_id(commit_id)
        if not (found and commit.tree_state):
            return {}
        try:
            flat_state = json.loads(commit.tree_state)
        except Exception:
            return {}
        else:
            return _build_hierarchical_tree(flat_state, commit.folder_id)

    @classmethod
    @DB.connection_context()
    def get_commit_file_content(cls, folder_id, commit_id, file_id):
        """Return a file's stored content as of ``commit_id``.

        Lookup order:

        1. The commit's own ``tree_state`` entry for ``file_id``; when it
           carries a hash, the blob ``.objects/<hash>`` is read from
           storage. Any error in this step is ignored and the lookup
           falls through to step 2.
        2. Starting at ``commit_id`` and following ``parent_id`` links,
           the first FileCommitItem for ``file_id`` that has a
           ``new_hash`` decides which blob is read.

        Returns whatever the storage backend's ``get`` returns, or
        ``None`` when the commit is unknown, no hash can be resolved, or
        no storage backend is configured.
        """
        found, commit = cls.get_by_id(commit_id)
        if not found:
            return None

        # Step 1: full snapshot recorded on this commit.
        if commit.tree_state:
            try:
                snapshot_entry = json.loads(commit.tree_state).get(file_id)
                if isinstance(snapshot_entry, dict):
                    snapshot_hash = snapshot_entry.get("hash")
                    if snapshot_hash:
                        backend = settings.STORAGE_IMPL
                        if backend:
                            return backend.get(folder_id, f".objects/{snapshot_hash}")
            except Exception:
                pass

        # Step 2: walk the commit chain; ``seen`` guards against a
        # parent_id loop.
        seen = set()
        cursor = commit_id
        while cursor and cursor not in seen:
            seen.add(cursor)
            match = (
                FileCommitItem.select()
                .where(
                    FileCommitItem.commit_id == cursor,
                    FileCommitItem.file_id == file_id,
                )
                .first()
            )
            if match and match.new_hash:
                backend = settings.STORAGE_IMPL
                if backend:
                    return backend.get(folder_id, f".objects/{match.new_hash}")

            # Step to the parent of the commit just inspected.
            inspected = cls.get_commit(cursor)
            if not (inspected and inspected.parent_id):
                break
            cursor = inspected.parent_id

        return None

    # ------------------------------------------------------------------
    # Artifact-page commit surface
    # ------------------------------------------------------------------

    @classmethod
    def record_page_edit(
        cls,
        *,
        tenant_id: str,
        kb_id: str,
        page_type: str,
        slug: str,
        content_before: str,
        content_after: str,
        title: Optional[str] = None,
        comments: Optional[str] = None,
        user_id: Optional[str] = None,
    ) -> Optional[str]:
        """Persist one artifact-page edit as a FileCommit + FileCommitItem.

        Bypasses :func:`create_commit` because artifact commits have no
        real ``File`` row backing them and don't participate in the
        workspace ``tree_state`` snapshot chain.

        Args:
            tenant_id: Accepted for interface compatibility; not used here.
            kb_id: Knowledge-base id; stored as the commit's ``folder_id``.
            page_type: Stored in ``page_type_kwd`` on the commit item.
            slug: Page slug; stored in ``slug_kwd`` and used, with
                ``kb_id``, to derive the synthetic ``file_id``.
            content_before: Page body before the save ("" / None for a
                new page, which is recorded as operation "add").
            content_after: Page body after the save.
            title: Optional commit title. When blank, a
                ``"<YYYYmmddHHMMSS> <slug>"`` title is generated.
            comments: Optional free-text comment.
            user_id: Optional author id.

        Returns:
            The new commit id, or ``None`` when the diff is empty (no-op
            save — skipped per the documented v1 contract) or when
            inserting the rows fails (the failure is logged).
        """
        diff_text = _unified_diff(content_before or "", content_after or "", slug)
        if not diff_text:
            return None

        # One clock reading feeds both the fallback title and create_date.
        now = datetime.datetime.now()
        fallback_title = f"{now.strftime('%Y%m%d%H%M%S')} {slug}"
        # The previous f-string ended in a stray space, so every stored
        # title/message carried a trailing blank.
        final_title = (title or "").strip() or fallback_title

        commit_id = get_uuid()
        item_id = get_uuid()
        file_id = _artifact_file_id(kb_id, slug)
        now_ts = current_timestamp()
        now_dt = datetime_format(date_time=now)

        # Persist the post-save markdown per the configured storage.
        # A failure here logs but doesn't block the commit row — the diff
        # is still meaningful without content_after.
        storage_kind, location = _store_content_after(kb_id, content_after or "")

        # Chain to the most recent earlier commit of this page, if any.
        parent = (
            FileCommit.select(FileCommit.id)
            .join(
                FileCommitItem,
                on=(FileCommitItem.commit_id == FileCommit.id),
            )
            .where((FileCommit.folder_id == kb_id) & (FileCommitItem.file_id == file_id))
            .order_by(FileCommit.create_time.desc())
            .first()
        )
        parent_id = parent.id if parent else None

        try:
            with DB.atomic():
                FileCommit(
                    id=commit_id,
                    folder_id=kb_id,
                    parent_id=parent_id,
                    # ``message`` stays populated with the same string as
                    # ``title`` so any generic file-commit consumer still
                    # renders something sensible.
                    message=final_title[:512],
                    author_id=user_id or "",
                    file_count=1,
                    tree_state=None,
                    title=final_title[:255],
                    comments=comments or "",
                    create_time=now_ts,
                    create_date=now_dt,
                    update_time=now_ts,
                    update_date=now_dt,
                ).save(force_insert=True)

                FileCommitItem(
                    id=item_id,
                    commit_id=commit_id,
                    file_id=file_id,
                    operation="modify" if content_before else "add",
                    diff=diff_text,
                    content_after_storage=storage_kind or None,
                    content_after_location=location or None,
                    slug_kwd=slug,
                    page_type_kwd=page_type,
                    create_time=now_ts,
                    create_date=now_dt,
                    update_time=now_ts,
                    update_date=now_dt,
                ).save(force_insert=True)
        except Exception:
            logging.exception(
                "record_page_edit: insert failed for kb=%s slug=%s",
                kb_id,
                slug,
            )
            return None

        return commit_id

    @classmethod
    @DB.connection_context()
    def list_page_commits(
        cls,
        tenant_id: str,
        kb_id: str,
        slug: str,
        page: int = 1,
        page_size: int = 50,
    ) -> tuple[int, list[dict]]:
        """List one artifact page's commit history, newest first.

        ``page`` is clamped to at least 1 and ``page_size`` to the range
        1..200. Commits are matched on ``folder_id == kb_id`` together
        with the item's synthetic ``file_id`` and ``slug_kwd``.

        Returns ``(total, rows)`` where each row is a dict with ``id``,
        ``title``, ``comments``, ``create_time``, ``create_date``,
        ``user_id`` (the commit's ``author_id``) and ``user_nickname``
        ("" when unknown or when the nickname lookup fails). The diff and
        stored content are not part of the listing.
        """
        page_no = max(int(page or 1), 1)
        per_page = max(min(int(page_size or 50), 200), 1)
        page_file_id = _artifact_file_id(kb_id, slug)

        light_columns = (
            FileCommit.id,
            FileCommit.title,
            FileCommit.comments,
            FileCommit.author_id,
            FileCommit.create_time,
            FileCommit.create_date,
        )
        history = (
            FileCommit.select(*light_columns)
            .join(FileCommitItem, on=(FileCommitItem.commit_id == FileCommit.id))
            .where((FileCommit.folder_id == kb_id) & (FileCommitItem.file_id == page_file_id) & (FileCommitItem.slug_kwd == slug))
        )
        total = history.count()
        newest_first = history.order_by(FileCommit.create_time.desc())
        rows = list(newest_first.paginate(page_no, per_page).dicts())

        # Expose the author under the response key ``user_id``.
        for row in rows:
            row["user_id"] = row.pop("author_id", None)

        author_ids = {row["user_id"] for row in rows if row.get("user_id")}
        nicknames: dict[str, str] = {}
        if author_ids:
            try:
                user_query = User.select(User.id, User.nickname).where(User.id.in_(list(author_ids)))
                for user in user_query.dicts():
                    nicknames[user["id"]] = user.get("nickname") or ""
            except Exception:
                logging.exception(
                    "list_page_commits: nickname lookup failed",
                )

        for row in rows:
            row["user_nickname"] = nicknames.get(row.get("user_id") or "", "")
        return total, rows

    @classmethod
    @DB.connection_context()
    def get_page_commit_detail(
        cls,
        tenant_id: str,
        kb_id: str,
        commit_id: str,
    ) -> Optional[dict]:
        """Return one artifact commit including ``diff`` and ``content_after``.

        The commit is looked up by ``commit_id`` AND ``folder_id == kb_id``,
        so a commit id belonging to a different knowledge base is not
        returned. ``tenant_id`` is only echoed back in the result; it is
        not part of the lookup.
        NOTE(review): tenant isolation therefore relies on the caller
        having authorised ``kb_id`` for this tenant — confirm at call sites.

        ``content_after`` is resolved from the storage recorded on the
        commit item and is ``""`` when it cannot be read.

        Returns ``None`` when the commit, or its commit item, is not found.
        """
        commit = FileCommit.get_or_none(
            (FileCommit.id == commit_id) & (FileCommit.folder_id == kb_id),
        )
        if commit is None:
            return None
        item = FileCommitItem.get_or_none(FileCommitItem.commit_id == commit_id)
        if item is None:
            return None

        content_after = _read_content_after(
            kb_id,
            item.content_after_storage or "",
            item.content_after_location or "",
        )

        nickname = ""
        if commit.author_id:
            try:
                u = User.get_or_none(User.id == commit.author_id)
                if u is not None:
                    nickname = u.nickname or ""
            except Exception:
                # Best effort, but logged like the listing path does
                # rather than dropped silently.
                logging.exception(
                    "get_page_commit_detail: nickname lookup failed",
                )

        return {
            "id": commit.id,
            "tenant_id": tenant_id,
            "kb_id": kb_id,
            "page_type_kwd": item.page_type_kwd,
            "slug": item.slug_kwd,
            "user_id": commit.author_id or None,
            "user_nickname": nickname,
            "title": commit.title,
            "comments": commit.comments,
            "diff": item.diff,
            "content_after": content_after,
            "create_time": commit.create_time,
            "create_date": commit.create_date,
        }

    @classmethod
    @DB.connection_context()
    def get_file_version_history(cls, file_id):
        """Get version history for a specific file across all commits.

        Items are ordered by their ``create_time``, newest first. An item
        whose commit row no longer exists is omitted. ``hash`` is the
        item's ``new_hash``, falling back to ``old_hash`` and then "".

        The commit messages are fetched with a single ``IN`` query
        instead of one lookup per item.
        NOTE(review): this assumes ``get_by_id`` (used previously via
        ``get_commit``) is a plain primary-key lookup on ``cls.model``.

        Returns list of dicts: [{"commit_id", "operation", "hash", "create_time", "message"}]
        """
        items = list(
            FileCommitItem.select()
            .where(FileCommitItem.file_id == file_id)
            .order_by(FileCommitItem.create_time.desc())
        )
        if not items:
            return []

        commit_ids = {item.commit_id for item in items}
        message_by_commit = {
            commit.id: commit.message
            for commit in cls.model.select(cls.model.id, cls.model.message).where(
                cls.model.id.in_(list(commit_ids))
            )
        }

        return [
            {
                "commit_id": item.commit_id,
                "operation": item.operation,
                "hash": item.new_hash or item.old_hash or "",
                "create_time": item.create_time,
                "message": message_by_commit[item.commit_id],
            }
            for item in items
            if item.commit_id in message_by_commit
        ]


def _lookup_folder_name(folder_id):
    """Look up a folder's display name from the File table.

    This is a best-effort lookup: it never raises.

    Args:
        folder_id: Id of the File row that represents the folder.

    Returns:
        The row's ``name`` when a matching File row exists; otherwise
        ``folder_id`` itself (also when the query fails).
    """
    try:
        row = File.get_or_none(File.id == folder_id)
    except Exception:
        # Keep the fallback behaviour, but leave a trace so database problems
        # are not silently mistaken for "folder has no row".
        logger.warning("Failed to look up name for folder %s", folder_id, exc_info=True)
        return folder_id
    return row.name if row else folder_id


def _build_hierarchical_tree(tree_state, root_folder_id):
    """Build a recursive tree from a flat tree_state map.

    Args:
        tree_state: Mapping of file id -> entry dict. The keys read from an
            entry are ``parent_id``, ``name``, ``hash``, ``size``, ``status``
            and ``location``. Values that are not dicts are ignored. Entries
            without a ``parent_id`` are attached to the root folder.
        root_folder_id: Id of the folder used as the root of the tree.

    Returns:
        ``{id, name, type: "folder", children: [{file|folder nodes}]}``.
        File nodes carry ``id``, ``name``, ``type: "file"``, ``hash``,
        ``size``, ``status`` and, when present, ``location``.

    Sub-folder hierarchy is resolved from the File table's parent_id. The
    lookup walks up through ancestor folders, so a file nested below a folder
    that holds no files of its own (root/A/B/file) still appears in the tree.
    Folders whose ancestor chain never reaches ``root_folder_id`` are not
    included.
    """
    # Group file entries by the folder that directly contains them.
    files_by_parent = {}
    for fid, entry in tree_state.items():
        if not isinstance(entry, dict):
            continue
        pid = entry.get("parent_id") or root_folder_id
        files_by_parent.setdefault(pid, []).append((fid, entry))

    # Build a map of folder_id -> parent_folder_id from the File table.
    # Start from the folders that hold files and climb level by level through
    # their ancestors. ``visited`` stops the climb at the root and protects
    # against self-referencing or cyclic parent links.
    folder_parent_map = {}
    visited = {root_folder_id}
    frontier = list(files_by_parent)
    while frontier:
        next_frontier = []
        for folder_id in frontier:
            if folder_id in visited:
                continue
            visited.add(folder_id)
            try:
                row = File.get_or_none(File.id == folder_id)
            except Exception:
                # Best effort: the folder is left out of the tree, but the
                # failure is recorded instead of being silently dropped.
                logger.warning("Failed to resolve parent of folder %s", folder_id, exc_info=True)
                continue
            if not row:
                continue
            folder_parent_map[folder_id] = row.parent_id
            if row.parent_id and row.parent_id not in visited:
                next_frontier.append(row.parent_id)
        frontier = next_frontier

    # Group sub-folder IDs by their parent folder
    children_by_folder = {}
    for sub_id, parent_id in folder_parent_map.items():
        children_by_folder.setdefault(parent_id, []).append(sub_id)

    def _build_node(node_id):
        node = {
            "id": node_id,
            "name": _lookup_folder_name(node_id),
            "type": "folder",
            "children": [],
        }
        # File children
        for fid, entry in files_by_parent.get(node_id, []):
            file_node = {
                "id": fid,
                "name": entry.get("name", fid),
                "type": "file",
                "hash": entry.get("hash", ""),
                "size": entry.get("size", 0),
                "status": entry.get("status", "1"),
            }
            if entry.get("location"):
                file_node["location"] = entry["location"]
            node["children"].append(file_node)
        # Sub-folder children (resolved from File table). The root is never a
        # key of folder_parent_map, so this recursion cannot loop back.
        for sub_id in children_by_folder.get(node_id, []):
            node["children"].append(_build_node(sub_id))
        return node

    return _build_node(root_folder_id)


def _compute_file_hash(folder_id, file_id):
    """Compute SHA256 hash of current file content from storage.

    Args:
        folder_id: Passed as the first argument to the storage backend's
            ``get`` call (presumably the bucket/namespace; confirm against
            the storage implementation).
        file_id: Id of the File record whose ``location`` names the stored
            object.

    Returns:
        The hex digest of the stored bytes, or ``None`` when the file record
        is missing, has no ``location``, no storage backend is configured,
        the backend returns no (or empty) data, or any step raises.
    """
    try:
        lookup = FileService.get_by_id(file_id)
        if not lookup[0]:
            return None
        file_row = lookup[1]
        if not file_row.location:
            return None

        storage = settings.STORAGE_IMPL
        if not storage:
            return None

        data = storage.get(folder_id, file_row.location)
        return hashlib.sha256(data).hexdigest() if data else None
    except Exception:
        # Callers treat None as "hash unavailable"; keep that contract but log
        # the cause so storage/DB failures can be told apart from empty files.
        logger.warning("Failed to compute hash for file %s in folder %s", file_id, folder_id, exc_info=True)
        return None