From 48755a33523f4ecd30c587a8ac2014c33e2e9f4d Mon Sep 17 00:00:00 2001
From: Yao Wei <251109226@qq.com>
Date: Tue, 3 Mar 2026 14:53:46 +0800
Subject: [PATCH] Fix: (resume) Cross-verify project experience and work
 experience, and remove duplicate text (#13323)

Cross-verify project experience and work experience, and remove
duplicate text

---------

Co-authored-by: Aron.Yao <yaowei@192.168.1.68>
Co-authored-by: Aron.Yao <yaowei@yaoweideMacBook-Pro.local>
---
 rag/app/resume.py | 177 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 177 insertions(+)

diff --git a/rag/app/resume.py b/rag/app/resume.py
index a6cf11eab5..f9f885e126 100644
--- a/rag/app/resume.py
+++ b/rag/app/resume.py
@@ -1892,7 +1892,142 @@ def _postprocess_resume(resume: dict, lines: list[str], lang: str = "Chinese") -
                     seen.add(item_str)
                     deduped.append(item_str)
             resume[list_field] = deduped
+    # --- Phase 3.4: work_desc_tks dedup by company name + time period ---
+    # LLM often extracts the same company's content twice: once from the "Work Experience"
+    # section and once from the "Project Experience" section, producing two entries for one job.
+    # These have different descriptions (daily work vs project details), so content-based
+    # Jaccard dedup cannot catch them. Instead, we detect duplicate companies by checking
+    # if one company name is a substring of another AND their time periods overlap.
+    # This also fixes the inflated work_exp_flt (e.g. 25.5 years instead of ~14).
+    work_descs = resume.get("work_desc_tks", [])
+    if len(work_descs) > 1:
+        corp_names = resume.get("corp_nm_tks", [])
+        work_details = resume.get("_work_exp_details", [])
+        positions = resume.get("position_name_tks", [])
+        kept_indices = []
+        for i in range(len(work_descs)):
+            is_dup = False
+            corp_i = _normalize_for_comparison(corp_names[i]) if i < len(corp_names) else ""
+            detail_i = work_details[i] if i < len(work_details) else {}
+            start_i = detail_i.get("start_date", "")
+            end_i = detail_i.get("end_date", "")
+            # Parse dates for entry i once (reused across inner loop)
+            dt_start_i = _parse_date_str(start_i) if start_i else None
+            dt_end_i = _parse_date_str(end_i) if end_i else None
+            for j in kept_indices:
+                # Strategy A: company name substring + time period overlap
+                corp_j = _normalize_for_comparison(corp_names[j]) if j < len(corp_names) else ""
+                if corp_i and corp_j:
+                    shorter_c, longer_c = (corp_i, corp_j) if len(corp_i) <= len(corp_j) else (corp_j, corp_i)
+                    if shorter_c in longer_c:
+                        # Check time period overlap using parsed dates
+                        # Two intervals [s1,e1] and [s2,e2] overlap iff s1 <= e2 and s2 <= e1
+                        # Use <= because resume dates are month-granularity (e.g. "2018.03" means "sometime in March 2018")
+                        detail_j = work_details[j] if j < len(work_details) else {}
+                        start_j = detail_j.get("start_date", "")
+                        end_j = detail_j.get("end_date", "")
+                        dt_start_j = _parse_date_str(start_j) if start_j else None
+                        dt_end_j = _parse_date_str(end_j) if end_j else None
+                        # Need a valid parsed start date on each side to compare
+                        if dt_start_i and dt_start_j:
+                            # Use far-future as default end if missing
+                            eff_end_i = dt_end_i or datetime.datetime(2099, 12, 1)
+                            eff_end_j = dt_end_j or datetime.datetime(2099, 12, 1)
+                            if dt_start_i <= eff_end_j and dt_start_j <= eff_end_i:
+                                is_dup = True
+                                break
+                        elif (start_i and start_j and start_i == start_j) or \
+                                (end_i and end_j and end_i == end_j):
+                            # Fallback: exact string match if date parsing fails
+                            is_dup = True
+                            break
+                # Strategy B: content-based substring containment or Jaccard similarity (fallback)
+                norm_i = _normalize_for_comparison(work_descs[i])
+                norm_j = _normalize_for_comparison(work_descs[j])
+                shorter, longer = (norm_i, norm_j) if len(norm_i) <= len(norm_j) else (norm_j, norm_i)
+                if shorter and longer and shorter in longer:
+                    is_dup = True
+                    break
+                jac = _shingling_jaccard(work_descs[i], work_descs[j], n=5)
+                if jac > 0.5:
+                    is_dup = True
+                    break
+            if is_dup:
+                dup_corp = corp_names[i] if i < len(corp_names) else f"#{i+1}"
+                logger.debug(f"Work desc internal duplicate removed: {dup_corp}")
+            else:
+                kept_indices.append(i)
+        # Only update when entries were actually removed
+        if len(kept_indices) < len(work_descs):
+            resume["work_desc_tks"] = [work_descs[i] for i in kept_indices]
+            if corp_names:
+                resume["corp_nm_tks"] = [corp_names[i] for i in kept_indices if i < len(corp_names)]
+            if work_details:
+                resume["_work_exp_details"] = [work_details[i] for i in kept_indices if i < len(work_details)]
+            if positions:
+                resume["position_name_tks"] = [positions[i] for i in kept_indices if i < len(positions)]
+            # Recalculate work years based on deduplicated entries
+            new_details = resume.get("_work_exp_details", [])
+            if new_details:
+                recalc_years = sum(d.get("years", 0) for d in new_details)
+                recalc_years = round(recalc_years, 1)
+                if recalc_years > 0:
+                    resume["work_exp_flt"] = recalc_years
+                    logger.info(f"Work years recalculated: {recalc_years} yrs (before dedup: {_calculate_work_years([{'start_date': d.get('start_date',''), 'end_date': d.get('end_date','')} for d in work_details])} yrs)")
+            new_corps = resume.get("corp_nm_tks", [])
+            if new_corps:
+                resume["corporation_name_tks"] = new_corps[0]
 
+    # --- Phase 3.5: Merge project_desc_tks into work_desc_tks ---
+    # Instead of complex cross-dedup, we simply merge unique project descriptions into
+    # work_desc_tks and clear project_desc_tks. This avoids the problem where LLM extracts
+    # the same content into both fields with slightly different wording.
+    # After merge, project_desc_tks is emptied so _build_chunk_document won't generate
+    # duplicate chunks. Project names are preserved in project_tks for reference.
+    work_descs = resume.get("work_desc_tks", [])
+    project_descs = resume.get("project_desc_tks", [])
+    # Save pre-merge project descriptions for debugging
+    resume["_raw_project_descs"] = list(project_descs) if project_descs else []
+    if project_descs:
+        project_names = resume.get("project_tks", [])
+        merged_count = 0
+        skipped_count = 0
+        for i, proj_desc in enumerate(project_descs):
+            norm_proj = _normalize_for_comparison(proj_desc)
+            if not norm_proj:
+                continue
+            # Check if this project desc already exists in work_descs (exact or near-duplicate)
+            already_exists = False
+            for wd in work_descs:
+                norm_wd = _normalize_for_comparison(wd)
+                if not norm_wd:
+                    continue
+                # Substring containment check
+                shorter, longer = (norm_proj, norm_wd) if len(norm_proj) <= len(norm_wd) else (norm_wd, norm_proj)
+                if shorter in longer:
+                    already_exists = True
+                    break
+                # Jaccard similarity check
+                if _shingling_jaccard(proj_desc, wd, n=5) > 0.5:
+                    already_exists = True
+                    break
+            if already_exists:
+                skipped_count += 1
+                proj_name = project_names[i] if i < len(project_names) else f"#{i+1}"
+                logger.debug(f"Project desc already in work_desc, skipped: {proj_name}")
+            else:
+                # Append to work_desc_tks with project name prefix for context
+                proj_name = project_names[i] if i < len(project_names) else ""
+                if proj_name:
+                    proj_desc_with_prefix = f"[{proj_name}] {proj_desc}"
+                else:
+                    proj_desc_with_prefix = proj_desc
+                work_descs.append(proj_desc_with_prefix)
+                merged_count += 1
+        resume["work_desc_tks"] = work_descs
+        # Clear project_desc_tks — all content is now in work_desc_tks
+        resume["project_desc_tks"] = []
+        logger.info(f"Merged project descs into work_desc_tks: {merged_count} merged, {skipped_count} skipped (duplicate)")
     # --- Phase 4: Field completion ---
     required_fields = [
         "name_kwd", "gender_kwd", "phone_kwd", "email_tks",
@@ -2561,3 +2696,45 @@ def _layout_detect_reorder(blocks: list[dict], binary: bytes) -> list[dict]:
     except Exception as e:
         logger.warning(f"Layout detector unavailable, falling back to heuristic sorting: {e}")
         return _layout_aware_reorder(blocks)
+
+
+
+def _text_shingles(text: str, n: int = 5) -> set[tuple[int, ...]]:
+    """
+    Build the set of n-gram shingles over the tiktoken BPE token ids of a text.
+
+    Each shingle is a window of n consecutive token ids, so token order is kept.
+
+    Args:
+        text: Text to fingerprint
+        n: Number of consecutive tokens per shingle, default 5
+    Returns:
+        Set of shingles (tuples of token ids); empty if text or encoding is missing
+    """
+    encoder = _tiktoken_encoding
+    if encoder is None or not text:
+        return set()
+    token_ids = encoder.encode(text)
+    if len(token_ids) >= n:
+        return {tuple(token_ids[k:k + n]) for k in range(len(token_ids) - n + 1)}
+    # Fewer than n tokens: the whole sequence is the only shingle (none if empty)
+    return {tuple(token_ids)} if token_ids else set()
+
+
+def _shingling_jaccard(text1: str, text2: str, n: int = 5) -> float:
+    """
+    Compute Jaccard similarity between two texts using tiktoken shingling.
+
+    Args:
+        text1: First text
+        text2: Second text
+        n: Shingling window size
+    Returns:
+        Jaccard similarity [0.0, 1.0]; if neither text yields shingles, 1.0 only when equal
+    """
+    s1, s2 = _text_shingles(text1, n=n), _text_shingles(text2, n=n)
+    union = s1 | s2
+    if not union:
+        # No shingles at all (empty texts or no tokenizer): only identical texts match
+        return 1.0 if text1 == text2 else 0.0
+    return len(s1 & s2) / len(union)