From 48755a33523f4ecd30c587a8ac2014c33e2e9f4d Mon Sep 17 00:00:00 2001 From: Yao Wei <251109226@qq.com> Date: Tue, 3 Mar 2026 14:53:46 +0800 Subject: [PATCH] Fix: (resume) Cross-verify project experience and work experience, and remove duplicate text (#13323) Cross-verify project experience and work experience, and remove duplicate text --------- Co-authored-by: Aron.Yao Co-authored-by: Aron.Yao --- rag/app/resume.py | 177 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) diff --git a/rag/app/resume.py b/rag/app/resume.py index a6cf11eab5..f9f885e126 100644 --- a/rag/app/resume.py +++ b/rag/app/resume.py @@ -1892,7 +1892,142 @@ def _postprocess_resume(resume: dict, lines: list[str], lang: str = "Chinese") - seen.add(item_str) deduped.append(item_str) resume[list_field] = deduped + # --- Phase 3.4: work_desc_tks dedup by company name + time period --- + # LLM often extracts the same company's content twice: once from the "Work Experience" + # section and once from the "Project Experience" section, producing entries like + # These have different descriptions (daily work vs project details), so content-based + # Jaccard dedup cannot catch them. Instead, we detect duplicate companies by checking + # if one company name is a substring of another AND their time periods overlap. + # This also fixes the inflated work_exp_flt (e.g. 25.5 years instead of ~14). 
+ work_descs = resume.get("work_desc_tks", []) + if len(work_descs) > 1: + corp_names = resume.get("corp_nm_tks", []) + work_details = resume.get("_work_exp_details", []) + positions = resume.get("position_name_tks", []) + kept_indices = [] + for i in range(len(work_descs)): + is_dup = False + corp_i = _normalize_for_comparison(corp_names[i]) if i < len(corp_names) else "" + detail_i = work_details[i] if i < len(work_details) else {} + start_i = detail_i.get("start_date", "") + end_i = detail_i.get("end_date", "") + # Parse dates for entry i once (reused across inner loop) + dt_start_i = _parse_date_str(start_i) if start_i else None + dt_end_i = _parse_date_str(end_i) if end_i else None + for j in kept_indices: + # Strategy A: company name substring + time period overlap + corp_j = _normalize_for_comparison(corp_names[j]) if j < len(corp_names) else "" + if corp_i and corp_j: + shorter_c, longer_c = (corp_i, corp_j) if len(corp_i) <= len(corp_j) else (corp_j, corp_i) + if shorter_c in longer_c: + # Check time period overlap using parsed dates + # Two intervals [s1,e1] and [s2,e2] overlap iff s1 <= e2 and s2 <= e1 + # Use <= because resume dates are month-granularity (e.g. 
"2018.03" means "sometime in March 2018") + detail_j = work_details[j] if j < len(work_details) else {} + start_j = detail_j.get("start_date", "") + end_j = detail_j.get("end_date", "") + dt_start_j = _parse_date_str(start_j) if start_j else None + dt_end_j = _parse_date_str(end_j) if end_j else None + # Need at least one valid date on each side to compare + if dt_start_i and dt_start_j: + # Use far-future as default end if missing + eff_end_i = dt_end_i or datetime.datetime(2099, 12, 1) + eff_end_j = dt_end_j or datetime.datetime(2099, 12, 1) + if dt_start_i <= eff_end_j and dt_start_j <= eff_end_i: + is_dup = True + break + elif (start_i and start_j and start_i == start_j) or \ + (end_i and end_j and end_i == end_j): + # Fallback: exact string match if date parsing fails + is_dup = True + break + # Strategy B: content-based Jaccard similarity (fallback) + norm_i = _normalize_for_comparison(work_descs[i]) + norm_j = _normalize_for_comparison(work_descs[j]) + shorter, longer = (norm_i, norm_j) if len(norm_i) <= len(norm_j) else (norm_j, norm_i) + if shorter and longer and shorter in longer: + is_dup = True + break + jac = _shingling_jaccard(work_descs[i], work_descs[j], n=5) + if jac > 0.5: + is_dup = True + break + if is_dup: + dup_corp = corp_names[i] if i < len(corp_names) else f"#{i+1}" + logger.debug(f"Work desc internal duplicate removed: {dup_corp}") + else: + kept_indices.append(i) + # Only update when entries were actually removed + if len(kept_indices) < len(work_descs): + resume["work_desc_tks"] = [work_descs[i] for i in kept_indices] + if corp_names: + resume["corp_nm_tks"] = [corp_names[i] for i in kept_indices if i < len(corp_names)] + if work_details: + resume["_work_exp_details"] = [work_details[i] for i in kept_indices if i < len(work_details)] + if positions: + resume["position_name_tks"] = [positions[i] for i in kept_indices if i < len(positions)] + # Recalculate work years based on deduplicated entries + new_details = 
resume.get("_work_exp_details", []) + if new_details: + recalc_years = sum(d.get("years", 0) for d in new_details) + recalc_years = round(recalc_years, 1) + if recalc_years > 0: + resume["work_exp_flt"] = recalc_years + logger.info(f"Work years recalculated: {recalc_years} yrs (before dedup: {_calculate_work_years([{'start_date': d.get('start_date',''), 'end_date': d.get('end_date','')} for d in work_details])} yrs)") + new_corps = resume.get("corp_nm_tks", []) + if new_corps: + resume["corporation_name_tks"] = new_corps[0] + # --- Phase 3.5: Merge project_desc_tks into work_desc_tks --- + # Instead of complex cross-dedup, we simply merge unique project descriptions into + # work_desc_tks and clear project_desc_tks. This avoids the problem where LLM extracts + # the same content into both fields with slightly different wording. + # After merge, project_desc_tks is emptied so _build_chunk_document won't generate + # duplicate chunks. Project names are preserved in project_tks for reference. 
+ work_descs = resume.get("work_desc_tks", []) + project_descs = resume.get("project_desc_tks", []) + # Save pre-merge project descriptions for debugging + resume["_raw_project_descs"] = list(project_descs) if project_descs else [] + if project_descs: + project_names = resume.get("project_tks", []) + merged_count = 0 + skipped_count = 0 + for i, proj_desc in enumerate(project_descs): + norm_proj = _normalize_for_comparison(proj_desc) + if not norm_proj: + continue + # Check if this project desc already exists in work_descs (exact or near-duplicate) + already_exists = False + for wd in work_descs: + norm_wd = _normalize_for_comparison(wd) + if not norm_wd: + continue + # Substring containment check + shorter, longer = (norm_proj, norm_wd) if len(norm_proj) <= len(norm_wd) else (norm_wd, norm_proj) + if shorter in longer: + already_exists = True + break + # Jaccard similarity check + if _shingling_jaccard(proj_desc, wd, n=5) > 0.5: + already_exists = True + break + if already_exists: + skipped_count += 1 + proj_name = project_names[i] if i < len(project_names) else f"#{i+1}" + logger.debug(f"Project desc already in work_desc, skipped: {proj_name}") + else: + # Append to work_desc_tks with project name prefix for context + proj_name = project_names[i] if i < len(project_names) else "" + if proj_name: + proj_desc_with_prefix = f"[{proj_name}] {proj_desc}" + else: + proj_desc_with_prefix = proj_desc + work_descs.append(proj_desc_with_prefix) + merged_count += 1 + resume["work_desc_tks"] = work_descs + # Clear project_desc_tks — all content is now in work_desc_tks + resume["project_desc_tks"] = [] + logger.info(f"Merged project descs into work_desc_tks: {merged_count} merged, {skipped_count} skipped (duplicate)") # --- Phase 4: Field completion --- required_fields = [ "name_kwd", "gender_kwd", "phone_kwd", "email_tks", @@ -2561,3 +2696,45 @@ def _layout_detect_reorder(blocks: list[dict], binary: bytes) -> list[dict]: except Exception as e: logger.warning(f"Layout 
def _text_shingles(text: str, n: int = 5) -> set[tuple[int, ...]]:
    """
    Generate a text fingerprint set using tiktoken BPE tokenization + n-gram shingling.

    Compared to character-level splitting, BPE tokens have better granularity,
    and n-grams preserve word order, providing more accurate overlap measurement.

    Args:
        text: Original text
        n: Shingling window size, default 5
    Returns:
        Set of n-gram shingles (each shingle is a tuple of token ids). The set is
        empty when ``text`` is empty or when the module-level tiktoken encoding
        is unavailable (``_tiktoken_encoding is None``).
    """
    if not text or _tiktoken_encoding is None:
        return set()
    tokens = _tiktoken_encoding.encode(text)
    if not tokens:
        return set()
    if len(tokens) < n:
        # Text too short for a full window: use the entire token sequence as a single shingle
        return {tuple(tokens)}
    window_count = len(tokens) - n + 1
    return {tuple(tokens[start:start + n]) for start in range(window_count)}


def _shingling_jaccard(text1: str, text2: str, n: int = 5) -> float:
    """
    Compute Jaccard similarity between two texts using tiktoken shingling.

    Args:
        text1: First text
        text2: Second text
        n: Shingling window size
    Returns:
        Jaccard similarity [0.0, 1.0]. When neither text yields any shingle
        (both empty, or the tokenizer is unavailable) the result is 1.0 only if
        the two texts are literally equal, otherwise 0.0.
    """
    shingles_1 = _text_shingles(text1, n=n)
    shingles_2 = _text_shingles(text2, n=n)
    union = shingles_1 | shingles_2
    if not union:
        # No fingerprints on either side. This happens for two empty texts, but also
        # for ANY pair of texts when the tokenizer is missing. Reporting 1.0 in the
        # latter case would make every "> threshold" dedup check treat unrelated
        # texts as duplicates and silently drop content, so only literally equal
        # texts count as identical here.
        return 1.0 if (text1 or "") == (text2 or "") else 0.0
    return len(shingles_1 & shingles_2) / len(union)