From f8c91e8854b7958766e53bcded94db01c183b216 Mon Sep 17 00:00:00 2001 From: Yao Wei <251109226@qq.com> Date: Mon, 2 Mar 2026 19:05:50 +0800 Subject: [PATCH] Refa: Resume parsing module (architectural optimizations based on SmartResume Pipeline) (#13255) Core optimizations (refer to arXiv:2510.09722): 1. PDF text fusion: Metadata + OCR dual-path extraction and fusion 2. Page-aware reconstruction: YOLOv10 page segmentation + hierarchical sorting + line number indexing 3. Parallel task decomposition: Basic information/work experience/educational background three-way parallel LLM extraction 4. Index pointer mechanism: LLM returns a range of line numbers instead of generating the full text, reducing the illusion of full text. --------- Co-authored-by: Aron.Yao Co-authored-by: Aron.Yao Co-authored-by: Yingfeng --- rag/app/resume.py | 2669 ++++++++++++++++++++++++-- rag/prompts/resume_basic_info.md | 39 + rag/prompts/resume_basic_info_en.md | 39 + rag/prompts/resume_education.md | 31 + rag/prompts/resume_education_en.md | 31 + rag/prompts/resume_project_exp.md | 31 + rag/prompts/resume_project_exp_en.md | 31 + rag/prompts/resume_system.md | 3 + rag/prompts/resume_system_en.md | 3 + rag/prompts/resume_work_exp.md | 39 + rag/prompts/resume_work_exp_en.md | 38 + 11 files changed, 2810 insertions(+), 144 deletions(-) create mode 100644 rag/prompts/resume_basic_info.md create mode 100644 rag/prompts/resume_basic_info_en.md create mode 100644 rag/prompts/resume_education.md create mode 100644 rag/prompts/resume_education_en.md create mode 100644 rag/prompts/resume_project_exp.md create mode 100644 rag/prompts/resume_project_exp_en.md create mode 100644 rag/prompts/resume_system.md create mode 100644 rag/prompts/resume_system_en.md create mode 100644 rag/prompts/resume_work_exp.md create mode 100644 rag/prompts/resume_work_exp_en.md diff --git a/rag/app/resume.py b/rag/app/resume.py index b022f81b30..084f8c21b4 100644 --- a/rag/app/resume.py +++ b/rag/app/resume.py @@ -14,167 +14,2548 @@ # limitations under the License. # -import logging -import base64 -import datetime +""" +Resume parsing module (aligned with SmartResume Pipeline architecture optimization) + +Key optimizations (ref: arXiv:2510.09722): + 1. PDF text fusion: metadata + OCR dual-path extraction and fusion + 2. Layout-aware reconstruction: YOLOv10 layout segmentation + hierarchical sorting + line indexing + 3. Parallel task decomposition: basic info / work experience / education - 3-way parallel LLM extraction + 4. Index pointer mechanism: LLM returns line number ranges instead of generating full text, reducing hallucination + 5. Four-stage post-processing: source text re-extraction, domain normalization, context deduplication, source text validation + +Compatibility: + - chunk(filename, binary, callback, **kwargs) signature remains unchanged + - Compatible with FACTORY[ParserType.RESUME.value] in task_executor.py +""" + import json import re -import pandas as pd -import requests -from api.db.services.knowledgebase_service import KnowledgebaseService -from rag.nlp import rag_tokenizer -from deepdoc.parser.resume import refactor -from deepdoc.parser.resume import step_one, step_two -from common.string_utils import remove_redundant_spaces +import random +import datetime +import unicodedata +import concurrent.futures +from io import BytesIO +from typing import Optional +import numpy as np -forbidden_select_fields4resume = [ - "name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd", "sch_rank_kwd", "edu_fea_kwd" +# tiktoken for long random string filtering (ref: SmartResume should_remove strategy) +try: + import tiktoken + _tiktoken_encoding = tiktoken.encoding_for_model("gpt-3.5-turbo") +except ImportError: + _tiktoken_encoding = None + +# Long random string pattern: 40+ char alphanumeric mixed strings (hash, token, tracking ID, etc.) +_LONG_RANDOM_PATTERN = re.compile(r'[a-zA-Z0-9\-~_]{40,}') + +import logging as logger +from rag.nlp import rag_tokenizer +from deepdoc.parser.utils import get_text + +# json_repair for fixing malformed JSON from LLM responses (ref: SmartResume fault-tolerance strategy) +try: + import json_repair +except ImportError: + json_repair = None + +# YOLOv10 layout detector (lazy initialization to avoid loading model when unused) +_layout_recognizer = None + + +def _get_layout_recognizer(): + """ + Get YOLOv10 layout detector singleton (lazy loading) + + Uses the existing deepdoc LayoutRecognizer based on layout.onnx model. + + Returns: + LayoutRecognizer instance, or None if loading fails + """ + global _layout_recognizer + if _layout_recognizer is None: + try: + from deepdoc.vision import LayoutRecognizer + _layout_recognizer = LayoutRecognizer("layout") + logger.info("YOLOv10 layout detector loaded successfully") + except Exception as e: + logger.warning(f"YOLOv10 layout detector loading failed, falling back to heuristic sorting: {e}") + _layout_recognizer = False # Mark as failed to avoid repeated attempts + return _layout_recognizer if _layout_recognizer is not False else None + +# ==================== Constants ==================== + +# Fields forbidden from being used as select fields in resume +FORBIDDEN_SELECT_FIELDS = [ + "name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd", + "sch_rank_kwd", "edu_fea_kwd" ] +# Field name to description mapping (bilingual versions for chunk construction) +FIELD_MAP_ZH = { + "name_kwd": "姓名/名字", + "name_pinyin_kwd": "姓名拼音/名字拼音", + "gender_kwd": "性别(男,女)", + "age_int": "年龄/岁/年纪", + "phone_kwd": "电话/手机/微信", + "email_tks": "email/e-mail/邮箱", + "position_name_tks": "职位/职能/岗位/职责", + "expect_city_names_tks": "期望城市", + "work_exp_flt": "工作年限/工作年份/N年经验/毕业了多少年", + "corporation_name_tks": "最近就职(上班)的公司/上一家公司", + "first_school_name_tks": "第一学历毕业学校", + "first_degree_kwd": "第一学历", + "highest_degree_kwd": "最高学历", + "first_major_tks": "第一学历专业", + "edu_first_fea_kwd": "第一学历标签", + "degree_kwd": "过往学历", + "major_tks": "学过的专业/过往专业", + "school_name_tks": "学校/毕业院校", + "sch_rank_kwd": "学校标签", + "edu_fea_kwd": "教育标签", + "corp_nm_tks": "就职过的公司/之前的公司/上过班的公司", + "edu_end_int": "毕业年份", + "industry_name_tks": "所在行业", + "birth_dt": "生日/出生年份", + "expect_position_name_tks": "期望职位/期望职能/期望岗位", + "skill_tks": "技能/技术栈/编程语言/框架/工具", + "language_tks": "语言能力/外语水平", + "certificate_tks": "证书/资质/认证", + "project_tks": "项目经验/项目名称", + "work_desc_tks": "工作职责/工作描述", + "project_desc_tks": "项目描述/项目职责", + "self_evaluation_tks": "自我评价/个人优势/个人总结", +} -def remote_call(filename, binary): - q = { - "header": { - "uid": 1, - "user": "kevinhu", - "log_id": filename - }, - "request": { - "p": { - "request_id": "1", - "encrypt_type": "base64", - "filename": filename, - "langtype": '', - "fileori": base64.b64encode(binary).decode('utf-8') - }, - "c": "resume_parse_module", - "m": "resume_parse" +FIELD_MAP_EN = { + "name_kwd": "Name", + "name_pinyin_kwd": "Name Pinyin", + "gender_kwd": "Gender (Male, Female)", + "age_int": "Age", + "phone_kwd": "Phone/Mobile/WeChat", + "email_tks": "Email", + "position_name_tks": "Position/Title/Role", + "expect_city_names_tks": "Preferred City", + "work_exp_flt": "Years of Experience", + "corporation_name_tks": "Most Recent Company", + "first_school_name_tks": "First Degree School", + "first_degree_kwd": "First Degree", + "highest_degree_kwd": "Highest Degree", + "first_major_tks": "First Degree Major", + "edu_first_fea_kwd": "First Degree Tag", + "degree_kwd": "Past Degrees", + "major_tks": "Past Majors", + "school_name_tks": "School/University", + "sch_rank_kwd": "School Tag", + "edu_fea_kwd": "Education Tag", + "corp_nm_tks": "Past Companies", + "edu_end_int": "Graduation Year", + "industry_name_tks": "Industry", + "birth_dt": "Date of Birth", + "expect_position_name_tks": "Preferred Position/Role", + "skill_tks": "Skills/Tech Stack/Languages/Frameworks/Tools", + "language_tks": "Language Proficiency", + "certificate_tks": "Certificates/Qualifications", + "project_tks": "Project Experience/Project Name", + "work_desc_tks": "Job Responsibilities/Description", + "project_desc_tks": "Project Description/Responsibilities", + "self_evaluation_tks": "Self-Evaluation/Personal Strengths/Summary", +} + + +def _is_english(lang: str) -> bool: + """Determine if the language parameter indicates English""" + return lang.lower() in ("english", "en") + + +def get_field_map(lang: str) -> dict: + """Get the corresponding field mapping based on language parameter""" + return FIELD_MAP_EN if _is_english(lang) else FIELD_MAP_ZH + + +# Backward compatible: default to Chinese version +FIELD_MAP = FIELD_MAP_ZH + + +# ==================== Parallel LLM Extraction Prompt Templates ==================== +# Ref: SmartResume task decomposition strategy, splitting extraction into independent subtasks +# Each prompt ends with /no_think marker to suppress reasoning model's thinking output +# Prompts loaded from md files under rag/prompts/, supporting bilingual versions + +from rag.prompts.template import load_prompt + + +def _load_resume_prompt(name: str, lang: str) -> str: + """Load the corresponding version of resume prompt template based on language parameter + + Args: + name: Prompt name (without language suffix), e.g. "resume_system" + lang: Language parameter, e.g. "Chinese" or "English" + Returns: + Prompt template string + """ + suffix = "_en" if _is_english(lang) else "" + return load_prompt(f"{name}{suffix}") + + +def get_system_prompt(lang: str) -> str: + """Get system prompt""" + return _load_resume_prompt("resume_system", lang) + + +def get_basic_info_prompt(lang: str) -> str: + """Get basic info extraction prompt""" + return _load_resume_prompt("resume_basic_info", lang) + + +def get_work_exp_prompt(lang: str) -> str: + """Get work experience extraction prompt""" + return _load_resume_prompt("resume_work_exp", lang) + + +def get_education_prompt(lang: str) -> str: + """Get education background extraction prompt""" + return _load_resume_prompt("resume_education", lang) + + +def get_project_exp_prompt(lang: str) -> str: + """Get project experience extraction prompt""" + return _load_resume_prompt("resume_project_exp", lang) + + +# Backward compatible: default Chinese version constants (for possible external direct references) +SYSTEM_PROMPT = load_prompt("resume_system") +BASIC_INFO_PROMPT = load_prompt("resume_basic_info") +WORK_EXP_PROMPT = load_prompt("resume_work_exp") +EDUCATION_PROMPT = load_prompt("resume_education") +PROJECT_EXP_PROMPT = load_prompt("resume_project_exp") + +# LLM call max retry count (ref: SmartResume retry strategy) +_LLM_MAX_RETRIES = 2 + + +def _normalize_whitespace(text: str) -> str: + """ + Unicode whitespace normalization (ref: SmartResume _clean_text_content) + + Replaces various Unicode spaces (\u00A0 non-breaking space, \u3000 fullwidth space, + \u2000-\u200A various width spaces, etc.) with regular spaces, + then applies NFKC normalization (fullwidth to halfwidth) and merges consecutive spaces. + + Args: + text: Original text + Returns: + Normalized text + """ + if not text: + return "" + # NFKC normalization (fullwidth to halfwidth, etc.) + text = unicodedata.normalize('NFKC', text) + # Unify various Unicode spaces to regular space + text = re.sub( + r'[\u0020\u00A0\u1680\u2000-\u200A\u2028\u2029\u202F\u205F\u3000\u00A7]', + ' ', text + ) + # Merge consecutive spaces + text = re.sub(r' {2,}', ' ', text) + return text.strip() + + +def _should_remove_random_str(match: re.Match) -> bool: + """ + Determine if a matched long string is a meaningless random string (ref: SmartResume should_remove) + + Uses tiktoken encoding to judge: if token count exceeds 50% of original char count, + it indicates a meaningless random string (hash, token, tracking ID, etc.) that should be removed. + Normal English words have high token encoding efficiency, with token count far less than char count. + + Args: + match: Regex match object + Returns: + True means it should be removed + """ + if _tiktoken_encoding is None: + # When tiktoken is unavailable, use simple heuristic: case/digit alternation frequency + s = match.group(0) + changes = sum( + 1 for i in range(1, len(s)) + if s[i].isdigit() != s[i-1].isdigit() + or (s[i].isalpha() and s[i-1].isalpha() and s[i].isupper() != s[i-1].isupper()) + ) + return changes / len(s) > 0.3 + encoded = _tiktoken_encoding.encode(match.group(0)) + return len(encoded) > len(match.group(0)) * 0.5 + + +def _clean_line_content(text: str) -> str: + """ + Clean single line text content (Unicode normalization + long random string filtering) + + Args: + text: Original line text + Returns: + Cleaned text + """ + if not text: + return "" + # Unicode whitespace normalization + text = _normalize_whitespace(text) + # Filter long random strings (hash, token and other meaningless content) + text = _LONG_RANDOM_PATTERN.sub( + lambda m: '' if _should_remove_random_str(m) else m.group(0), + text + ) + # Clean up extra spaces after filtering + text = re.sub(r' {2,}', ' ', text).strip() + return text + + +# ==================== Phase 1: PDF Text Fusion and Layout Reconstruction ==================== + + + + +def _is_noise_char(obj: dict) -> bool: + """ + Determine if a PDF character object is a decorative layer noise character + + Uses a "body text whitelist" strategy instead of enumerating noise features, + to handle noise patterns from different resume templates: + + Two reliable features of body text characters (either one means body text): + 1. Embedded font: Font name format is XXXXXX+FontName (contains '+'), + indicating the font is embedded in the PDF, chosen by the document author + 2. Structure tag: Has PDF Tagged Structure tags (e.g., Span, P, NonStruct, etc.), + indicating the character belongs to the document's semantic structure tree + + Common features of noise characters: + - Uses system fonts (e.g., Helvetica, Arial), font name doesn't contain '+' + - No structure tags (tag is None or non-semantic tags like 'OC') + - Common in resume template background decorations, watermarks, tracking marks + + Args: + obj: pdfplumber character/text object dictionary + Returns: + True means it's a noise character that should be filtered + """ + # Whitelist condition 1: Embedded font (font name contains '+' prefix) + fontname = obj.get("fontname", "") + if "+" in fontname: + return False # Embedded font = body content + + # Whitelist condition 2: Has PDF structure tag + tag = obj.get("tag") + if tag in ("Span", "NonStruct", "P", "H1", "H2", "H3", "H4", "H5", "H6", + "TD", "TH", "LI", "L", "Table", "TR", "Figure", "Caption"): + return False # Has semantic structure tag = body content + + # Doesn't meet any whitelist condition, treat as noise + return True + + + +def _extract_metadata_text(binary: bytes) -> list[dict]: + """ + Extract text blocks from PDF metadata (with coordinate info) + + Strategy: + 1. Use whitelist strategy to filter decorative layer noise chars (embedded font or structure tag = body text) + 2. Safe fallback: if filtered chars are less than 30% of original, skip filtering to avoid false positives + 3. Use extract_words for word-level extraction (with real coordinates) + 4. Aggregate adjacent words into line-level text blocks by Y coordinate + 5. Additionally extract table content (many resumes use table layouts) + + Args: + binary: PDF file binary content + Returns: + List of text blocks, each containing text, x0, top, x1, bottom, page fields + """ + try: + import pdfplumber + blocks = [] + with pdfplumber.open(BytesIO(binary)) as pdf: + for page_idx, page in enumerate(pdf.pages): + page_width = page.width or 600 + + # Filter decorative layer noise chars (whitelist strategy based on embedded font + structure tag) + # Safe fallback: if filtered chars are less than 30% of original, the PDF's body text + # may use non-embedded fonts without structure tags, skip filtering to avoid false positives + try: + original_char_count = len(page.chars) + filtered_page = page.filter( + lambda obj: not _is_noise_char(obj) + ) + filtered_char_count = len(filtered_page.chars) + if original_char_count > 0 and filtered_char_count < original_char_count * 0.3: + # Filtered out over 70% of chars, likely false positives, fall back to original page + filtered_page = page + except Exception: + filtered_page = page + + # Use extract_words for extraction (with real coordinates) + words = [] + try: + words = filtered_page.extract_words( + keep_blank_chars=False, use_text_flow=True + ) + except Exception: + pass + + if words: + # Aggregate adjacent words into line-level text blocks by Y coordinate + # Words on the same line: top coordinate difference within threshold + line_threshold = 5 # Y coordinate difference threshold (unit: PDF points) + current_line_words = [words[0]] + + def _flush_line(line_words): + """Merge words in a line into a single text block""" + # Sort by x0 to ensure left-to-right order + line_words.sort(key=lambda w: float(w.get("x0", 0))) + texts = [] + for w in line_words: + texts.append(w.get("text", "")) + merged_text = " ".join(texts) + if not merged_text.strip(): + return None + return { + "text": merged_text.strip(), + "x0": float(min(w.get("x0", 0) for w in line_words)), + "top": float(min(w.get("top", 0) for w in line_words)), + "x1": float(max(w.get("x1", 0) for w in line_words)), + "bottom": float(max(w.get("bottom", 0) for w in line_words)), + "page": page_idx, + } + + for w in words[1:]: + w_top = float(w.get("top", 0)) + cur_top = float(current_line_words[0].get("top", 0)) + if abs(w_top - cur_top) <= line_threshold: + current_line_words.append(w) + else: + block = _flush_line(current_line_words) + if block: + blocks.append(block) + current_line_words = [w] + + # Process the last line + if current_line_words: + block = _flush_line(current_line_words) + if block: + blocks.append(block) + else: + # Fall back to extract_text when extract_words fails + page_text = None + try: + page_text = page.extract_text() + except Exception: + pass + if page_text and page_text.strip(): + raw_lines = page_text.split("\n") + line_height = 16 + for i, line in enumerate(raw_lines): + cleaned = line.strip() + if not cleaned: + continue + blocks.append({ + "text": cleaned, + "x0": 0, + "top": i * line_height, + "x1": page_width, + "bottom": i * line_height + line_height - 2, + "page": page_idx, + }) + + # Extract table content from the page + # Many resumes use table layouts (e.g., personal info section), extract_words may miss table structure + try: + tables = page.extract_tables() + if tables: + page_blocks = [b for b in blocks if b["page"] == page_idx] + max_top = max((b["top"] for b in page_blocks), default=0) + 20 + row_height = 16 + + for table in tables: + for row in table: + if not row: + continue + cells = [str(c).strip() for c in row if c and str(c).strip()] + if not cells: + continue + row_text = " | ".join(cells) + # Dedup: check if table content was already extracted by extract_words + is_dup = False + for pb in page_blocks: + if all(c in pb["text"] for c in cells[:2]): + is_dup = True + break + if is_dup: + continue + blocks.append({ + "text": row_text, + "x0": 0, + "top": max_top, + "x1": page_width, + "bottom": max_top + row_height - 2, + "page": page_idx, + }) + max_top += row_height + except Exception as e: + logger.debug(f"PDF table extraction skipped (page {page_idx}): {e}") + return blocks + except Exception as e: + logger.warning(f"PDF metadata extraction failed: {e}") + return [] + +def _extract_ocr_text(binary: bytes, meta_blocks: list[dict] | None = None) -> list[dict]: + """ + Extract OCR text blocks using blackout strategy (with coordinate info). + + Strategy (ref: SmartResume): + 1. Render PDF pages to images + 2. Black out regions already extracted by metadata + 3. Run OCR on the blacked-out image, only recognizing content metadata missed + 4. Eliminates duplication at source, no IoU dedup needed downstream + + Args: + binary: PDF file binary content + meta_blocks: Text blocks from metadata extraction, used to black out existing text regions + Returns: + List of text blocks, each containing text, x0, top, x1, bottom, page fields + """ + if meta_blocks is None: + meta_blocks = [] + try: + import pdfplumber + from deepdoc.vision.ocr import OCR + import numpy as np + + ocr = OCR() + blocks = [] + + with pdfplumber.open(BytesIO(binary)) as pdf: + for page_idx, page in enumerate(pdf.pages): + # Render page to image (resolution=216 = 3x scale, since PDF default is 72 DPI) + img = page.to_image(resolution=216) + page_img = np.array(img.annotated) + + # Scale factor from PDF coordinates to image coordinates + pdf_to_img_scale = 216.0 / 72.0 # = 3.0 + + # Black out metadata-extracted text regions before OCR + page_meta_blocks = [b for b in meta_blocks if b.get("page") == page_idx] + if page_meta_blocks: + page_img = _blackout_text_regions(page_img, meta_blocks, page_idx, pdf_to_img_scale) + + ocr_result = ocr(page_img) + if not ocr_result: + continue + for box_info in ocr_result: + if isinstance(box_info, (list, tuple)) and len(box_info) >= 2: + coords = box_info[0] # Coordinate points + text_info = box_info[1] + text = text_info[0] if isinstance(text_info, (list, tuple)) else str(text_info) + if text.strip() and isinstance(coords, (list, tuple)) and len(coords) >= 4: + # Extract bounding box from four corner points + xs = [p[0] for p in coords if isinstance(p, (list, tuple))] + ys = [p[1] for p in coords if isinstance(p, (list, tuple))] + if xs and ys: + blocks.append({ + "text": text.strip(), + "x0": min(xs), "top": min(ys), + "x1": max(xs), "bottom": max(ys), + "page": page_idx, + }) + return blocks + except Exception as e: + logger.warning(f"OCR extraction failed: {e}") + return [] + + +def _fuse_text_blocks(meta_blocks: list[dict], ocr_blocks: list[dict]) -> list[dict]: + """ + Fuse PDF metadata text and OCR text (blackout strategy version). + + Since the OCR phase already blacks out metadata-extracted regions, OCR only recognizes + content that metadata missed. Therefore this function only needs to: + 1. Filter out garbled blocks from metadata + 2. Directly merge valid metadata blocks and OCR blocks (no IoU dedup needed) + + Args: + meta_blocks: Text blocks from metadata extraction + ocr_blocks: Text blocks from OCR extraction (already deduplicated via blackout strategy) + Returns: + Fused text block list + """ + if not ocr_blocks: + return meta_blocks + if not meta_blocks: + return ocr_blocks + + # Filter out garbled blocks from metadata + valid_meta = [] + garbled_count = 0 + for b in meta_blocks: + if _is_valid_line(b.get("text", "")): + valid_meta.append(b) + else: + garbled_count += 1 + + if garbled_count: + logger.info(f"Detected {garbled_count} garbled blocks in metadata, filtered out") + + # Under blackout strategy, OCR won't re-recognize existing text, just merge directly + fused = valid_meta + ocr_blocks + return fused + + + + +def _layout_aware_reorder(blocks: list[dict]) -> list[dict]: + """ + Layout-aware hierarchical sorting (ref: SmartResume Hierarchical Re-ordering) + + Two-level sorting strategy: + 1. Inter-segment sorting: first by page number, then by Y coordinate (top to bottom), same row by X coordinate (left to right) + 2. Intra-segment sorting: within each logical segment, sort by reading order + + For multi-column resumes, detect column positions by clustering X coordinates, + then sort by column order. + + Args: + blocks: Text block list (with coordinate info) + Returns: + Sorted text block list + """ + if not blocks: + return blocks + + # Group by page + pages = {} + for b in blocks: + pg = b.get("page", 0) + pages.setdefault(pg, []).append(b) + + sorted_blocks = [] + for pg in sorted(pages.keys()): + page_blocks = pages[pg] + + # Detect multi-column layout: by X coordinate median + if len(page_blocks) > 5: + x_centers = [(b["x0"] + b["x1"]) / 2 for b in page_blocks] + x_min, x_max = min(x_centers), max(x_centers) + page_width = x_max - x_min if x_max > x_min else 1 + + # Simple two-column detection: if text blocks are clearly distributed on left and right sides + mid_x = (x_min + x_max) / 2 + left_count = sum(1 for x in x_centers if x < mid_x - page_width * 0.1) + right_count = sum(1 for x in x_centers if x > mid_x + page_width * 0.1) + + if left_count > 3 and right_count > 3: + # Multi-column layout: left column first then right column, each column top to bottom + left_blocks = [b for b in page_blocks if (b["x0"] + b["x1"]) / 2 < mid_x] + right_blocks = [b for b in page_blocks if (b["x0"] + b["x1"]) / 2 >= mid_x] + left_blocks.sort(key=lambda b: (b["top"], b["x0"])) + right_blocks.sort(key=lambda b: (b["top"], b["x0"])) + sorted_blocks.extend(left_blocks) + sorted_blocks.extend(right_blocks) + continue + + # Single-column layout: top to bottom, same row left to right + page_blocks.sort(key=lambda b: (b["top"], b["x0"])) + sorted_blocks.extend(page_blocks) + + return sorted_blocks + + +def _build_indexed_text(blocks: list[dict]) -> tuple[str, list[str], list[dict]]: + """ + + Build indexed text with line numbers (ref: SmartResume Indexed Linearization) + + Merges sorted text blocks into lines and adds a unique index number to each line. + Includes garbled line filtering logic and field label split repair. + Also preserves coordinate info for each line, used for writing position_int etc. to chunks. + + Args: + blocks: Sorted text block list + Returns: + (indexed_text, lines, line_positions) tuple: + - indexed_text: Text string with line numbers + - lines: Original line text list (without line numbers) + - line_positions: Coordinate info for each line, format: + """ + if not blocks: + return "", [], [] + + raw_lines = [] + raw_positions = [] + current_line_parts = [] + current_line_blocks = [] + current_top = blocks[0].get("top", 0) + current_layoutno = blocks[0].get("layoutno", "") + threshold = 10 + + def _merge_line_position(line_blocks: list[dict]) -> dict: + """Merge coordinates of all blocks in a line into outer bounding rectangle""" + return { + "page": line_blocks[0].get("page", 0), + "x0": min(b.get("x0", 0) for b in line_blocks), + "x1": max(b.get("x1", 0) for b in line_blocks), + "top": min(b.get("top", 0) for b in line_blocks), + "bottom": max(b.get("bottom", 0) for b in line_blocks), } - } - for _ in range(3): - try: - resume = requests.post( - "http://127.0.0.1:61670/tog", - data=json.dumps(q)) - resume = resume.json()["response"]["results"] - resume = refactor(resume) - for k in ["education", "work", "project", - "training", "skill", "certificate", "language"]: - if not resume.get(k) and k in resume: - del resume[k] - resume = step_one.refactor(pd.DataFrame([{"resume_content": json.dumps(resume), "tob_resume_id": "x", - "updated_at": datetime.datetime.now().strftime( - "%Y-%m-%d %H:%M:%S")}])) - resume = step_two.parse(resume) - return resume - except Exception: - logging.exception("Resume parser has not been supported yet!") - return {} + for b in blocks: + b_layoutno = b.get("layoutno", "") + y_changed = abs(b.get("top", 0) - current_top) > threshold + layout_changed = b_layoutno != current_layoutno and current_layoutno and b_layoutno + if (y_changed or layout_changed) and current_line_parts: + raw_lines.append(" ".join(current_line_parts)) + raw_positions.append(_merge_line_position(current_line_blocks)) + current_line_parts = [] + current_line_blocks = [] + current_top = b.get("top", 0) + current_layoutno = b_layoutno + current_line_parts.append(b["text"]) + current_line_blocks.append(b) + if current_line_parts: + raw_lines.append(" ".join(current_line_parts)) + raw_positions.append(_merge_line_position(current_line_blocks)) -def chunk(filename, binary=None, callback=None, **kwargs): - """ - The supported file formats are pdf, docx and txt. - To maximize the effectiveness, parse the resume correctly, please concat us: https://github.com/infiniflow/ragflow - """ - if not re.search(r"\.(pdf|doc|docx|txt)$", filename, flags=re.IGNORECASE): - raise NotImplementedError("file type not supported yet(pdf supported)") - - if not binary: - with open(filename, "rb") as f: - binary = f.read() - - callback(0.2, "Resume parsing is going on...") - resume = remote_call(filename, binary) - if len(resume.keys()) < 7: - callback(-1, "Resume is not successfully parsed.") - raise Exception("Resume parser remote call fail!") - callback(0.6, "Done parsing. Chunking...") - logging.debug("chunking resume: " + json.dumps(resume, ensure_ascii=False, indent=2)) - - field_map = { - "name_kwd": "姓名/名字", - "name_pinyin_kwd": "姓名拼音/名字拼音", - "gender_kwd": "性别(男,女)", - "age_int": "年龄/岁/年纪", - "phone_kwd": "电话/手机/微信", - "email_tks": "email/e-mail/邮箱", - "position_name_tks": "职位/职能/岗位/职责", - "expect_city_names_tks": "期望城市", - "work_exp_flt": "工作年限/工作年份/N年经验/毕业了多少年", - "corporation_name_tks": "最近就职(上班)的公司/上一家公司", - - "first_school_name_tks": "第一学历毕业学校", - "first_degree_kwd": "第一学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA)", - "highest_degree_kwd": "最高学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA)", - "first_major_tks": "第一学历专业", - "edu_first_fea_kwd": "第一学历标签(211,留学,双一流,985,海外知名,重点大学,中专,专升本,专科,本科,大专)", - - "degree_kwd": "过往学历(高中,职高,硕士,本科,博士,初中,中技,中专,专科,专升本,MPA,MBA,EMBA)", - "major_tks": "学过的专业/过往专业", - "school_name_tks": "学校/毕业院校", - "sch_rank_kwd": "学校标签(顶尖学校,精英学校,优质学校,一般学校)", - "edu_fea_kwd": "教育标签(211,留学,双一流,985,海外知名,重点大学,中专,专升本,专科,本科,大专)", - - "corp_nm_tks": "就职过的公司/之前的公司/上过班的公司", - "edu_end_int": "毕业年份", - "industry_name_tks": "所在行业", - - "birth_dt": "生日/出生年份", - "expect_position_name_tks": "期望职位/期望职能/期望岗位", - } - - titles = [] - for n in ["name_kwd", "gender_kwd", "position_name_tks", "age_int"]: - v = resume.get(n, "") - if isinstance(v, list): - v = v[0] - if n.find("tks") > 0: - v = remove_redundant_spaces(v) - titles.append(str(v)) - doc = { - "docnm_kwd": filename, - "title_tks": rag_tokenizer.tokenize("-".join(titles) + "-简历") - } - doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) - pairs = [] - for n, m in field_map.items(): - if not resume.get(n): + # Filter empty and garbled lines (sync filter coordinates) + lines = [] + line_positions = [] + for line, pos in zip(raw_lines, raw_positions): + # Unicode normalization + long random string filtering (ref: SmartResume _clean_text_content) + line = _clean_line_content(line) + if not line: continue - v = resume[n] - if isinstance(v, list): - v = " ".join(v) - if n.find("tks") > 0: - v = remove_redundant_spaces(v) - pairs.append((m, str(v))) - - doc["content_with_weight"] = "\n".join( - ["{}: {}".format(re.sub(r"([^()]+)", "", k), v) for k, v in pairs]) - doc["content_ltks"] = rag_tokenizer.tokenize(doc["content_with_weight"]) - doc["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(doc["content_ltks"]) - for n, _ in field_map.items(): - if n not in resume: + # Garbled detection: skip if valid chars (Chinese/ASCII letters/digits/common punctuation) ratio is too low + if not _is_valid_line(line): continue - if isinstance(resume[n], list) and ( - len(resume[n]) == 1 or n not in forbidden_select_fields4resume): - resume[n] = resume[n][0] - if n.find("_tks") > 0: - resume[n] = rag_tokenizer.fine_grained_tokenize(resume[n]) - doc[n] = resume[n] + lines.append(line) + line_positions.append(pos) - logging.debug("chunked resume to " + str(doc)) - KnowledgebaseService.update_parser_config( - kwargs["kb_id"], {"field_map": field_map}) - return [doc] + # Fix field label split issues + # Coordinates are not affected, keep original positions + lines = _fix_split_labels(lines) + + # Build indexed text with line numbers + indexed_parts = [f"[{i}]: {line}" for i, line in enumerate(lines)] + indexed_text = "\n".join(indexed_parts) + + return indexed_text, lines, line_positions + +def _is_valid_line(line: str) -> bool: + """ + Check if a text line is valid content (not garbled) + + Multi-dimensional detection: + 1. Valid character ratio (Chinese, ASCII alphanumeric, common punctuation) + 2. Single-character spacing anomaly detection (PDF custom font mapping causing "O U W Z_W V 2" pattern) + 3. Consecutive meaningless alphanumeric sequence detection + + Args: + line: Text line to check + Returns: + True means valid line, False means garbled line + """ + if len(line) <= 3: + # Short lines may be valid content like names, keep them + return True + + cid_count = len(re.findall(r'\(cid:\d+\)', line)) + if cid_count >= 3: + return False + # Valid characters: Chinese (incl. extension), ASCII alphanumeric, common punctuation and spaces, fullwidth chars, CJK punctuation + valid_chars = re.findall( + r'[\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff' + r'a-zA-Z0-9\s@.,:;!?()()【】\-_/\\|·•' + r'、,。:;!?\u201c\u201d\u2018\u2019《》' + r'\uff01-\uff5e' + r'\u3000-\u303f' + r'#%&+=~`\u00b7\u2022\u2013\u2014' + r']', + line + ) + ratio = len(valid_chars) / len(line) if len(line) > 0 else 0 + if ratio < 0.5: + return False + + # Detect PDF custom font mapping causing single-character spacing anomaly pattern + # Feature: lots of "single letter space single letter space" sequences, e.g. "O U W Z_W V 2 X 3" + # Stats: ratio of space-separated single chars among non-space chars + spaced_singles = re.findall(r'(?:^|\s)([a-zA-Z0-9])(?:\s|$)', line) + non_space_len = len(line.replace(" ", "")) + if non_space_len > 5 and len(spaced_singles) > 0: + # If ratio of space-separated single chars to non-space chars is too high, classify as garbled + single_ratio = len(spaced_singles) / non_space_len + if single_ratio > 0.3: + return False + + # Detect consecutive meaningless mixed-case alphanumeric sequences (e.g. "UJqZX9V2") + # Normal English words don't have such frequent case alternation patterns + garbled_seqs = re.findall(r'[a-zA-Z0-9]{4,}', line.replace(" ", "")) + if garbled_seqs: + garbled_count = 0 + for seq in garbled_seqs: + # Count case alternations + case_changes = sum( + 1 for i in range(1, len(seq)) + if (seq[i].isupper() != seq[i-1].isupper() and seq[i].isalpha() and seq[i-1].isalpha()) + or (seq[i].isdigit() != seq[i-1].isdigit()) + ) + # Too high alternation frequency = garbled sequence (normal words like "Spring" have only 1 alternation) + if len(seq) >= 4 and case_changes / len(seq) > 0.5: + garbled_count += 1 + # If garbled sequence ratio is too high + if len(garbled_seqs) > 0 and garbled_count / len(garbled_seqs) > 0.4: + return False + + return True -if __name__ == "__main__": - import sys +def _fix_split_labels(lines: list[str]) -> list[str]: + """ + Fix field label split issues + + Some PDF layouts split field labels across line start/end, e.g.: + - "名:陈晓俐 姓" -> should be fixed to "姓名:陈晓俐" + - "别:男 性" -> should be fixed to "性别:男" + + Args: + lines: Original line text list + Returns: + Fixed line text list + """ + # Common split field label patterns: (line-end part, line-start part) -> full label + split_patterns = { + ("姓", "名"): "姓名", + ("性", "别"): "性别", + ("年", "龄"): "年龄", + ("电", "话"): "电话", + ("邮", "箱"): "邮箱", + ("学", "历"): "学历", + ("专", "业"): "专业", + ("地", "址"): "地址", + ("籍", "贯"): "籍贯", + ("民", "族"): "民族", + } + + fixed = [] + for line in lines: + # Detect in-line split patterns: "X:content Y" where (Y, X) is a split pair + for (suffix_char, prefix_char), full_label in split_patterns.items(): + # Pattern: "prefix_char:content suffix_char" (first half at line start, second half at line end) + pattern = rf'^({re.escape(prefix_char)})\s*[::]\s*(.+?)\s+{re.escape(suffix_char)}\s*$' + m = re.match(pattern, line) + if m: + content = m.group(2).strip() + line = f"{full_label}:{content}" + break + # Pattern: "suffix_char content prefix_char:" (second half at line start, first half at line end) + pattern2 = rf'^{re.escape(suffix_char)}\s*[::]?\s*(.+?)\s+{re.escape(prefix_char)}\s*$' + m2 = re.match(pattern2, line) + if m2: + content = m2.group(1).strip() + line = f"{full_label}:{content}" + break + fixed.append(line) + return fixed - def dummy(a, b): + + + +def extract_text(filename: str, binary: bytes) -> tuple[str, list[str], list[dict]]: + """ + Extract text content based on file type (Pipeline Phase 1). + + PDF files use dual-path fusion + layout reconstruction + line indexing. + Other formats fall back to simple text extraction. + + Args: + filename: File name + binary: File binary content + Returns: + (indexed_text, lines, line_positions) tuple: + - indexed_text: Text with line number indices + - lines: List of original line texts + - line_positions: List of per-line coordinate info (empty list for non-PDF formats) + """ + fname_lower = filename.lower() + + try: + if fname_lower.endswith(".pdf"): + # Dual-path extraction + meta_blocks = _extract_metadata_text(binary) + ocr_blocks = [] + + # Determine whether OCR supplementation is needed: + # 1. Metadata text too short (< 100 chars) + # 2. High garbled text ratio in metadata (caused by custom font mapping) + meta_text_len = sum(len(b["text"]) for b in meta_blocks) + need_ocr = False + + if meta_text_len < 100: + logger.info("PDF metadata text too short, enabling OCR supplementation") + need_ocr = True + else: + # Check metadata text quality: calculate valid line ratio + # If many lines are judged as garbled by _is_valid_line, the PDF font mapping has issues + valid_line_count = 0 + total_line_count = 0 + for b in meta_blocks: + text = b.get("text", "").strip() + if not text: + continue + total_line_count += 1 + if _is_valid_line(text): + valid_line_count += 1 + if total_line_count > 0: + valid_ratio = valid_line_count / total_line_count + if valid_ratio < 0.6: + logger.info( + f"PDF metadata text quality low (valid line ratio {valid_ratio:.1%}), enabling OCR supplementation" + ) + need_ocr = True + + if need_ocr: + # Blackout strategy: black out metadata-extracted regions before OCR + ocr_blocks = _extract_ocr_text(binary, meta_blocks=meta_blocks) + + # Text fusion + fused_blocks = _fuse_text_blocks(meta_blocks, ocr_blocks) + + # Layout-aware sorting (prefer YOLOv10 layout detection, fall back to heuristic on failure) + sorted_blocks = _layout_detect_reorder(fused_blocks, binary) + + # Build line-indexed text (with coordinate info) + return _build_indexed_text(sorted_blocks) + + elif fname_lower.endswith(".docx"): + from docx import Document + doc = Document(BytesIO(binary)) + lines = [p.text.strip() for p in doc.paragraphs if p.text.strip()] + + # Extract table content from DOCX + # Reference: table handling in naive.py Docx class + # Many resumes use table layouts for personal info; iterating only paragraphs would miss this content + for table in doc.tables: + for row in table.rows: + cells = [] + for cell in row.cells: + cell_text = cell.text.strip() + if cell_text: + cells.append(cell_text) + if not cells: + continue + row_text = " | ".join(cells) + # Deduplicate: skip if this row text already exists in lines + if row_text not in lines: + lines.append(row_text) + + indexed = "\n".join(f"[{i}]: {line}" for i, line in enumerate(lines)) + # DOCX has no coordinate info, return empty list + return indexed, lines, [] + + else: + text = get_text(filename, binary) + lines = [line.strip() for line in text.split("\n") if line.strip()] + indexed = "\n".join(f"[{i}]: {line}" for i, line in enumerate(lines)) + return indexed, lines, [] + + except Exception: + logger.exception(f"Text extraction failed: {filename}") + return "", [], [] + + +# ==================== Phase 2: Parallel LLM Structured Extraction ==================== + + +def _clean_llm_json_response(response: str) -> str: + """ + Clean LLM JSON response. + + Uses SmartResume's lightweight string extraction strategy: + 1. Remove markdown code block markers + 2. Remove ... thinking tags (reasoning models may output these) + 3. text.find("{") and text.rfind("}") to locate valid JSON block + + Args: + response: Raw LLM response text + Returns: + Cleaned JSON string + """ + text = response.strip() + # Remove markdown code block markers + text = text.replace("```json", "").replace("```", "").strip() + # Remove reasoning model thinking tags + text = re.sub(r'.*?', '', text, flags=re.DOTALL).strip() + # Clean escaped quotes (SmartResume's approach) + text = text.replace('\\"', '"') + # SmartResume strategy: locate first { and last } + start = text.find("{") + end = text.rfind("}") + if start != -1 and end != -1 and end > start: + return text[start:end + 1] + return text + + +def _parse_json_with_repair(text: str) -> dict: + """ + Parse JSON string, attempt repair on failure (ref SmartResume's json_repair strategy). + + Repair strategies: + 1. Standard json.loads + 2. Replace Python-style booleans/None + 3. Use json_repair library + + Args: + text: JSON string + Returns: + Parsed dictionary + Raises: + json.JSONDecodeError: Raised when all repair strategies fail + """ + # First attempt: standard parsing + try: + return json.loads(text) + except json.JSONDecodeError: pass + # Second attempt: replace Python-style values (ref SmartResume) + repaired = text.replace("'", '"') + repaired = repaired.replace('True', 'true') + repaired = repaired.replace('False', 'false') + repaired = repaired.replace('None', 'null') + try: + return json.loads(repaired) + except json.JSONDecodeError: + pass - chunk(sys.argv[1], callback=dummy) + # Third attempt: use json_repair library + if json_repair is not None: + try: + return json_repair.loads(text) + except Exception: + pass + + # All strategies failed + raise json.JSONDecodeError("All JSON repair strategies failed", text, 0) + + +def _call_llm(prompt: str, tenant_id , lang: str) -> Optional[dict]: + """ + Call LLM and parse JSON response (ref SmartResume's retry + fault-tolerance strategy). + + Retry mechanism: + - Retry up to _LLM_MAX_RETRIES times + - On retry, increase temperature and randomize seed for output diversity + - Use json_repair on JSON parse failure + + Args: + prompt: User prompt + lang: Language + Returns: + Parsed dictionary, or None on failure + + """ + try: + from api.db.services.llm_service import LLMBundle + from common.constants import LLMType + + llm = LLMBundle(tenant_id, LLMType.CHAT, lang=lang) + + for attempt in range(_LLM_MAX_RETRIES + 1): + try: + # Increase temperature on retry for diversity (ref SmartResume) + temperature = 0.1 if attempt == 0 else 1.0 + gen_conf = {"temperature": temperature, "max_tokens": 2048} + if attempt > 0: + gen_conf["seed"] = random.randint(0, 1000000) + + response = llm.chat( + system=get_system_prompt(lang), + history=[{"role": "user", "content": prompt}], + gen_conf=gen_conf, + ) + cleaned = _clean_llm_json_response(response) + return _parse_json_with_repair(cleaned) + + except json.JSONDecodeError as e: + if attempt < _LLM_MAX_RETRIES: + logger.info(f"LLM JSON parse failed (attempt {attempt + 1}), retrying: {e}") + continue + else: + logger.warning(f"LLM JSON parse failed (retries exhausted): {e}") + return None + + except Exception as e: + logger.warning(f"LLM call failed: {e}") + return None + + +def _normalize_for_comparison(text: str) -> str: + """ + Normalize text for comparison (ref SmartResume's _normalize_for_comparison). + + Unify fullwidth/halfwidth, remove whitespace, Unicode normalization, + so that "阿里巴巴" and "阿 里 巴 巴" can match. + + Args: + text: Original text + Returns: + Normalized text + """ + if not text: + return "" + # Unicode NFKC normalization (fullwidth to halfwidth, etc.) + text = unicodedata.normalize("NFKC", text) + # Remove all whitespace characters + text = re.sub(r'\s+', '', text) + return text.lower() + +def _calc_single_exp_years(start_str: str, end_str: str) -> float: + """ + Calculate years for a single experience entry. + + Args: + start_str: Start date string + end_str: End date string ("至今" etc. means current) + Returns: + Years (float, 1 decimal place), returns 0 if unable to calculate + """ + from datetime import datetime + + start_str = str(start_str).strip() + end_str = str(end_str).strip() + if not start_str: + return 0 + + start_date = _parse_date_str(start_str) + if not start_date: + return 0 + + if end_str in ("至今", "现在", "present", "Present", "now", "Now", ""): + end_date = datetime.now() + else: + end_date = _parse_date_str(end_str) + if not end_date: + end_date = datetime.now() + + months = (end_date.year - start_date.year) * 12 + (end_date.month - start_date.month) + if months <= 0: + return 0 + return round(months / 12.0, 1) + + +def _calculate_work_years(experiences: list[dict]) -> float: + """ + Calculate total work years based on start/end dates of each work experience. + + Args: + experiences: List of work experiences, each containing start_date, end_date fields + Returns: + Total work years (float), returns 0 if unable to calculate + """ + total = 0.0 + for exp in experiences: + total += _calc_single_exp_years( + exp.get("start_date", ""), exp.get("end_date", "") + ) + return round(total, 1) + + +def _parse_date_str(date_str: str) -> Optional[datetime.datetime]: + """ + Parse date string, supporting multiple common formats. + + Supported formats: + - 2024.1 / 2024.01 + - 2024-1 / 2024-01 + - 2024/1 / 2024/01 + - 2024年1月 + - 2024 (year only, defaults to January) + + Args: + date_str: Date string + Returns: + datetime object, or None on parse failure + """ + from datetime import datetime + + date_str = date_str.strip() + # Try matching year.month / year-month / year/month / year(nian)month(yue) formats + patterns = [ + (r"((?:19|20)\d{2})[.\-/年](\d{1,2})", "%Y-%m"), + (r"^((?:19|20)\d{2})$", "%Y"), + ] + for pattern, _ in patterns: + m = re.search(pattern, date_str) + if m: + try: + year = int(m.group(1)) + month = int(m.group(2)) if len(m.groups()) > 1 else 1 + # Month range validation + if month < 1 or month > 12: + month = 1 + return datetime(year, month, 1) + except (ValueError, IndexError): + continue + return None + + + + +def _extract_description_from_range( + index_range: list, lines: list[str], + company: str = "", position: str = "" +) -> str: + """ + Extract description from original text by index range (ref SmartResume's _extract_description_from_range). + + Key improvement: + - Filter out lines containing both company name and position title (avoid mixing header lines into description) + - Boundary safety checks + + Args: + index_range: [start_line_number, end_line_number] + lines: List of original line texts + company: Company name (used to filter header lines) + position: Position title (used to filter header lines) + Returns: + Extracted description text + """ + if not index_range or len(index_range) != 2: + return "" + + start_idx, end_idx = int(index_range[0]), int(index_range[1]) + + # Boundary safety check + if start_idx < 0 or end_idx >= len(lines) or start_idx > end_idx: + return "" + + extracted_lines = lines[start_idx:end_idx + 1] + + # Filter out lines containing both company name and position title (ref SmartResume) + if company or position: + norm_company = _normalize_for_comparison(company) + norm_position = _normalize_for_comparison(position) + filtered = [] + for line in extracted_lines: + norm_line = _normalize_for_comparison(line) + # If a line contains both company name and position title, it's likely a header line, skip + if norm_company and norm_position and norm_company in norm_line and norm_position in norm_line: + continue + # If a line exactly equals company name or position title, also skip + if norm_line == norm_company or norm_line == norm_position: + continue + filtered.append(line) + extracted_lines = filtered + + if not extracted_lines: + return "" + + return "\n".join(line.strip() for line in extracted_lines if line.strip()) + + +def _extract_basic_info(indexed_text: str, tenant_id , lang: str) -> Optional[dict]: + """Extract basic info (subtask 1). + + Basic info is usually at the beginning of the resume, first 8000 chars suffice. + """ + prompt = get_basic_info_prompt(lang).format(indexed_text=indexed_text[:8000]) + return _call_llm(prompt,tenant_id, lang) + + +def _extract_work_experience(indexed_text: str, tenant_id , lang: str) -> Optional[dict]: + """Extract work experience (subtask 2, using index pointers). + + Work experience may span the middle-to-end of the resume, use full text to avoid truncation. + """ + prompt = get_work_exp_prompt(lang).format(indexed_text=indexed_text) + return _call_llm(prompt, tenant_id , lang) + + +def _extract_education(indexed_text: str, tenant_id , lang: str) -> Optional[dict]: + """Extract education background (subtask 3). + + Education is usually at the end of the resume, must use full text to avoid truncation. + Resume text is generally under 30K chars, within LLM context window. + """ + prompt = get_education_prompt(lang).format(indexed_text=indexed_text) + return _call_llm(prompt,tenant_id, lang) + + +def _extract_project_experience(indexed_text: str, tenant_id , lang: str) -> Optional[dict]: + """Extract project experience (subtask 4, using index pointers). + + Project experience may span the middle-to-end of the resume, use full text to avoid truncation. + """ + prompt = get_project_exp_prompt(lang).format(indexed_text=indexed_text) + return _call_llm(prompt, tenant_id , lang) + + +def parse_with_llm(indexed_text: str, lines: list[str], tenant_id , lang: str) -> Optional[dict]: + """ + Extract resume info using parallel task decomposition strategy (ref SmartResume Section 3.2). + + Decomposes extraction into four independent subtasks executed in parallel: + 1. Basic info (name, phone, skills, self-evaluation, etc.) + 2. Work experience (company, position, description line ranges) + 3. Education background (school, major, degree) + 4. Project experience (project name, role, description line ranges) + + Args: + indexed_text: Line-indexed resume text + lines: List of original line texts (for index-based extraction) + lang: Language + Returns: + Merged structured resume dictionary, or None on failure + """ + try: + # Execute four subtasks in parallel + with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: + future_basic = executor.submit(_extract_basic_info, indexed_text, tenant_id , lang) + future_work = executor.submit(_extract_work_experience, indexed_text, tenant_id , lang) + future_edu = executor.submit(_extract_education, indexed_text, tenant_id, lang) + future_project = executor.submit(_extract_project_experience, indexed_text, tenant_id , lang) + + basic_info = future_basic.result(timeout=60) + work_exp = future_work.result(timeout=60) + education = future_edu.result(timeout=60) + project_exp = future_project.result(timeout=60) + + # Merge results + resume = {} + + # Merge basic info + if basic_info: + resume.update(basic_info) + logger.info(f"Basic info extraction succeeded: {len(basic_info)} fields") + + # Process work experience (index pointer extraction) + if work_exp and "workExperience" in work_exp: + experiences = work_exp["workExperience"] + companies = [] + positions = [] + work_descs = [] + # Save detailed info for each experience (dates, years) for chunk generation + work_exp_details = [] + for exp in experiences: + company = exp.get("company", "") + position = exp.get("position", "") + start_date = exp.get("start_date", "") + end_date = exp.get("end_date", "") + # Calculate years for this experience entry + years = _calc_single_exp_years(start_date, end_date) + if company: + companies.append(company) + if position: + positions.append(position) + # Save detailed info for each experience entry + work_exp_details.append({ + "company": company, + "position": position, + "start_date": start_date, + "end_date": end_date, + "years": years, + }) + # Index pointer mechanism: extract description from original text by line range + # Use _extract_description_from_range to filter header lines (ref SmartResume) + desc_lines = exp.get("desc_lines", []) + if isinstance(desc_lines, list) and len(desc_lines) == 2: + desc = _extract_description_from_range( + desc_lines, lines, company=company, position=position + ) + if desc.strip(): + work_descs.append(desc.strip()) + + if companies: + resume["corp_nm_tks"] = companies + resume["corporation_name_tks"] = companies[0] + if positions: + resume["position_name_tks"] = positions + if work_descs: + resume["work_desc_tks"] = work_descs + # Save experience details for _build_chunk_document + if work_exp_details: + resume["_work_exp_details"] = work_exp_details + # Calculate total work years from each experience's dates (overrides LLM's guess in basic info) + calculated_years = _calculate_work_years(experiences) + if calculated_years > 0: + resume["work_exp_flt"] = calculated_years + logger.info(f"Work experience extraction succeeded: {len(experiences)} entries, calculated total years: {calculated_years}") + + # Process education background + if education and "education" in education: + edu_list = education["education"] + schools = [] + majors = [] + degrees = [] + for edu in edu_list: + if edu.get("school"): + schools.append(edu["school"]) + if edu.get("major"): + majors.append(edu["major"]) + if edu.get("degree"): + degrees.append(edu["degree"]) + # Extract graduation year + end_date = edu.get("end_date", "") + if end_date and not resume.get("edu_end_int"): + year_match = re.search(r"(19|20)\d{2}", str(end_date)) + if year_match: + resume["edu_end_int"] = int(year_match.group(0)) + + if schools: + resume["school_name_tks"] = schools + resume["first_school_name_tks"] = schools[-1] # Earliest school is usually last + if majors: + resume["major_tks"] = majors + resume["first_major_tks"] = majors[-1] + if degrees: + resume["degree_kwd"] = degrees + # Infer highest degree (supports both Chinese and English degree names) + degree_rank = { + "博士": 5, "PhD": 5, "Doctor": 5, + "硕士": 4, "Master": 4, "MBA": 4, "EMBA": 4, "MPA": 4, + "本科": 3, "Bachelor": 3, + "大专": 2, "专科": 2, "Associate": 2, "Diploma": 2, + "高中": 1, "High School": 1, + } + highest = max(degrees, key=lambda d: degree_rank.get(d, 0), default="") + if highest: + resume["highest_degree_kwd"] = highest + resume["first_degree_kwd"] = degrees[-1] if degrees else "" + logger.info(f"Education extraction succeeded: {len(edu_list)} entries") + + # Process project experience (index pointer extraction, similar to work experience) + if project_exp and "projectExperience" in project_exp: + projects = project_exp["projectExperience"] + project_names = [] + project_descs = [] + for proj in projects: + name = proj.get("project_name", "") + if name: + project_names.append(name) + # Index pointer mechanism: extract project description from original text by line range + desc_lines = proj.get("desc_lines", []) + if isinstance(desc_lines, list) and len(desc_lines) == 2: + desc = _extract_description_from_range( + desc_lines, lines, company=name, position=proj.get("role", "") + ) + if desc.strip(): + project_descs.append(desc.strip()) + + if project_names: + resume["project_tks"] = project_names + if project_descs: + resume["project_desc_tks"] = project_descs + logger.info(f"Project experience extraction succeeded: {len(projects)} entries") + + if not resume.get("name_kwd"): + resume["name_kwd"] = "Unknown" if _is_english(lang) else "未知" + + return resume if len(resume) > 2 else None + + except concurrent.futures.TimeoutError: + logger.warning("LLM parallel extraction timed out") + return None + except Exception as e: + logger.warning(f"LLM parallel extraction failed: {e}") + return None + + +# ==================== Phase 3: Regex Fallback Parsing ==================== + + + +def parse_with_regex(text: str, lang: str = "Chinese") -> dict: + """ + Parse resume text using regex (fallback strategy) + + When LLM parsing fails, use regex to extract basic structured info from text. + + Args: + text: Resume text content (without line number index) + lang: Language parameter, default "Chinese" + Returns: + Structured resume info dictionary + """ + resume: dict = {} + lines = [line.strip() for line in text.split("\n") if line.strip()] + + # --- Extract Name --- + if _is_english(lang): + # English resume: extract from "Name: XXX" format + for line in lines[:30]: + name_match = re.search(r'(?:Name|Full\s*Name)\s*[::]\s*([A-Za-z][A-Za-z\s\-\.]{1,40})', line, re.IGNORECASE) + if name_match: + resume["name_kwd"] = name_match.group(1).strip() + break + # English resume strategy 2: first line if short text without digits, may be a name + if "name_kwd" not in resume and lines: + first = lines[0].strip() + if len(first) <= 40 and not re.search(r"\d", first) and re.match(r'^[A-Za-z][A-Za-z\s\-\.]+$', first): + resume["name_kwd"] = first + else: + # Chinese resume: extract from "姓名:XXX" format + for line in lines[:30]: + name_match = re.search(r'姓\s*名\s*[::]\s*([\u4e00-\u9fa5]{2,4})', line) + if name_match: + resume["name_kwd"] = name_match.group(1) + break + + # Strategy 2: search first 20 lines for standalone Chinese names (2-4 chars), excluding common title words + if "name_kwd" not in resume: + title_words = { + "个人", "简历", "求职", "应聘", "基本", "信息", "概述", "简介", + "教育", "工作", "经历", "经验", "技能", "项目", "自我", "评价", + "专业", "技术", "证书", "语言", "能力", "培训", "荣誉", "奖项", + } + for line in lines[:20]: + if any(w in line for w in title_words): + continue + if re.search(r'[::]', line) and len(line) > 6: + continue + cleaned = re.sub(r"^[A-Za-z_\-\d\s]+\s+", "", line) + cleaned = re.sub(r"\s+[A-Za-z_\-\d\s]+$", "", cleaned).strip() + if 2 <= len(cleaned) <= 4 and re.match(r"^[\u4e00-\u9fa5]{2,4}$", cleaned): + resume["name_kwd"] = cleaned + break + + # Strategy 3: first line if short without digits, may be a name + if "name_kwd" not in resume and lines: + first = lines[0].strip() + if len(first) <= 10 and not re.search(r"\d", first): + cn_part = re.findall(r'[\u4e00-\u9fa5]+', first) + if cn_part and 2 <= len(cn_part[0]) <= 4: + resume["name_kwd"] = cn_part[0] + + # --- Extract Phone Number --- + phones = re.findall(r"1[3-9]\d{9}", text) + if phones: + resume["phone_kwd"] = phones[0] + + # --- Extract Email --- + emails = re.findall(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}", text) + if emails: + resume["email_tks"] = emails[0] + + # --- Extract Gender --- + if _is_english(lang): + # English resume: extract from "Gender: Male/Female" format + gender_label = re.search(r'(?:Gender|Sex)\s*[::]\s*(Male|Female|M|F)', text, re.IGNORECASE) + if gender_label: + raw = gender_label.group(1).strip().upper() + resume["gender_kwd"] = "Male" if raw in ("M", "MALE") else "Female" + else: + gender_match = re.search(r'\b(Male|Female)\b', text[:500], re.IGNORECASE) + if gender_match: + resume["gender_kwd"] = gender_match.group(1).capitalize() + else: + # Chinese resume: extract from "性别:男/女" format + gender_label = re.search(r'性\s*别\s*[::]\s*(男|女)', text) + if gender_label: + resume["gender_kwd"] = gender_label.group(1) + else: + gender_match = re.search(r"(男|女)", text[:500]) + if gender_match: + resume["gender_kwd"] = gender_match.group(1) + + # --- Extract Age --- + if _is_english(lang): + # English resume: match "25 years old" or "Age: 25" + age_match = re.search(r'(?:Age)\s*[::]\s*(\d{1,2})', text, re.IGNORECASE) + if not age_match: + age_match = re.search(r'(\d{1,2})\s*years?\s*old', text, re.IGNORECASE) + if age_match: + resume["age_int"] = int(age_match.group(1)) + else: + # Chinese resume: match "25岁" + age_match = re.search(r"(\d{1,2})\s*岁", text) + if age_match: + resume["age_int"] = int(age_match.group(1)) + + # --- Extract Date of Birth --- + if _is_english(lang): + # English resume: match "1990-01-15" or "Jan 15, 1990" etc. + birth_match = re.search(r'(?:Birth|DOB|Date\s*of\s*Birth)\s*[::]\s*(.{6,20})', text, re.IGNORECASE) + if birth_match: + resume["birth_dt"] = birth_match.group(1).strip() + else: + birth_match = re.search(r"(19|20)\d{2}[-/]\d{1,2}[-/]\d{1,2}", text) + if birth_match: + resume["birth_dt"] = birth_match.group(0) + else: + # Chinese resume: match "1990年1月15日" or "1990-01-15" + birth_match = re.search(r"(19|20)\d{2}[年/-]\d{1,2}[月/-]\d{1,2}", text) + if birth_match: + resume["birth_dt"] = birth_match.group(0) + + # --- Extract Education Level --- + degree_keywords_zh = ["博士", "硕士", "本科", "大专", "专科", "高中", "MBA", "EMBA", "MPA"] + degree_keywords_en = ["PhD", "Master", "Bachelor", "Associate", "Diploma", "High School", + "MBA", "EMBA", "MPA", "Doctor"] + degree_keywords = degree_keywords_en if _is_english(lang) else degree_keywords_zh + found_degrees = [d for d in degree_keywords if d in text] + if found_degrees: + resume["degree_kwd"] = found_degrees + + # --- Extract School --- + if _is_english(lang): + # English resume: match "University/College/Institute/School" keywords + schools = re.findall( + r'([A-Z][A-Za-z\s\-&]{2,40}(?:University|College|Institute|School|Academy))', + text + ) + # Remove extra whitespace + schools = [re.sub(r'\s+', ' ', s).strip() for s in schools] + else: + # Chinese resume: match "XX大学/学院/职业技术学院" + schools = re.findall(r"[\u4e00-\u9fa5]{2,15}(?:大学|学院|职业技术学院)", text) + if schools: + resume["school_name_tks"] = list(set(schools)) + resume["first_school_name_tks"] = schools[0] + + # --- Extract Major --- + if _is_english(lang): + # English resume: match "Major: XXX" / "Field of Study: XXX" / "Specialization: XXX" + majors = re.findall( + r'(?:Major|Field\s*of\s*Study|Specialization|Concentration)\s*[::]\s*([A-Za-z\s\-&,]{2,40})', + text, re.IGNORECASE + ) + majors = [m.strip() for m in majors if m.strip()] + else: + # Chinese resume: match "专业:XXX" + majors = re.findall(r"专业[::]\s*([\u4e00-\u9fa5]{2,20})", text) + if majors: + resume["major_tks"] = majors + resume["first_major_tks"] = majors[0] + + # --- Extract Company Names --- + if _is_english(lang): + # English resume: match common company suffixes + en_company_patterns = [ + r'([A-Z][A-Za-z\s\-&,\.]{2,40}(?:Inc\.|Corp\.|Ltd\.|LLC|Co\.|Company|Group|Technologies|Technology|Solutions|Consulting|Services|Bank))', + ] + companies = [] + for pattern in en_company_patterns: + companies.extend(re.findall(pattern, text)) + companies = [re.sub(r'\s+', ' ', c).strip() for c in companies] + else: + # Chinese resume: match "XX有限公司" format + company_patterns = [ + r"[\u4e00-\u9fa5]{2,20}[((][\u4e00-\u9fa5]{2,10}[))](?:科技|信息技术|网络科技)?(?:股份)?有限公司", + r"[\u4e00-\u9fa5]{4,20}(?:科技|信息技术|网络科技|银行)?(?:股份)?有限公司", + ] + companies = [] + for pattern in company_patterns: + companies.extend(re.findall(pattern, text)) + + unique_companies = [] + seen = set() + # Filter verb list (bilingual) + filter_verbs = ( + ["completed", "conducted", "implemented", "responsible", "participated", "developed"] + if _is_english(lang) + else ["完成", "进行", "实施", "负责", "参与", "开发"] + ) + min_len = 3 if _is_english(lang) else 6 + for c in companies: + if len(c) < min_len or any(v in c.lower() for v in filter_verbs) or c in seen: + continue + is_sub = False + for existing in list(unique_companies): + if c in existing: + is_sub = True + break + if existing in c: + unique_companies.remove(existing) + seen.discard(existing) + if not is_sub: + unique_companies.append(c) + seen.add(c) + + if unique_companies: + resume["corp_nm_tks"] = unique_companies + resume["corporation_name_tks"] = unique_companies[0] + + # --- Extract Position (improved: context constraints to reduce noise) --- + if _is_english(lang): + # English resume: Strategy 1 - extract from "Title: XXX" / "Position: XXX" / "Role: XXX" format + position_label_matches = re.findall( + r'(?:Title|Position|Role|Job\s*Title)\s*[::]\s*([A-Za-z\s\-/&]{2,30})', + text, re.IGNORECASE + ) + positions = [p.strip() for p in position_label_matches if p.strip()] + + # English resume: Strategy 2 - match common position suffix keywords + en_position_suffixes = [ + "Engineer", "Manager", "Director", "Supervisor", "Specialist", + "Designer", "Consultant", "Assistant", "Architect", "Analyst", + "Developer", "Lead", "Officer", "Coordinator", "Administrator", + "Intern", "VP", "President", + ] + for line in lines: + if len(line) > 60: + continue # Skip overly long lines (usually description text) + for suffix in en_position_suffixes: + match = re.search(rf'([A-Za-z\s\-]{{1,25}}{suffix})\b', line, re.IGNORECASE) + if match: + pos = match.group(1).strip() + # Filter out matches that are clearly not positions (contain verbs) + filter_pos_verbs = ["responsible", "participated", "completed", "developed", "designed"] + if not any(v in pos.lower() for v in filter_pos_verbs) and len(pos) > 3: + positions.append(pos) + else: + # Chinese resume: Strategy 1 - extract from "职位/岗位:XXX" format + position_label_matches = re.findall( + r'(?:职位|岗位|职务|职称|担任)\s*[::]\s*([\u4e00-\u9fa5a-zA-Z]{2,15})', + text + ) + positions = list(position_label_matches) + + # Chinese resume: Strategy 2 - extract from work experience paragraphs (company name followed by position) + for line in lines: + pos_match = re.search( + r'(?:有限公司|集团|银行)\s+([\u4e00-\u9fa5]{2,8}(?:工程师|经理|总监|主管|专员|设计师|顾问|助理|架构师|分析师|运营|产品))', + line + ) + if pos_match: + positions.append(pos_match.group(1)) + + # Chinese resume: Strategy 3 - position keywords in standalone lines (length-limited to avoid matching description text) + position_suffixes = ["工程师", "经理", "总监", "主管", "专员", "设计师", "顾问", + "助理", "架构师", "分析师", "开发者", "负责人"] + for line in lines: + if len(line) > 20: + continue # Skip overly long lines + for suffix in position_suffixes: + match = re.search(rf'([\u4e00-\u9fa5]{{1,6}}{suffix})', line) + if match: + pos = match.group(1) + if not any(v in pos for v in ["负责", "参与", "完成", "开发了", "设计了"]): + positions.append(pos) + + if positions: + # Deduplicate while preserving order + seen_pos = set() + unique_positions = [] + for p in positions: + if p not in seen_pos: + seen_pos.add(p) + unique_positions.append(p) + resume["position_name_tks"] = unique_positions + + # --- Extract Years of Experience --- + if _is_english(lang): + # English resume: match "5 years experience" / "5+ years of experience" + work_exp_match = re.search(r'(\d+)\+?\s*years?\s*(?:of\s*)?(?:experience|work)', text, re.IGNORECASE) + if work_exp_match: + resume["work_exp_flt"] = float(work_exp_match.group(1)) + else: + # Chinese resume: match "5年...经验" + work_exp_match = re.search(r"(\d+)\s*年.*?经验", text) + if work_exp_match: + resume["work_exp_flt"] = float(work_exp_match.group(1)) + + # --- Extract Graduation Year --- + if _is_english(lang): + # English resume: match "Graduated 2020" / "Graduation: 2020" / "Class of 2020" + grad_match = re.search(r'(?:Graduat(?:ed|ion)|Class\s*of)\s*[::]?\s*((?:19|20)\d{2})', text, re.IGNORECASE) + if grad_match: + resume["edu_end_int"] = int(grad_match.group(1)) + else: + # Chinese resume: match "2020年...毕业" + grad_match = re.search(r"((?:19|20)\d{2})\s*年.*?毕业", text) + if grad_match: + resume["edu_end_int"] = int(grad_match.group(1)) + + if "name_kwd" not in resume: + resume["name_kwd"] = "Unknown" if _is_english(lang) else "未知" + + return resume + + + +# ==================== Phase 4: Post-processing Pipeline ==================== + + +def _postprocess_resume(resume: dict, lines: list[str], lang: str = "Chinese") -> dict: + """ + Four-phase post-processing pipeline (ref: SmartResume Section 3.2.3) + + 1. Source text validation: check if key fields can be found in the original text + 2. Domain normalization: standardize date formats, clean company name suffix noise + 3. Contextual deduplication: remove duplicate company/school entries + 4. Field completion: ensure all required fields exist + + Args: + resume: Raw resume dictionary extracted by LLM + lines: Original line text list (for source text validation) + lang: Language parameter, default "Chinese" + Returns: + Post-processed resume dictionary + """ + _en = _is_english(lang) + full_text = "\n".join(lines) if lines else "" + # Normalize full text for comparison (ref: SmartResume _validate_fields_in_text) + norm_full_text = _normalize_for_comparison(full_text) + + # --- Phase 1: Source text validation (prune hallucinations, ref: SmartResume _validate_fields_in_text) --- + # Name validation: clear if not found in source text (SmartResume strategy: discard hallucinated fields) + _unknown_names = ("未知", "Unknown") + if resume.get("name_kwd") and resume["name_kwd"] not in _unknown_names: + norm_name = _normalize_for_comparison(resume["name_kwd"]) + if norm_full_text and norm_name and norm_name not in norm_full_text: + logger.warning(f"Name '{resume['name_kwd']}' not found in source text, classified as LLM hallucination, cleared") + resume["name_kwd"] = "" + + # Validate company names (strict matching: full name must appear in source text, no longer using loose 4-char prefix matching) + if resume.get("corp_nm_tks") and norm_full_text: + verified_companies = [] + for company in resume["corp_nm_tks"]: + norm_company = _normalize_for_comparison(company) + if norm_company and norm_company in norm_full_text: + verified_companies.append(company) + else: + logger.debug(f"Company '{company}' not found in source text, filtered out") + # Update even if all filtered out (SmartResume strategy: prefer missing over wrong) + resume["corp_nm_tks"] = verified_companies + if verified_companies: + resume["corporation_name_tks"] = verified_companies[0] + else: + resume["corporation_name_tks"] = "" + + # Validate school names (ref: SmartResume _validate_fields_in_text) + if resume.get("school_name_tks") and norm_full_text: + verified_schools = [] + for school in resume["school_name_tks"]: + norm_school = _normalize_for_comparison(school) + if norm_school and norm_school in norm_full_text: + verified_schools.append(school) + else: + logger.debug(f"School '{school}' not found in source text, filtered out") + resume["school_name_tks"] = verified_schools + if verified_schools: + if resume.get("first_school_name_tks"): + # Ensure first_school is also in the verified list + if resume["first_school_name_tks"] not in verified_schools: + resume["first_school_name_tks"] = verified_schools[-1] + else: + resume["first_school_name_tks"] = "" + + # Validate position names + if resume.get("position_name_tks") and norm_full_text: + verified_positions = [] + for pos in resume["position_name_tks"]: + norm_pos = _normalize_for_comparison(pos) + if norm_pos and norm_pos in norm_full_text: + verified_positions.append(pos) + if verified_positions: + resume["position_name_tks"] = verified_positions + + # --- Phase 2: Domain normalization --- + # Standardize date format + if resume.get("birth_dt"): + resume["birth_dt"] = re.sub(r"[年月]", "-", str(resume["birth_dt"])).rstrip("-") + + # Clean non-digit characters from phone number (keep + sign) + if resume.get("phone_kwd"): + phone = re.sub(r"[^\d+]", "", str(resume["phone_kwd"])) + if phone: + resume["phone_kwd"] = phone + + # Standardize gender (output format determined by language parameter) + if resume.get("gender_kwd"): + gender = str(resume["gender_kwd"]).strip() + if gender in ("male", "Male", "M", "m", "男"): + resume["gender_kwd"] = "Male" if _en else "男" + elif gender in ("female", "Female", "F", "f", "女"): + resume["gender_kwd"] = "Female" if _en else "女" + + # --- Phase 3: Contextual deduplication --- + for list_field in ["corp_nm_tks", "school_name_tks", "major_tks", + "position_name_tks", "skill_tks"]: + if isinstance(resume.get(list_field), list): + # Order-preserving deduplication + seen = set() + deduped = [] + for item in resume[list_field]: + item_str = str(item).strip() + if item_str and item_str not in seen: + seen.add(item_str) + deduped.append(item_str) + resume[list_field] = deduped + + # --- Phase 4: Field completion --- + required_fields = [ + "name_kwd", "gender_kwd", "phone_kwd", "email_tks", + "position_name_tks", "school_name_tks", "major_tks", + ] + for field in required_fields: + if field not in resume: + if field.endswith("_tks"): + resume[field] = [] + elif field.endswith("_int") or field.endswith("_flt"): + resume[field] = 0 + else: + resume[field] = "" + + # Clean internal marker fields (already handled in Phase 1, this is a safety fallback) + resume.pop("_name_confidence", None) + + return resume + + +# ==================== Pipeline Orchestration & Chunk Construction ==================== + + +def parse_resume(filename: str, binary: bytes, tenant_id , lang: str = "Chinese") -> tuple[dict, list[str], list[dict]]: + """ + Resume parsing pipeline orchestration function + + Execution flow: + 1. Text extraction (dual-path fusion + layout reconstruction + line-number index) + 2. Parallel LLM structured extraction (three sub-tasks) + 3. Regex fallback parsing (when LLM fails) + 4. Four-phase post-processing + + Args: + filename: File name + binary: File binary content + lang: Language, default "Chinese" + Returns: + (resume, lines, line_positions) tuple: + - resume: Structured resume information dictionary + - lines: Original line text list (for chunk text matching and positioning) + - line_positions: Per-line coordinate info list (for writing chunk position_int fields) + """ + # Phase 1: Text extraction + indexed_text, lines, line_positions = extract_text(filename, binary) + if not indexed_text or not lines: + logger.warning(f"Text extraction returned empty: {filename}") + default_name = "Unknown" if _is_english(lang) else "未知" + return {"name_kwd": default_name}, [], [] + + # Phase 2: Parallel LLM structured extraction + resume = parse_with_llm(indexed_text, lines, tenant_id , lang) + + # Phase 3: Fallback to regex parsing when LLM fails + if not resume: + logger.info(f"LLM parsing failed, falling back to regex parsing: {filename}") + plain_text = "\n".join(lines) + resume = parse_with_regex(plain_text, lang) + + # Phase 4: Post-processing pipeline + resume = _postprocess_resume(resume, lines, lang) + + return resume, lines, line_positions + + +def _build_chunk_document(filename: str, resume: dict, + lang: str = "Chinese") -> list[dict]: + """ + Build a list of document chunks from structured resume information + + Each field generates an independent chunk containing tokenization results and metadata. + Compatible with the build_chunks flow in task_executor.py. + + Key design: Each chunk redundantly includes key identity fields (name, phone, email, etc.), + so that when any chunk is retrieved, the candidate's identity can be immediately identified. + The full resume can be fetched via doc_id to get all chunks for complete information. + + Args: + filename: File name + resume: Structured resume information dictionary + lang: Language parameter, default "Chinese" + Returns: + Document chunk list, each chunk contains content_with_weight, content_ltks, + position_int, page_num_int, top_int and other fields + """ + chunks = [] + # Get the corresponding field map version based on language parameter + field_map = get_field_map(lang) + doc = { + "docnm_kwd": filename, + "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)), + } + doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) + + # Extract key identity fields, redundantly written to each chunk + # These fields are small in size but high in information density; once retrieved, the candidate can be immediately identified + _IDENTITY_FIELDS = ("name_kwd", "phone_kwd", "email_tks", "gender_kwd", + "highest_degree_kwd", "work_exp_flt", "corporation_name_tks") + identity_meta = {} + for ik in _IDENTITY_FIELDS: + iv = resume.get(ik) + if not iv: + continue + if ik.endswith("_tks"): + identity_meta[ik] = rag_tokenizer.tokenize( + " ".join(iv) if isinstance(iv, list) else str(iv) + ) + elif ik.endswith("_kwd"): + identity_meta[ik] = iv if isinstance(iv, list) else str(iv) + elif ik.endswith("_flt"): + try: + identity_meta[ik] = float(iv) + except (ValueError, TypeError): + pass + else: + identity_meta[ik] = str(iv) + + # Build resume summary text, appended to each chunk's content to improve semantic retrieval recall + summary_parts = [] + _en = _is_english(lang) + if resume.get("name_kwd"): + summary_parts.append(f"{'Name' if _en else '姓名'}:{resume['name_kwd']}") + if resume.get("phone_kwd"): + summary_parts.append(f"{'Phone' if _en else '电话'}:{resume['phone_kwd']}") + if resume.get("corporation_name_tks"): + corp = resume["corporation_name_tks"] + summary_parts.append(f"{'Company' if _en else '公司'}:{corp if isinstance(corp, str) else ' '.join(corp)}") + if resume.get("highest_degree_kwd"): + summary_parts.append(f"{'Degree' if _en else '学历'}:{resume['highest_degree_kwd']}") + if resume.get("work_exp_flt"): + if _en: + summary_parts.append(f"Experience:{resume['work_exp_flt']}yrs") + else: + summary_parts.append(f"经验:{resume['work_exp_flt']}年") + resume_summary = " | ".join(summary_parts) if summary_parts else "" + + # List fields that need per-element splitting (each experience/project generates a separate chunk to avoid oversized merged chunks) + _SPLIT_LIST_FIELDS = {"work_desc_tks", "project_desc_tks"} + + # Basic info field set: these fields should be merged into one chunk to avoid splitting name, phone, email, etc. + _BASIC_INFO_FIELDS = { + "name_kwd", "name_pinyin_kwd", "gender_kwd", "age_int", + "phone_kwd", "email_tks", "birth_dt", "work_exp_flt", + "position_name_tks", "expect_city_names_tks", + "expect_position_name_tks", + } + + # Education field set: degree, school, major, tags, etc. should be merged into one chunk + _EDUCATION_FIELDS = { + "first_school_name_tks", "first_degree_kwd", "highest_degree_kwd", + "first_major_tks", "edu_first_fea_kwd", "degree_kwd", "major_tks", + "school_name_tks", "sch_rank_kwd", "edu_fea_kwd", "edu_end_int", + } + + # Skills & certificates field set: skills, languages, certificates are small, merge into one chunk + _SKILL_CERT_FIELDS = { + "skill_tks", "language_tks", "certificate_tks", + } + + # Work overview field set: company list, industry, most recent company merged into one chunk + _WORK_OVERVIEW_FIELDS = { + "corporation_name_tks", "corp_nm_tks", "industry_name_tks", + } + + # All merge groups: (field_set, group_title) tuple list + _MERGE_GROUPS = [ + (_BASIC_INFO_FIELDS, "Basic Info" if _en else "基本信息"), + (_EDUCATION_FIELDS, "Education" if _en else "教育背景"), + (_SKILL_CERT_FIELDS, "Skills & Certificates" if _en else "技能与证书"), + (_WORK_OVERVIEW_FIELDS, "Work Overview" if _en else "工作概况"), + ] + + # Collect all fields that need merge processing; skip them during individual iteration + _ALL_MERGED_FIELDS = set() + for fields_set, _ in _MERGE_GROUPS: + _ALL_MERGED_FIELDS.update(fields_set) + + # Merge fields by group, generating one chunk per group + for fields_set, group_title in _MERGE_GROUPS: + group_parts = [] + group_field_values = {} # Store structured values for each field, to be written into chunk + for field_key in field_map: + if field_key not in fields_set: + continue + value = resume.get(field_key) + if not value: + continue + field_desc = field_map[field_key] + if isinstance(value, list): + text_value = " ".join(str(v) for v in value if v) + else: + text_value = str(value) + if not text_value.strip(): + continue + group_parts.append(f"{field_desc}: {text_value}") + group_field_values[field_key] = value + + if not group_parts: + continue + + content = f"{group_title}\n" + "\n".join(group_parts) + if resume_summary: + content += f"\n[{resume_summary}]" + chunk = { + "content_with_weight": content, + "content_ltks": rag_tokenizer.tokenize(content), + "content_sm_ltks": rag_tokenizer.fine_grained_tokenize( + rag_tokenizer.tokenize(content) + ), + } + chunk.update(doc) + # Redundantly write identity fields + for mk, mv in identity_meta.items(): + chunk[mk] = mv + # Write each field's structured value into chunk (for structured retrieval) + for fk, fv in group_field_values.items(): + if fk.endswith("_tks"): + text_val = " ".join(str(v) for v in fv) if isinstance(fv, list) else str(fv) + chunk[fk] = rag_tokenizer.tokenize(text_val) + elif fk.endswith("_kwd"): + chunk[fk] = fv if isinstance(fv, list) else str(fv) + elif fk.endswith("_int"): + try: + chunk[fk] = int(fv) + except (ValueError, TypeError): + pass + elif fk.endswith("_flt"): + try: + chunk[fk] = float(fv) + except (ValueError, TypeError): + pass + else: + chunk[fk] = str(fv) + chunks.append(chunk) + + # Iterate over field map, generating a chunk for each non-merged field with a value + for field_key, field_desc in field_map.items(): + # Skip fields already processed in merge groups + if field_key in _ALL_MERGED_FIELDS: + continue + value = resume.get(field_key) + if not value: + continue + + # For work/project descriptions (long text lists), split into multiple chunks per element + if field_key in _SPLIT_LIST_FIELDS and isinstance(value, list): + # Get company name list to add context to each work description + corp_list = resume.get("corp_nm_tks", []) if field_key == "work_desc_tks" else [] + project_list = resume.get("project_tks", []) if field_key == "project_desc_tks" else [] + # Get detailed info for each work experience entry (time period, years) + work_details = resume.get("_work_exp_details", []) if field_key == "work_desc_tks" else [] + + for idx, item in enumerate(value): + item_text = str(item).strip() + if not item_text: + continue + + # Add company/project name prefix to each description for context + if field_key == "work_desc_tks" and idx < len(work_details): + # Use detailed info to build prefix, including company, time range, years + detail = work_details[idx] + company = detail.get("company", "") + start_d = detail.get("start_date", "") + end_d = detail.get("end_date", "") + years = detail.get("years", 0) + # Build time range text + time_parts = [] + if start_d: + time_range = f"{start_d}-{end_d}" if end_d else str(start_d) + time_parts.append(time_range) + if years > 0: + time_parts.append(f"{years}{'yrs' if _en else '年'}") + time_text = " ".join(time_parts) + if company and time_text: + content_prefix = f"{field_desc}({company} {time_text})" + elif company: + content_prefix = f"{field_desc}({company})" + else: + content_prefix = f"{field_desc}({'#' if _en else '第'}{idx + 1}{'' if _en else '段'})" + elif field_key == "work_desc_tks" and idx < len(corp_list): + content_prefix = f"{field_desc}({corp_list[idx]})" + elif field_key == "project_desc_tks" and idx < len(project_list): + content_prefix = f"{field_desc}({project_list[idx]})" + else: + content_prefix = f"{field_desc}({'#' if _en else '第'}{idx + 1}{'' if _en else '段'})" + + if resume_summary: + content = f"{content_prefix}: {item_text}\n[{resume_summary}]" + else: + content = f"{content_prefix}: {item_text}" + + chunk = { + "content_with_weight": content, + "content_ltks": rag_tokenizer.tokenize(content), + "content_sm_ltks": rag_tokenizer.fine_grained_tokenize( + rag_tokenizer.tokenize(content) + ), + } + chunk.update(doc) + + # Redundantly write identity fields + for mk, mv in identity_meta.items(): + if mk != field_key: + chunk[mk] = mv + + # Tokenization result for current segment + chunk[field_key] = rag_tokenizer.tokenize(item_text) + chunks.append(chunk) + continue + + # Merge list values into text + if isinstance(value, list): + text_value = " ".join(str(v) for v in value if v) + else: + text_value = str(value) + + if not text_value.strip(): + continue + + # Build chunk content: "field_desc: field_value", append summary for semantic association + if resume_summary and field_key not in ("name_kwd", "phone_kwd"): + content = f"{field_desc}: {text_value}\n[{resume_summary}]" + else: + content = f"{field_desc}: {text_value}" + + chunk = { + "content_with_weight": content, + "content_ltks": rag_tokenizer.tokenize(content), + "content_sm_ltks": rag_tokenizer.fine_grained_tokenize( + rag_tokenizer.tokenize(content) + ), + } + chunk.update(doc) + + # Redundantly write identity fields (do not overwrite the current field's own value) + for mk, mv in identity_meta.items(): + if mk != field_key: + chunk[mk] = mv + + # Write resume field value into the chunk's corresponding field (for structured retrieval) + if field_key.endswith("_tks"): + chunk[field_key] = rag_tokenizer.tokenize(text_value) + elif field_key.endswith("_kwd"): + if isinstance(value, list): + chunk[field_key] = value + else: + chunk[field_key] = text_value + elif field_key.endswith("_int"): + try: + chunk[field_key] = int(value) + except (ValueError, TypeError): + pass + elif field_key.endswith("_flt"): + try: + chunk[field_key] = float(value) + except (ValueError, TypeError): + pass + else: + chunk[field_key] = text_value + + chunks.append(chunk) + + # If no chunks were generated, create at least one chunk containing the name + if not chunks: + name = resume.get("name_kwd", "Unknown" if _en else "未知") + content = f"{'Name' if _en else '姓名'}: {name}" + chunk = { + "content_with_weight": content, + "content_ltks": rag_tokenizer.tokenize(content), + "content_sm_ltks": rag_tokenizer.fine_grained_tokenize( + rag_tokenizer.tokenize(content) + ), + } + chunk.update(doc) + chunks.append(chunk) + + # Write coordinate info to each chunk (position_int, page_num_int, top_int) + # + # Resume chunks are split by semantic fields (basic info, education, work description, etc.), + # not by PDF physical regions. Field values may be scattered across multiple locations in the PDF, + # and using text matching to reverse-lookup coordinates would cause disordered sorting. + # + # Therefore, assign incrementing coordinates based on chunk generation order (i.e., semantic logical order), + # ensuring display order: basic info -> education -> skills/certs -> work overview -> work desc -> project desc... + # + # add_positions input format: [(page, left, right, top, bottom), ...] + # - page starts from 0, function internally stores +1 + # - task_executor sorts by page_num_int and top_int (page first, then Y coordinate) + from rag.nlp import add_positions + + for i, ck in enumerate(chunks): + # All chunks placed on page=0, top increments by index to ensure logical ordering + add_positions(ck, [[0, 0, 0, i, i]]) + + return chunks + +def _blackout_text_regions(image: "np.ndarray", meta_blocks: list[dict], page_idx: int, + pdf_to_img_scale: float) -> "np.ndarray": + """ + Black out metadata-extracted text regions on the page image to prevent OCR duplication. + + Ref: SmartResume blackout strategy — extract metadata text first, black out those regions, + then run OCR on the blacked-out image so it only recognizes content metadata missed. + More reliable than IoU-based deduplication. + + Args: + image: Page image (numpy array) + meta_blocks: Text blocks from metadata extraction + page_idx: Current page number + pdf_to_img_scale: Scale factor from PDF coordinates to image coordinates + Returns: + Image with text regions blacked out + """ + import cv2 + blacked = image.copy() + page_blocks = [b for b in meta_blocks if b.get("page") == page_idx] + # Draw filled black rectangles over each metadata text block + padding = 2 # Extra pixels to ensure full coverage + for b in page_blocks: + x0 = int(b["x0"] * pdf_to_img_scale) - padding + y0 = int(b["top"] * pdf_to_img_scale) - padding + x1 = int(b["x1"] * pdf_to_img_scale) + padding + y1 = int(b["bottom"] * pdf_to_img_scale) + padding + # Clamp to image boundaries + x0 = max(0, x0) + y0 = max(0, y0) + x1 = min(blacked.shape[1], x1) + y1 = min(blacked.shape[0], y1) + cv2.rectangle(blacked, (x0, y0), (x1, y1), (0, 0, 0), -1) + return blacked + + + +def chunk(filename, binary, tenant_id, from_page=0, to_page=100000, + lang="Chinese", callback=None, **kwargs): + """ + Resume parsing entry function (compatible with task_executor.py) + + This function is the entry point registered as FACTORY[ParserType.RESUME.value], + with a signature consistent with other parsers (e.g., naive.chunk). + + Args: + filename: File name + binary: File binary content + from_page: Start page number (not used in resume parsing) + to_page: End page number (not used in resume parsing) + lang: Language, default "Chinese" + callback: Progress callback function, accepts (progress, message) parameters + **kwargs: Other parameters (parser_config, kb_id, tenant_id, etc.) + Returns: + Document chunk list + """ + if callback is None: + def callback(prog, msg): return None + + try: + callback(0.1, "Starting resume parsing...") + + # Parse resume + resume, lines, line_positions = parse_resume(filename, binary, tenant_id , lang) + callback(0.6, "Resume structured extraction complete") + + # Build document chunks (with coordinate info) + chunks = _build_chunk_document(filename, resume, lang) + callback(0.9, f"Document chunk construction complete, {len(chunks)} chunks total") + + callback(1.0, "Resume parsing complete") + return chunks + + except Exception as e: + logger.exception(f"Resume parsing exception: {filename}") + callback(-1, f"Resume parsing failed: {str(e)}") + return [] + + +def _resort_page_with_layout(page_blocks: list[dict], layout_regions: list[dict]) -> list[dict]: + if not page_blocks: + return [] + + if not layout_regions: + return sorted(page_blocks, key=lambda b: ( + (b.get("top", 0) + b.get("bottom", 0)) / 2, + (b.get("x0", 0) + b.get("x1", 0)) / 2, + )) + + type_groups: dict[str, list] = {} + for lt in layout_regions: + tp = lt.get("type", "") + type_groups.setdefault(tp, []).append(lt) + entries = [] + for tp, group in type_groups.items(): + for idx, lt in enumerate(group): + key = f"{tp}-{idx}" + x0, x1 = lt.get("x0", 0), lt.get("x1", 0) + top, bottom = lt.get("top", 0), lt.get("bottom", 0) + entries.append({ + "key": key, "type": tp, + "x0": x0, "top": top, "x1": x1, "bottom": bottom, + "cy": (top + bottom) / 2, "cx": (x0 + x1) / 2, + }) + + for b in page_blocks: + if b.get("layoutno"): + continue + b_cx = (b.get("x0", 0) + b.get("x1", 0)) / 2 + b_cy = (b.get("top", 0) + b.get("bottom", 0)) / 2 + for entry in entries: + if (entry["x0"] <= b_cx <= entry["x1"] + and entry["top"] <= b_cy <= entry["bottom"]): + b["layoutno"] = entry["key"] + b["layout_type"] = entry["type"] + break + + for entry in entries: + layout_key = entry["key"] + layout_area = (entry["x1"] - entry["x0"]) * (entry["bottom"] - entry["top"]) + if layout_area <= 0: + continue + layout_blocks = [b for b in page_blocks if b.get("layoutno") == layout_key] + if not layout_blocks: + continue + text_total_area = sum( + (b.get("x1", 0) - b.get("x0", 0)) * (b.get("bottom", 0) - b.get("top", 0)) + for b in layout_blocks + ) + if text_total_area / layout_area < 0.075: + for b in layout_blocks: + b["layoutno"] = "" + b["layout_type"] = "" + + entry_map = {e["key"]: e for e in entries} + for b in page_blocks: + b_cx = (b.get("x0", 0) + b.get("x1", 0)) / 2 + b_cy = (b.get("top", 0) + b.get("bottom", 0)) / 2 + b["_x_center"] = b_cx + b["_y_center"] = b_cy + layoutno = b.get("layoutno", "") + if layoutno and layoutno in entry_map: + b["_lx_center"] = entry_map[layoutno]["cx"] + b["_ly_center"] = entry_map[layoutno]["cy"] + else: + b["_lx_center"] = b_cx + b["_ly_center"] = b_cy + + active_keys = {b.get("layoutno") for b in page_blocks if b.get("layoutno")} + active_entries = [e for e in entries if e["key"] in active_keys] + + for b in page_blocks: + if b.get("layoutno"): + continue + if not active_entries: + continue + b_cx, b_cy = b["_x_center"], b["_y_center"] + min_dist = float("inf") + best_cx, best_cy = b_cx, b_cy + for ae in active_entries: + lx1, ly1, lx2, ly2 = ae["x0"], ae["top"], ae["x1"], ae["bottom"] + if b_cy < ly1: + dy = ly1 - b_cy + elif b_cy > ly2: + dy = b_cy - ly2 + else: + dy = 0 + if b_cx < lx1: + dx = lx1 - b_cx + elif b_cx > lx2: + dx = b_cx - lx2 + else: + dx = 0 + dist = (dx ** 2 + dy ** 2) ** 0.5 + if dist < min_dist: + min_dist = dist + best_cx, best_cy = ae["cx"], ae["cy"] + b["_lx_center"] = best_cx + b["_ly_center"] = best_cy + + sorted_blocks = sorted(page_blocks, key=lambda b: ( + b.get("_ly_center", 0), + b.get("_lx_center", 0), + b.get("_y_center", 0), + b.get("_x_center", 0), + )) + + for b in sorted_blocks: + b.pop("_ly_center", None) + b.pop("_lx_center", None) + b.pop("_y_center", None) + b.pop("_x_center", None) + + return sorted_blocks + + +def _layout_detect_reorder(blocks: list[dict], binary: bytes) -> list[dict]: + if not blocks: + return blocks + + recognizer = _get_layout_recognizer() + if recognizer is None: + logger.info("Layout detector unavailable, falling back to heuristic sorting") + return _layout_aware_reorder(blocks) + + try: + import pdfplumber + pages_blocks: dict[int, list[dict]] = {} + for b in blocks: + pg = b.get("page", 0) + pages_blocks.setdefault(pg, []).append(b) + + page_indices = sorted(pages_blocks.keys()) + image_list = [] + ocr_res_per_page = [] + + with pdfplumber.open(BytesIO(binary)) as pdf: + for pg in page_indices: + if pg >= len(pdf.pages): + continue + page = pdf.pages[pg] + pil_img = page.to_image(resolution=72 * 3).annotated + image_list.append(pil_img) + + page_bxs = [] + for b in pages_blocks[pg]: + page_bxs.append({ + "x0": float(b["x0"]), + "top": float(b["top"]), + "x1": float(b["x1"]), + "bottom": float(b["bottom"]), + "text": b["text"], + "page": pg, + }) + ocr_res_per_page.append(page_bxs) + + if not image_list: + return _layout_aware_reorder(blocks) + + tagged_blocks, page_layouts = recognizer( + image_list, ocr_res_per_page, scale_factor=3, thr=0.2, drop=False + ) + + if not tagged_blocks: + logger.warning("Layout detector unavailable, falling back to heuristic sorting") + return _layout_aware_reorder(blocks) + + tagged_per_page: dict[int, list[dict]] = {} + for b in tagged_blocks: + pg = b.get("page", 0) + tagged_per_page.setdefault(pg, []).append(b) + + sorted_all = [] + total_layout_count = 0 + for pn, pg in enumerate(page_indices): + page_bxs = tagged_per_page.get(pg, []) + lts = page_layouts[pn] if pn < len(page_layouts) else [] + total_layout_count += len(lts) + sorted_page = _resort_page_with_layout(page_bxs, lts) + sorted_all.extend(sorted_page) + + for b in sorted_all: + if "page" not in b: + b["page"] = 0 + + logger.info(f"YOLOv10 detector completed, {len(sorted_all)} total chunks," + f"checked {total_layout_count} layout") + return sorted_all + + except Exception as e: + logger.warning(f"Layout detector unavailable, falling back to heuristic sorting: {e}") + return _layout_aware_reorder(blocks) diff --git a/rag/prompts/resume_basic_info.md b/rag/prompts/resume_basic_info.md new file mode 100644 index 0000000000..7a3756813d --- /dev/null +++ b/rag/prompts/resume_basic_info.md @@ -0,0 +1,39 @@ +请从以下带行号索引的简历文本中提取基本信息。 + +{indexed_text} + +提取如下信息到 JSON,若某些字段不存在则输出 "" 空或 0: +{{ + "name_kwd": "", + "gender_kwd": "", + "age_int": 0, + "phone_kwd": "", + "email_tks": "", + "birth_dt": "", + "work_exp_flt": 0, + "current_location": "", + "expect_city_names_tks": [], + "expect_position_name_tks": [], + "skill_tks": [], + "language_tks": [], + "certificate_tks": [], + "self_evaluation_tks": "" +}} + +字段说明: +- name_kwd: 姓名,如"张三" +- gender_kwd: 男/女,若不存在则不填 +- age_int: 当前年龄,整数 +- phone_kwd: 电话/手机,请保留原文中的形式,保留国家码区号括号 +- email_tks: 邮箱,如 "xxx@qq.com" +- birth_dt: 出生年月,如 "1996-11" +- work_exp_flt: 工作年限,浮点数 +- current_location: 现居地/当前城市,不要从工作经历中推测,要写明现居地 +- expect_city_names_tks: 期望工作城市列表,简历中需要明确说明是期望城市 +- expect_position_name_tks: 期望职位列表 +- skill_tks: 技能/技术栈列表 +- language_tks: 语言能力列表 +- certificate_tks: 证书/资质列表 +- self_evaluation_tks: 自我评价/个人优势/个人总结,完整提取原文内容 + +只返回 JSON。 /no_think \ No newline at end of file diff --git a/rag/prompts/resume_basic_info_en.md b/rag/prompts/resume_basic_info_en.md new file mode 100644 index 0000000000..7ea6dd0bc8 --- /dev/null +++ b/rag/prompts/resume_basic_info_en.md @@ -0,0 +1,39 @@ +Please extract basic information from the following line-indexed resume text. + +{indexed_text} + +Extract the following information into JSON. If a field does not exist, output "" or 0: +{{ + "name_kwd": "", + "gender_kwd": "", + "age_int": 0, + "phone_kwd": "", + "email_tks": "", + "birth_dt": "", + "work_exp_flt": 0, + "current_location": "", + "expect_city_names_tks": [], + "expect_position_name_tks": [], + "skill_tks": [], + "language_tks": [], + "certificate_tks": [], + "self_evaluation_tks": "" +}} + +Field descriptions: +- name_kwd: Full name, e.g. "John Smith" +- gender_kwd: Male/Female, leave empty if not present +- age_int: Current age, integer +- phone_kwd: Phone number, keep original format including country code and brackets +- email_tks: Email address, e.g. "xxx@gmail.com" +- birth_dt: Date of birth, e.g. "1996-11" +- work_exp_flt: Years of work experience, float +- current_location: Current city/location, do not infer from work experience, must be explicitly stated +- expect_city_names_tks: List of preferred work cities, must be explicitly stated in the resume +- expect_position_name_tks: List of desired positions +- skill_tks: List of skills/tech stack +- language_tks: List of language proficiencies +- certificate_tks: List of certificates/qualifications +- self_evaluation_tks: Self-evaluation/personal strengths/summary, extract full original text + +Return JSON only. /no_think \ No newline at end of file diff --git a/rag/prompts/resume_education.md b/rag/prompts/resume_education.md new file mode 100644 index 0000000000..95ff8eb4d6 --- /dev/null +++ b/rag/prompts/resume_education.md @@ -0,0 +1,31 @@ +请从以下带行号索引的简历文本中提取教育背景。 + +{indexed_text} + +提取为 JSON: +{{ + "education": [ + {{ + "school": "", + "major": "", + "degree": "", + "department": "", + "start_date": "", + "end_date": "", + "desc_lines": [start_index, end_index] + }} + ] +}} + +字段说明: +- school: 学校全称,如"厦门大学",中英文都可以 +- major: 专业,如"机械工程" +- degree: 学位,本科/硕士/博士/专科/高中/初中,若不存在则填"" +- department: 系/学院,如"信息工程系" +- start_date: 开始时间,格式为 %Y.%m 或 %Y +- end_date: 结束时间,若至今填写"至今",若不存在填写"" +- desc_lines: [起始行号, 结束行号],教育描述对应的行号范围(可选) + - 包括课程成绩、研究方向、GPA、荣誉奖项等 + - 不存在则填 [] + +只返回 JSON。 /no_think \ No newline at end of file diff --git a/rag/prompts/resume_education_en.md b/rag/prompts/resume_education_en.md new file mode 100644 index 0000000000..9d726b48b4 --- /dev/null +++ b/rag/prompts/resume_education_en.md @@ -0,0 +1,31 @@ +Please extract education background from the following line-indexed resume text. + +{indexed_text} + +Extract into JSON: +{{ + "education": [ + {{ + "school": "", + "major": "", + "degree": "", + "department": "", + "start_date": "", + "end_date": "", + "desc_lines": [start_index, end_index] + }} + ] +}} + +Field descriptions: +- school: Full school name, e.g. "Stanford University", both Chinese and English are acceptable +- major: Major/field of study, e.g. "Computer Science" +- degree: Degree level - Bachelor/Master/PhD/Associate/High School/Middle School, leave "" if not available +- department: Department/College, e.g. "School of Engineering" +- start_date: Start date, format %Y.%m or %Y +- end_date: End date, use "Present" if still enrolled, "" if not available +- desc_lines: [start_line, end_line], line number range for education description (optional) + - Includes coursework, research focus, GPA, honors/awards, etc. + - Use [] if not available + +Return JSON only. /no_think \ No newline at end of file diff --git a/rag/prompts/resume_project_exp.md b/rag/prompts/resume_project_exp.md new file mode 100644 index 0000000000..ed216deaba --- /dev/null +++ b/rag/prompts/resume_project_exp.md @@ -0,0 +1,31 @@ +请从以下带行号索引的简历文本中提取项目经验。 + +{indexed_text} + +提取为 JSON,每段项目经验包含: +{{ + "projectExperience": [ + {{ + "project_name": "", + "role": "", + "start_date": "", + "end_date": "", + "desc_lines": [start_index, end_index] + }} + ] +}} + +字段说明: +- project_name: 项目名称 +- role: 担任角色/职责,如"项目负责人"、"后端开发" +- start_date: 开始时间,格式为 %Y.%m 或 %Y +- end_date: 结束时间,若至今填写"至今",若不存在填写"" +- desc_lines: [起始行号, 结束行号],项目描述对应的行号范围(整数数组) + - 指项目描述的原文引用段落 index 范围,包括项目内容、技术栈、成果等 + - 不包括 project_name、role、start_date、end_date 所在行 + - 尽可能写全,直到下一段项目经验或其他段落标题为止 + - 遇到以下段落标题时必须截止,不要将其包含在 desc_lines 中: + 个人评价、自我评价、个人总结、个人优势、自我描述、技能特长、专业技能、教育背景、教育经历、工作经历、工作经验、证书资质、语言能力、兴趣爱好、求职意向 + - 如果不存在就写 [] + +只返回 JSON。 /no_think \ No newline at end of file diff --git a/rag/prompts/resume_project_exp_en.md b/rag/prompts/resume_project_exp_en.md new file mode 100644 index 0000000000..e33de88e5c --- /dev/null +++ b/rag/prompts/resume_project_exp_en.md @@ -0,0 +1,31 @@ +Please extract project experience from the following line-indexed resume text. + +{indexed_text} + +Extract into JSON, each project experience entry contains: +{{ + "projectExperience": [ + {{ + "project_name": "", + "role": "", + "start_date": "", + "end_date": "", + "desc_lines": [start_index, end_index] + }} + ] +}} + +Field descriptions: +- project_name: Project name +- role: Role/responsibility, e.g. "Project Lead", "Backend Developer" +- start_date: Start date, format %Y.%m or %Y +- end_date: End date, use "Present" if ongoing, "" if not available +- desc_lines: [start_line, end_line], line number range for project description (integer array) + - Refers to the original text reference range for project description, including project content, tech stack, achievements, etc. + - Does not include lines containing project_name, role, start_date, end_date + - Include as much as possible until the next project experience entry or other section heading + - STOP before these section headings (do not include them in desc_lines): + Self-evaluation, Personal Summary, Skills, Technical Skills, Education, Work Experience, Certificates, Languages, Hobbies, Career Objective + - Use [] if not available + +Return JSON only. /no_think \ No newline at end of file diff --git a/rag/prompts/resume_system.md b/rag/prompts/resume_system.md new file mode 100644 index 0000000000..9b3419f41e --- /dev/null +++ b/rag/prompts/resume_system.md @@ -0,0 +1,3 @@ +你是一个专业的简历分析助手。你的任务是将给定的简历文本转换为 JSON 输出。 +(如果有中英文简历同时出现时,只关注中文简历) +严格按照 JSON 格式返回结果,不要有任何其他文字。 \ No newline at end of file diff --git a/rag/prompts/resume_system_en.md b/rag/prompts/resume_system_en.md new file mode 100644 index 0000000000..8d02488f26 --- /dev/null +++ b/rag/prompts/resume_system_en.md @@ -0,0 +1,3 @@ +You are a professional resume analysis assistant. Your task is to convert the given resume text into JSON output. +(If both Chinese and English resumes appear, focus only on the English resume) +Strictly return results in JSON format without any other text. \ No newline at end of file diff --git a/rag/prompts/resume_work_exp.md b/rag/prompts/resume_work_exp.md new file mode 100644 index 0000000000..2a7465c16e --- /dev/null +++ b/rag/prompts/resume_work_exp.md @@ -0,0 +1,39 @@ +请从以下带行号索引的简历文本中提取工作经历。 + +{indexed_text} + +提取为 JSON,每段工作经历包含: +{{ + "workExperience": [ + {{ + "company": "", + "position": "", + "internship": 0, + "start_date": "", + "end_date": "", + "desc_lines": [start_index, end_index] + }} + ] +}} + +字段说明: +- company: 公司全称(含括号内地区信息),如"阿里巴巴(中国)有限公司" +- position: 职位名称,遵循原文不要编造或推测 +- internship: 该段经历是否是实习,是实习为1,不是为0 +- start_date: 入职时间,格式为 %Y.%m 或 %Y,如 "2024.1" +- end_date: 离职时间,若至今填写"至今",若不存在填写"" +- desc_lines: [起始行号, 结束行号],工作描述对应的行号范围(整数数组) + - 指工作经历描述的原文引用段落 index 范围,包括工作成果、业绩、主要工作、技术栈等 + - 不包括 company、position、start_date、end_date 所在行 + - 尽可能写全,直到下一段工作经历或其他段落标题为止 + - 遇到以下段落标题时必须截止,不要将其包含在 desc_lines 中: + 个人评价、自我评价、个人总结、个人优势、自我描述、技能特长、专业技能、教育背景、教育经历、项目经验、项目经历、证书资质、语言能力、兴趣爱好、求职意向 + - 如果不存在就写 [] + +示例: +[22]: 阿里巴巴 2021.11-2022.11 高级工程师 +[23]: 工作描述: 从事地推工作完成xx业绩 +[24]: 在地推任务中考核为A +则 desc_lines 应为 [23, 24] + +只返回 JSON。 /no_think \ No newline at end of file diff --git a/rag/prompts/resume_work_exp_en.md b/rag/prompts/resume_work_exp_en.md new file mode 100644 index 0000000000..46e4c9ac8b --- /dev/null +++ b/rag/prompts/resume_work_exp_en.md @@ -0,0 +1,38 @@ +Please extract work experience from the following line-indexed resume text. + +{indexed_text} + +Extract into JSON, each work experience entry contains: +{{ + "workExperience": [ + {{ + "company": "", + "position": "", + "internship": 0, + "start_date": "", + "end_date": "", + "desc_lines": [start_index, end_index] + }} + ] +}} + +Field descriptions: +- company: Full company name (including region info in brackets), e.g. "Google Inc." +- position: Job title, follow original text, do not fabricate or guess +- internship: Whether this is an internship, 1 for yes, 0 for no +- start_date: Start date, format %Y.%m or %Y, e.g. "2024.1" +- end_date: End date, use "Present" if still employed, "" if not available +- desc_lines: [start_line, end_line], line number range for job description (integer array) + - Refers to the original text reference range for job description, including achievements, responsibilities, tech stack, etc. + - Include as much as possible until the next work experience entry or other section heading + - STOP before these section headings (do not include them in desc_lines): + Self-evaluation, Personal Summary, Skills, Technical Skills, Education, Project Experience, Certificates, Languages, Hobbies, Career Objective + - Use [] if not available + +Example: +[22]: Google Inc. 2021.11-2022.11 Senior Engineer +[23]: Job description: Responsible for backend development +[24]: Achieved 99.9% uptime for core services +Then desc_lines should be [23, 24] + +Return JSON only. /no_think \ No newline at end of file