mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-07-06 03:18:36 +08:00
193 lines
6.0 KiB
Python
193 lines
6.0 KiB
Python
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
|
|
|
|
import json
|
|
from deepdoc.parser.resume.entities import degrees, regions, industries
|
|
|
|
# Output schema of the flattened resume record. Each entry is
# "<column name> <type>". The ORDER is significant: refactor() pairs these
# names positionally with its alphabetically sorted extracted columns, so
# entries must not be reordered, added, or removed independently of it.
FIELDS = [
    # a - c
    "address STRING", "annual_salary int", "annual_salary_from int", "annual_salary_to int",
    "birth STRING",
    "card STRING", "certificate_obj string", "city STRING",
    "corporation_id int", "corporation_name STRING", "corporation_type STRING",
    # d - e
    "degree STRING", "discipline_name STRING",
    "education_obj string", "email STRING",
    "expect_annual_salary int", "expect_city_names string", "expect_industry_name STRING",
    "expect_position_name STRING", "expect_salary_from int", "expect_salary_to int",
    "expect_type STRING",
    # g - l
    "gender STRING",
    "industry_name STRING", "industry_names STRING",
    "is_deleted STRING", "is_fertility STRING", "is_house STRING",
    "is_management_experience STRING", "is_marital STRING", "is_oversea STRING",
    "language_obj string",
    # n - s
    "name STRING", "nation STRING",
    "phone STRING", "political_status STRING", "position_name STRING", "project_obj string",
    "responsibilities string",
    "salary_month int", "scale STRING", "school_name STRING", "self_remark string",
    "skill_obj string",
    # t - w
    "title_name STRING", "tob_resume_id STRING",
    "updated_at Timestamp",
    "wechat STRING", "work_obj string", "work_experience int", "work_start_time BIGINT",
]
|
|
|
|
|
|
def refactor(df):
    """Flatten the JSON resume held in ``df["resume_content"]`` into one flat record.

    Each row's ``resume_content`` is parsed as JSON, selected top-level
    sections and the fields under ``contact`` / ``basic`` are copied into
    columns, a few columns are normalised (degree, address, industries,
    gender, yes/no flags, phone), and every value is turned into a
    single-line string.

    Args:
        df: pandas DataFrame with a ``resume_content`` column holding JSON
            text. ``tob_resume_id`` is taken from the frame itself when
            present. The frame is modified in place (helper and extracted
            columns are added, missing values are replaced by "") and is
            expected to contain at least one row.

    Returns:
        dict mapping the column names declared in ``FIELDS`` to the values of
        the FIRST row only. Names and values are paired positionally: the
        extracted columns are sorted alphabetically and zipped with ``FIELDS``
        in declaration order, so the two orders must stay aligned.
    """

    def nested_get(obj, section, key):
        """Return ``obj[section][key]``; "" if either level is absent or not a dict."""
        if not isinstance(obj, dict):
            return ""
        inner = obj.get(section, {})
        if not isinstance(inner, dict):
            return ""
        return inner.get(key, "")

    def load_json(raw):
        """Parse *raw* as JSON, falling back to an empty dict on any failure."""
        try:
            return json.loads(raw)
        except Exception:
            # Deliberate best effort: malformed or missing content must not
            # abort the whole conversion, it just yields an empty resume.
            return {}

    def dump_top_level(obj, key):
        """Serialise the top-level section *key* of *obj* as a JSON string."""
        if isinstance(obj, dict) and (isinstance(obj.get(key), dict) or not obj.get(key)):
            return json.dumps(obj.get(key, {}), ensure_ascii=False)
        # NOTE(review): when the section is a non-empty non-dict (e.g. a list)
        # this stringifies the WHOLE object rather than the section. Kept
        # as-is because downstream consumers are not visible here -- confirm.
        return str(obj).replace("None", "")

    def join_values(value):
        """Render a list or comma-separated value as a space-separated string."""
        if not value:
            return ""
        if isinstance(value, list):
            value = " ".join(str(item) for item in value)
        return str(value).replace(",", " ")

    def yes_no(flag):
        """Map "Y"/"N" to "是"/"否"; anything else becomes ""."""
        if flag == "Y":
            return "是"
        return "否" if flag == "N" else ""

    df["obj"] = df["resume_content"].map(load_json)
    df.fillna("", inplace=True)

    # Names of the columns that make up the output record (may hold duplicates
    # until de-duplicated below, e.g. "updated_at").
    columns = ["tob_resume_id", "updated_at"]

    def extract(names, section=None):
        """Create one column per name, read from *section* or from the top level."""
        columns.extend(names)
        for name in names:
            # The lambdas are consumed immediately by map(), so capturing the
            # loop variable is safe here.
            if section:
                df[name] = df["obj"].map(lambda obj: nested_get(obj, section, name))
            else:
                df[name] = df["obj"].map(lambda obj: dump_top_level(obj, name))

    extract(["education", "work", "certificate", "project", "language", "skill"])
    extract(["wechat", "phone", "is_deleted", "name", "tel", "email"], "contact")
    extract(
        [
            "nation",
            "expect_industry_name",
            "salary_month",
            "industry_ids",
            "is_house",
            "birth",
            "annual_salary_from",
            "annual_salary_to",
            "card",
            "expect_salary_to",
            "expect_salary_from",
            "expect_position_name",
            "gender",
            "city",
            "is_fertility",
            "expect_city_names",
            "political_status",
            "title_name",
            "expect_annual_salary",
            "industry_name",
            "address",
            "position_name",
            "school_name",
            "corporation_id",
            "is_oversea",
            "responsibilities",
            "work_start_time",
            "degree",
            "management_experience",
            "expect_type",
            "corporation_type",
            "scale",
            "corporation_name",
            "self_remark",
            "annual_salary",
            "work_experience",
            "discipline_name",
            "marital",
            "updated_at",
        ],
        "basic",
    )

    # Resolve coded values to display names via the entity lookup modules.
    df["degree"] = df["degree"].map(lambda code: degrees.get_name(code))
    df["address"] = df["address"].map(lambda code: " ".join(regions.get_names(code)))
    df["industry_names"] = df["industry_ids"].map(
        lambda ids: " ".join(" ".join(industries.get_names(i)) for i in str(ids).split(","))
    )
    columns.append("industry_names")

    df["expect_industry_name"] = df["expect_industry_name"].map(join_values)
    df["gender"] = df["gender"].map(lambda g: "男" if g == "M" else ("女" if g == "F" else ""))
    for flag_column in ["is_fertility", "is_oversea", "is_house", "management_experience", "marital"]:
        df[flag_column] = df[flag_column].map(yes_no)
    df["is_management_experience"] = df["management_experience"]
    df["is_marital"] = df["marital"]
    columns.extend(["is_management_experience", "is_marital"])

    df.fillna("", inplace=True)

    # Back-fill an empty phone from "tel". Built positionally so it works for
    # any index (the previous `.loc[i]` over range(len(df)) required a default
    # RangeIndex) and coerces to text only for the emptiness check, so numeric
    # phone/tel values no longer raise on `.strip()`.
    df["phone"] = [
        str(tel).strip() if not str(phone).strip() and str(tel).strip() else phone
        for phone, tel in zip(df["phone"], df["tel"])
    ]

    # Helper columns that were only needed to derive other columns.
    for helper in ["industry_ids", "management_experience", "marital", "tel"]:
        if helper in columns:
            columns.remove(helper)

    # De-duplicate and sort: the sorted order is what lines up with FIELDS.
    columns = sorted(set(columns))

    df = df.reindex(columns, axis=1)
    for column in columns:
        # Keep every value on a single line: tabs become spaces, line breaks
        # become the two-character sequence "\n".
        df[column] = df[column].map(lambda s: str(s).replace("\t", " ").replace("\n", "\\n").replace("\r", "\\n"))

    field_names = [spec.split()[0] for spec in FIELDS]
    first_row = df.values.tolist()[0]
    return dict(zip(field_names, first_row))
|