我有两个数据框(员工和招聘),但它们之间没有可用于关联的唯一键。我正在使用 fuzzywuzzy 库按姓名做模糊匹配来得到唯一标识符,为此我调用了 df.apply 函数,但是它非常慢。有人可以建议更快的实现方式吗?
# Function to compare names between the 2 dfs.
def get_ratio(row, var1):
    """Return the best fuzzy-match score between ``var1`` and a row's names.

    The row is expected to expose four name fields as attributes:
    ``Offer_CandidateName``, ``Offer_FirstName``, ``Applicant_FirstName``
    and ``Candidate_FirstName``. Each non-null field is compared to
    ``var1`` with ``fuzz.partial_ratio`` and the highest score is returned.

    Args:
        row: One record (e.g. a row handed over by ``DataFrame.apply`` with
            ``axis=1``) carrying the four name attributes listed above.
        var1: The name to compare against every name field of ``row``.

    Returns:
        The maximum ``fuzz.partial_ratio`` score over the non-null name
        fields, or ``0`` when ``var1`` is null or every name field is null.
    """
    # Nothing to compare against: report "no match" instead of handing a
    # null value to the fuzzy matcher.
    if pd.isnull(var1):
        return 0

    candidate_names = (
        row.Offer_CandidateName,
        row.Offer_FirstName,
        row.Applicant_FirstName,
        row.Candidate_FirstName,
    )

    # Only non-null names are scored. ``default=0`` covers the case where
    # every field is null; previously the per-field score variables were left
    # unassigned in that situation and the final max() raised
    # UnboundLocalError.
    return max(
        (
            fuzz.partial_ratio(name, var1)
            for name in candidate_names
            if not pd.isnull(name)
        ),
        default=0,
    )
#Values are passed from employee dataframe to below function
def populateMatch(username, Emp_HireDate, Emp_RMID, Emp_FirstName):  # Using Fuzzy Logic
    """Record the candidates whose names fuzzily match one employee.

    Scores every row of the outer-scope ``temp`` frame against
    ``Emp_FirstName`` with ``get_ratio``, keeps the rows scoring above 50,
    and writes the string form of the list of their unique ``combined``
    values into column ``'c6'`` of ``EmpMasterDF`` for the rows whose
    ``Username`` equals ``username``.

    Args:
        username: Value matched against ``EmpMasterDF.Username`` to select
            the rows to update.
        Emp_HireDate: Not used by this function body.
        Emp_RMID: Not used by this function body.
        Emp_FirstName: Employee name compared to each candidate row.

    Returns:
        None. ``EmpMasterDF`` is updated in place.
    """
    # Bind the filtered frame to a new name. Assigning back to ``temp`` made
    # it a local variable, so reading it on the right-hand side raised
    # UnboundLocalError before any matching could happen.
    scores = temp.apply(get_ratio, var1=Emp_FirstName, axis=1)
    matched = temp[scores > 50]

    EmpMasterDF.loc[EmpMasterDF.Username == username, 'c6'] = str(
        matched['combined'].unique().tolist()
    )