我正在使用一个项目的快速约会数据集,对其应用了不同的分类器算法。注意到匹配数(1380)远低于“不匹配”数(6998)后,我想看看分类器在数字相等时的性能,因此尝试调整所述数据集的大小。问题是调整大小后,.casefold()
函数(在最后一个for循环中找到)返回KeyError:1666(这是最后一个“不匹配”的索引,然后才开始从DataFrame中删除项目),坦率地说,我不明白为什么。
speed = pd.read_excel('speeddatingdataset.xls')
# Resizing the data set
zeros = 0
for i in range(len(speed)):
if(speed.at[i,'match'] == 0 and zeros <1381):
zeros+=1
if(zeros >= 1381 and speed.at[i,'match'] == 0):
speed = pd.DataFrame.drop(speed, labels = i, axis = 0)
# Preprocessing of the Data Set stage
match = speed['match'] # save the class values separately before removing from dataframe
speed = pd.DataFrame.drop(speed,columns=['has_null','wave','match','decision','decision_o']) # dropped the has_null and wave columns, as they don't add information
def replace_race(i,fName):
if(speed.at[i,fName] == "?"):
speed.at[i,fName] = 0
elif(speed.at[i,fName] == "European/Caucasian-American"):
speed.at[i,fName] = 1
elif(speed.at[i,fName] == "'Black/African American'"):
speed.at[i,fName] = 2
elif(speed.at[i,fName] == "'Latino/Hispanic American'"):
speed.at[i,fName] = 3
elif(speed.at[i,fName] == "'Asian/Pacific Islander/Asian-American'"):
speed.at[i,fName] = 4
elif(speed.at[i,fName] == "Other"):
speed.at[i,fName] = 5
def replace_field():
proffesions=list()
for i in range(0,len(speed)):
if(len(proffesions) == 0):
proffesions.append(speed.at[i,'field'])
speed.at[i,'field'] = proffesions.index(speed.at[i,'field'])
elif(proffesions.count(speed.at[i,'field']) > 0):
speed.at[i,'field'] = proffesions.index(speed.at[i,'field'])
elif(proffesions.count(speed.at[i,'field']) == 0):
proffesions.append(speed.at[i,'field'])
speed.at[i,'field'] = proffesions.index(speed.at[i,'field'])
for i in range(0,len(speed)):
speed.at[i,'field'] = speed.at[i,'field'].casefold()
if (speed.at[i,'field'].startswith('"') or speed.at[i,'field'].startswith('\'')):
speed.at[i,'field'] = speed.at[i,'field'][1:]
if (speed.at[i,'field'].endswith('"') or speed.at[i,'field'].endswith('\'')):
speed.at[i,'field'] = speed.at[i,'field'][:(len(speed.at[i,'field'])-1)]
if (speed.at[i,'gender'] == '?') :
speed.at[i,'gender'] = 0
elif(speed.at[i,'gender'] == "female"):
speed.at[i,'gender'] = 1
elif(speed.at[i,'gender'] == "male"):
speed.at[i,'gender'] = 2
replace_race(i,'race')
replace_race(i,'race_o')
replace_field()