我正在研究这个数据集: https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original)
我最初编写了这段代码:
std::vector
这对我来说绝对没问题。
为了方便起见,我接着使用了pandas,这是我修改过的代码:
import bso as opt
from sklearn import svm
import numpy as np
with open("breastcancer/train_data.txt") as f:
tr_d=np.array([[float(d) for d in data.split(',')] for data in
f.read().splitlines()])
with open("breastcancer/test_data.txt") as f:
te_d=np.array([[float(d) for d in data.split(',')] for data in
f.read().splitlines()])
with open("breastcancer/train_data_label.txt") as f:
tr_l=np.array([int(data) for data in f.read().splitlines()])
with open("breastcancer/test_data_label.txt") as f:
te_l=np.array([int(data) for data in f.read().splitlines()])
def check(gen,tr_d,tr_l,te_d,te_l):
mask=np.array(gen) > 0
al_data=np.array([al[mask] for al in tr_d])
al_test_data=np.array([al[mask] for al in te_d])
res=svm.LinearSVC().fit(al_data,tr_l).predict(al_test_data)
score=np.count_nonzero(te_l==res)/len(te_l)
return score
gen1=[1]*9
print("all_feature:\n\t{0} {1}
{2}".format("".join(map(str,gen1)),check(gen1,tr_d,tr_l,te_d,te_l),len(gen1)))
class Evaluate:
def __init__(self):
self.train_l=tr_l
self.train_d=tr_d
self.test_l=te_l
self.test_d=te_d
self.dim=len(tr_d[0])
def evaluate(self,gen):
mask=np.array(gen) > 0
print (mask)
al_data=np.array([al[mask] for al in self.train_d])
al_test_data=np.array([al[mask] for al in self.test_d])
res=svm.LinearSVC().fit(al_data,self.train_l).predict(al_test_data)
score=np.count_nonzero(self.test_l==res)/len(self.test_l)
return score
def check_dimentions(self,dim):
if dim==None:
return len(self.train_d[0])
else:
return dim
我现在收到此错误:
import bso as opt
from sklearn import svm
import numpy as np
import sys
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
try:
data_df = pd.read_csv("breast-cancer-wisconsin.csv")
except Exception as e:
print(e)
sys.exit(1)
data_df.columns = ['id', 'f0', 'f1','f2','f3','f4','f5','f6','f7','f8','label']
data_df.drop(['id'],axis=1,inplace=True)
data_df['label'] = [0 if x == 2 else 1 for x in data_df['label']]
X = data_df.drop(['label'],axis=1)
y = data_df.label
tr_d, te_d, tr_l, te_l = train_test_split(X,y,test_size=0.3,random_state=42)
def check(gen,tr_d,tr_l,te_d,te_l):
mask=np.array(gen) > 0
al_data=np.array([al[mask] for al in tr_d])
al_test_data=np.array([al[mask] for al in te_d])
rfc= RandomForestClassifier(n_estimators=10)
rfc.fit(tr_d,tr_l)
score = rfc.score(te_d,te_l)
return score
gen1=[1]*9
print("all_feature:\n\t{0} {1} {2}".format("".join(map(str,gen1)),check(gen1,tr_d,tr_l,te_d,te_l),len(gen1)))
class Evaluate:
def __init__(self):
self.train_l=tr_l
self.train_d=tr_d
self.test_l=te_l
self.test_d=te_d
self.dim=9
def evaluate(self,gen):
mask=np.array(gen) > 0
al_data=np.array([al[mask] for al in self.train_d])
al_test_data=np.array([al[mask] for al in self.test_d])
rfc= RandomForestClassifier(n_estimators=10)
res=rfc.fit(tr_d,tr_l)
score = rfc.score(te_d,te_l)
return score
def check_dimentions(self,dim):
if dim==None:
return 9
else:
return dim
我在第31行得到了这个,有关如何解决这个问题的任何帮助,所以我可以在后面的代码中使用这个功能会很棒。