from scipy.stats import itemfreq
from os import listdir
from os.path import isfile, join
import numpy as np
import pickle
from csv import reader
from scipy.stats import itemfreq
from sklearn.model_selection import KFold
from os import listdir
from os.path import isfile, join
from astropy.extern.ply.cpp import xrange
seed = 0 # number of cough classes = 2; (DISEASE/NORMAL) or (COPD/CHF)
np.random.seed(seed) # generates random numbers
X_train = [] # creates training set using .csv file
Y_train = [] #creates training set using patients
X_test = [] # creates testing set using .csv file
Y_test = [] # creates testing set using patients
Z = [] # splits data
label = [] #labels split data
eps=1e-7
set_probs = [] #predicts probability
i = 0; # uses to go through all patients
correct = 0;
DISEASE = 1;
NORMAL = 1;
for i in xrange (1,10): # goes through all 9 patients
Z.append(DISEASE)
DISEASE = DISEASE + 1;
label.append(1); #labels data as 1, if = DISEASE
for i in xrange (1,10):
Z.append(NORMAL)
NORMAL = NORMAL + 1;
label.append(2); #labels data as 2, if = NORMAL
add = 0
add1 = 0
add2 = 0
print(len(Z))
kf = KFold(n_splits = 10, shuffle = True)
for train, test in kf.split(Z):
X_train = []
Y_train = []
X_test = []
Y_test = []
set_probs = []
# Z_train - creates training set from split data
# Z_test - creates testing set from split data
# label_train - labels Z_train data
# label_test - labels Z_testing data
# This where I am getting the error
Z_train, Z_test, label_train, label_test = Z[train], Z[test], label[train],
label[test]
# training set
for z in xrange(0, len(Z_train)):
if label_train[z] == 1: # if predicted 1 = DISEASE
mypath = '~/Users/awindmon/Documents/DISEASE_Example/';
if label_train[z] == 2: # if predicted 2 = NORMAL
mypath = '~/Users/awindmon/Documents/NORMAL_Example/';
# testing set
for z in xrange(0, len(Z_test)):
if label_test[z] == 1:
mypath = '~/Users/awindmon/Documents/DISEASE_Example/';
if label_test[z] == 2:
mypath = '~/Users/awindmon/Documents/NORMAL_Example/';
clf = SVC (kernel = 'linear', random_state = 0, gamma = 1, C = 1,
probability = True)
clf.fit(X_train, Y_train)
filename = 'LinearSVM_Model.sav'
pickle.dump(clf, open(filename, 'wb'))
count = 0
probability_list = clf.predict_proba(X_test)
p0=0
p1=0
p2=0
p3=0
p4=0
p5=0
p6=0
for l in range(0,len(probability_list)):
if (l!=0) and (l%3 == 0):
set_probs.append([p0,p1,p2,p3,p4,p5,p6])
p0=0
p1=0
p2=0
p3=0
p4=0
p5=0
p6=0
p0=p0+ probability_list[l][0]
p1=p1+ probability_list[l][1]
p2=p2+ probability_list[l][2]
p3=p3+ probability_list[l][3]
p4=p4+ probability_list[l][4]
p5=p5+ probability_list[l][5]
p6=p6+ probability_list[l][6]
if (l == len(probability_list)-1):
set_probs.append([p0,p1,p2,p3,p4,p5,p6])
p0=0
p1=0
p2=0
p3=0
p4=0
p5=0
p6=0
print (set_probs,Y_test)
add1=add1+clf.score(X_test, Y_test)
print (add1/10)
我是python的新手,我已经开发了这个代码来进行机器学习问题的k-fold交叉验证。在最后一行,我试图划分和标记我的训练和测试数据,但我不断收到此错误:TypeError:只有整数标量数组可以转换为标量索引。
答案 0 :(得分:0)
值train
和test
是np数组。你可能想做这样的事情:
from sklearn.model_selection import KFold
import numpy as np
kf = KFold(n_splits = 10, shuffle = True)
Z = [i+1 for i in range(10)]
for train, test in kf.split(Z):
X_train = []
Y_train = []
X_test = []
Y_test = []
set_probs = []
Z_train = np.array([Z[i] for i in train])
label_train = np.array([label[i] for i in train])
Z_test = Z[test[0]]
label_test = label[test[0]]
拆分为您提供了所选的索引,因此您可以使用该索引以类似的方式获取值或标签。
这超出了这个问题的范围,但是你如何使用Z_train对我没有意义,因为你只是使用它的长度而不是它的值。也许你的意思是for z in Z_train: