创建 k 折（k-fold）交叉验证分析

Question

创建 k 折（k-fold）交叉验证分析

from scipy.stats import itemfreq
from os import listdir
from os.path import isfile, join
import numpy as np
import pickle
from csv import reader
from scipy.stats import itemfreq
from sklearn.model_selection import KFold
from os import listdir
from os.path import isfile, join
from astropy.extern.ply.cpp import xrange

# Script setup: seed the RNG, create the per-fold containers, and build the
# patient-id list `Z` together with its parallel class-label list `label`.

seed = 0  # fixed RNG seed so repeated runs draw the same random numbers
np.random.seed(seed)

# Per-fold containers; the cross-validation loop re-creates them each fold.
X_train = []  # training features (original note: built from the .csv files)
Y_train = []  # training targets (original note: built from the patients)
X_test = []  # testing features (original note: built from the .csv files)
Y_test = []  # testing targets (original note: built from the patients)
Z = []  # patient ids; this is the sequence handed to KFold.split()
label = []  # class label for the entry at the same position in Z
eps = 1e-7
set_probs = []  # aggregated class probabilities
i = 0  # loop index used while enumerating patients
correct = 0

# Two classes (DISEASE/NORMAL, or COPD/CHF per the original note). These two
# names double as running patient-id counters and both end at 10.
DISEASE = 1
NORMAL = 1

# `range` is the Python 3 builtin. The original used `xrange`, which only
# exists in Python 2 and was being pulled in from an unrelated package's
# internals.
for i in range(1, 10):  # 9 DISEASE patients, ids 1..9
    Z.append(DISEASE)
    DISEASE += 1
    label.append(1)  # label 1 = DISEASE

for i in range(1, 10):  # 9 NORMAL patients, ids 1..9
    Z.append(NORMAL)
    NORMAL += 1
    label.append(2)  # label 2 = NORMAL

# Running totals used by the scoring code further down the script.
add = 0
add1 = 0
add2 = 0
print(len(Z))


# 10-fold cross-validation over the patient list Z: split, (intended) feature
# loading, SVM training, and per-group aggregation of predicted probabilities.
# NOTE(review): the indentation below is inconsistent (column 0, 3 spaces and
# 4 spaces are mixed), so this block does not parse as written and the
# intended nesting inside the `for train, test` loop has to be confirmed.
# Shuffled 10-fold splitter; no random_state is passed, so the fold
# assignment is not tied to `seed`.
kf = KFold(n_splits = 10, shuffle = True) 
for train, test in kf.split(Z):
    # Reset the per-fold containers.
    X_train = []
    Y_train = []
    X_test = []
    Y_test = []
    set_probs = []

    # Z_train - training subset of the split data
    # Z_test - testing subset of the split data
    # label_train - labels for the Z_train entries
    # label_test - labels for the Z_test entries

    # This is where I am getting the error.
    # NOTE(review): `train` and `test` are NumPy index arrays, while Z and
    # label are plain Python lists built with append(); indexing a list with
    # an array raises "TypeError: only integer scalar arrays can be converted
    # to a scalar index". Also, the statement is wrapped after a trailing
    # comma, so as written the right-hand side has three elements and
    # `label[test]` is a separate expression statement.
   Z_train, Z_test, label_train, label_test = Z[train], Z[test], label[train], 
   label[test]

# training set
# NOTE(review): at column 0 this loop sits outside the fold loop as written.
# It only assigns `mypath`; nothing visible here reads it or fills
# X_train / Y_train.
for z in xrange(0, len(Z_train)):
    if label_train[z] == 1: # label 1 = DISEASE
        mypath = '~/Users/awindmon/Documents/DISEASE_Example/';

    if label_train[z] == 2: # label 2 = NORMAL
        mypath = '~/Users/awindmon/Documents/NORMAL_Example/';

# testing set
# NOTE(review): same pattern as above - `mypath` is assigned but X_test /
# Y_test are never filled in the visible code.
for z in xrange(0, len(Z_test)):
    if label_test[z] == 1:
        mypath = '~/Users/awindmon/Documents/DISEASE_Example/';

    if label_test[z] == 2:
        mypath = '~/Users/awindmon/Documents/NORMAL_Example/';

   # Linear-kernel SVM with probability estimates enabled.
   # NOTE(review): SVC is not among the imports visible at the top of the
   # file - presumably `from sklearn.svm import SVC` is needed; confirm.
   clf = SVC (kernel = 'linear', random_state = 0, gamma = 1, C = 1, 
   probability = True)

   # NOTE(review): X_train / Y_train are still the empty lists created at the
   # top of the fold unless elided code fills them.
   clf.fit(X_train, Y_train)
   # Persist the fitted model; the file object opened here is never closed
   # explicitly.
   filename = 'LinearSVM_Model.sav'
   pickle.dump(clf, open(filename, 'wb'))
   count = 0

   # One row of class probabilities per test sample.
   probability_list = clf.predict_proba(X_test)
   # Running per-column sums for the current group of rows.
   # NOTE(review): seven columns are accumulated, yet the setup comment speaks
   # of two classes - confirm how many classes the model really has.
   p0=0
   p1=0
   p2=0
   p3=0
   p4=0
   p5=0
   p6=0

# Sum the probability rows in consecutive groups of three and collect one
# summed row per group in set_probs.
for  l in range(0,len(probability_list)):
    # A new group starts at every multiple of 3 (except row 0): store the
    # finished group's sums and reset the accumulators.
    if (l!=0) and (l%3 == 0):
        set_probs.append([p0,p1,p2,p3,p4,p5,p6])
        p0=0
        p1=0
        p2=0
        p3=0
        p4=0
        p5=0
        p6=0

    p0=p0+ probability_list[l][0]
    p1=p1+ probability_list[l][1]
    p2=p2+ probability_list[l][2]
    p3=p3+ probability_list[l][3]
    p4=p4+ probability_list[l][4]
    p5=p5+ probability_list[l][5]
    p6=p6+ probability_list[l][6]

    # Last row: store the final (possibly shorter) group as well.
    if (l == len(probability_list)-1):
        set_probs.append([p0,p1,p2,p3,p4,p5,p6])
        p0=0
        p1=0
        p2=0
        p3=0
        p4=0
        p5=0
        p6=0

    # NOTE(review): as indented, the statements below run once per
    # probability row rather than once per fold - confirm the intent.
    print (set_probs,Y_test)

    # Accumulate the classifier's score on the test data.
    add1=add1+clf.score(X_test, Y_test)

    # Running total divided by 10, the number of folds.
    print (add1/10)

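我是 Python 新手，编写了这段代码来对一个机器学习问题做 k 折交叉验证。在注释标出的那一行（`Z_train, Z_test, label_train, label_test = ...`），我试图划分并标记训练数据和测试数据，但一直收到此错误：`TypeError: only integer scalar arrays can be converted to a scalar index`（只有整数标量数组才能转换为标量索引）。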

Answer 1

`kf.split()` 返回的 `train` 和 `test` 是 NumPy 索引数组，而 `Z` 和 `label` 是普通的 Python 列表，不能直接用数组作下标，所以才会报这个错。你可以这样做：

from sklearn.model_selection import KFold
import numpy as np

# Example: selecting the train/test values and labels for each fold.
kf = KFold(n_splits=10, shuffle=True)

# Demo data. Both sequences are NumPy arrays so that the index arrays yielded
# by kf.split() can select elements directly; indexing a plain list that way
# raises "TypeError: only integer scalar arrays can be converted to a scalar
# index".
Z = np.arange(1, 11)
label = np.array([1] * 5 + [2] * 5)  # placeholder labels; substitute your own

for train, test in kf.split(Z):
    X_train = []
    Y_train = []
    X_test = []
    Y_test = []
    set_probs = []

    # `train` and `test` hold positions into Z, not the values themselves.
    Z_train, label_train = Z[train], label[train]

    # Keep the whole test fold as an array: a fold can hold more than one
    # sample, and downstream code that calls len(Z_test) or indexes
    # label_test[z] needs a sequence rather than a single scalar.
    Z_test, label_test = Z[test], label[test]

`split` 返回的是被选中样本的索引，因此你可以用这些索引以同样的方式取出对应的值或标签。

下面这一点超出了本问题的范围：我不太理解你对 `Z_train` 的用法，因为你只用到了它的长度，而没有用到其中的值。也许你想写的是 `for z in Z_train:`。

TypeError: only integer scalar arrays can be converted to a scalar index（只有整数标量数组才能转换为标量索引）- Python 3.6.5

创建 k 折（k-fold）交叉验证分析

1 个答案: