使用'glm'logit进行交叉验证

时间:2019-06-20 03:02:00

标签: r cross-validation glm

当我尝试运行R代码时:

fit <- train(V16~., data=credit,
             method="glm", 
             family="binomial",
             trControl=ctrl, 
             preProcess = c("center", "scale"))

...但是我遇到了以下错误:

1)

Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut = 10,  :
These variables have zero variances: V7.o
There were 50 or more warnings (use warnings() to see the first 50)

2)

> warnings()
Mensagens de aviso:
1: In train.default(x, y, weights = w, ...) :
  You are trying to do regression and your outcome only has two possible values Are you trying to do classification? If so, use a 2 level factor as your outcome column.
2: glm.fit: fitted probabilities numerically 0 or 1 occurred
3: In predict.lm(object, newdata, se.fit, scale = 1, type = if (type ==  ... :
  prediction from a rank-deficient fit may be misleading
4: glm.fit: fitted probabilities numerically 0 or 1 occurred
5: In predict.lm(object, newdata, se.fit, scale = 1, type = if (type ==  ... :
  prediction from a rank-deficient fit may be misleading
6: In predict.lm(object, newdata, se.fit, scale = 1, type = if (type ==  ... :
  prediction from a rank-deficient fit may be misleading
7: glm.fit: fitted probabilities numerically 0 or 1 occurred
8: In predict.lm(object, newdata, se.fit, scale = 1, type = if (type ==  ... :
  prediction from a rank-deficient fit may be misleading
9: glm.fit: fitted probabilities numerically 0 or 1 occurred
10: In predict.lm(object, newdata, se.fit, scale = 1, type = if (type ==  ... :
  prediction from a rank-deficient fit may be misleading
11: glm.fit: fitted probabilities numerically 0 or 1 occurred
12: In predict.lm(object, newdata, se.fit, scale = 1, type = if (type ==  ... :
  prediction from a rank-deficient fit may be misleading
13: glm.fit: fitted probabilities numerically 0 or 1 occurred
14: In predict.lm(object, newdata, se.fit, scale = 1, type = if (type ==  ... :
  prediction from a rank-deficient fit may be misleading
15: glm.fit: fitted probabilities numerically 0 or 1 occurred
16: In predict.lm(object, newdata, se.fit, scale = 1, type = if (type ==  ... :
  prediction from a rank-deficient fit may be misleading
17: In predict.lm(object, newdata, se.fit, scale = 1, type = if (type ==  ... :
  prediction from a rank-deficient fit may be misleading
18: In predict.lm(object, newdata, se.fit, scale = 1, type = if (type ==  ... :
  prediction from a rank-deficient fit may be misleading
19: glm.fit: fitted probabilities numerically 0 or 1 occurred
20: In predict.lm(object, newdata, se.fit, scale = 1, type = if (type ==  ... :
  prediction from a rank-deficient fit may be misleading
21: glm.fit: fitted probabilities numerically 0 or 1 occurred
22: In predict.lm(object, newdata, se.fit, scale = 1, type = if (type ==  ... :
  prediction from a rank-deficient fit may be misleading
23: glm.fit: fitted probabilities numerically 0 or 1 occurred
24: In predict.lm(object, newdata, se.fit, scale = 1, type = if (type ==  ... :
  prediction from a rank-deficient fit may be misleading
25: glm.fit: fitted probabilities numerically 0 or 1 occurred
26: In predict.lm(object, newdata, se.fit, scale = 1, type = if (type ==  ... :
  prediction from a rank-deficient fit may be misleading
27: glm.fit: fitted probabilities numerically 0 or 1 occurred
28: In predict.lm(object, newdata, se.fit, scale = 1, type = if (type ==  ... :
  prediction from a rank-deficient fit may be misleading
29: glm.fit: fitted probabilities numerically 0 or 1 occurred
30: In predict.lm(object, newdata, se.fit, scale = 1, type = if (type ==  ... :
  prediction from a rank-deficient fit may be misleading
31: glm.fit: fitted probabilities numerically 0 or 1 occurred
32: In predict.lm(object, newdata, se.fit, scale = 1, type = if (type ==  ... :
  prediction from a rank-deficient fit may be misleading
33: glm.fit: fitted probabilities numerically 0 or 1 occurred
34: In predict.lm(object, newdata, se.fit, scale = 1, type = if (type ==  ... :
  prediction from a rank-deficient fit may be misleading
35: glm.fit: fitted probabilities numerically 0 or 1 occurred
36: In predict.lm(object, newdata, se.fit, scale = 1, type = if (type ==  ... :
  prediction from a rank-deficient fit may be misleading
37: glm.fit: fitted probabilities numerically 0 or 1 occurred
38: In predict.lm(object, newdata, se.fit, scale = 1, type = if (type ==  ... :
  prediction from a rank-deficient fit may be misleading
39: glm.fit: fitted probabilities numerically 0 or 1 occurred
40: In predict.lm(object, newdata, se.fit, scale = 1, type = if (type ==  ... :
  prediction from a rank-deficient fit may be misleading
41: glm.fit: fitted probabilities numerically 0 or 1 occurred
42: In predict.lm(object, newdata, se.fit, scale = 1, type = if (type ==  ... :
  prediction from a rank-deficient fit may be misleading
43: glm.fit: fitted probabilities numerically 0 or 1 occurred
44: In predict.lm(object, newdata, se.fit, scale = 1, type = if (type ==  ... :
  prediction from a rank-deficient fit may be misleading
45: glm.fit: fitted probabilities numerically 0 or 1 occurred
46: In predict.lm(object, newdata, se.fit, scale = 1, type = if (type ==  ... :
  prediction from a rank-deficient fit may be misleading
47: glm.fit: fitted probabilities numerically 0 or 1 occurred
48: In predict.lm(object, newdata, se.fit, scale = 1, type = if (type ==  ... :
  prediction from a rank-deficient fit may be misleading
49: glm.fit: fitted probabilities numerically 0 or 1 occurred
50: In predict.lm(object, newdata, se.fit, scale = 1, type = if (type ==  ... :
  prediction from a rank-deficient fit may be misleading

我对代码进行了很多修改,但是我没有找到错误所在。

我已经尝试过了,但是没有成功:

install.packages("ROCR")
install.packages("dummies")
install.packages("caret") 
install.packages("e1071") 
library(ROCR)
library(dummies)
library(caret) # for Cross Validation functions
library(e1071)

# Leia os dados de 
# https://archive.ics.uci.edu/ml/machine-learning-databases/credit- 
screening/crx.data
#
# dica: os dados nao possuem cabecalho

credit = read.csv("https://archive.ics.uci.edu/ml/machine-learning- 
databases/credit-screening/crx.data", header=FALSE)

cat("credit - rows and columns dataset:", nrow(credit) ," rows ", 
ncol(credit), "columns ", "\n")

# Elimine as linha com valores ausentes "?"
#
# dica: primeiro troque os valores "?" por NA e em seguida use na.omit() 

credit[credit[]=="?"] <- NA
credit = na.omit(credit)

# Valores numericos com NA aparecem como caracteres. Converta esses 
valores
# para numerico com as.numeric (V2, V14)  
#
credit$V2 = as.numeric(credit$V2)
credit$V14 = as.numeric(credit$V14)

# Converta o atributo de classe para valores 0 e 1

credit$V16 = as.numeric(credit$V16)

credit$V16 = credit$V16=replace(credit$V16,credit$V16==2,0)

cat("credit - rows and columns after NA omit:", nrow(credit) ," rows ", 
ncol(credit), "columns ", "\n")

# Aplique o dummy encode para todos os atributos categoricos

# dica: empregue o comando dummy da library(dummies)

credit = dummy.data.frame(credit, names = 
c("V1","V4","V5","V6","V7","V9","V10","V12","V13"), sep = ".")

cat("credit - rows and columns after dummy encode:", nrow(credit) ," rows 
", ncol(credit), "columns ", "\n")
head(credit)

####inserido aqui após pesquisa no stackoverflow####
#converte o atributo de classe para factor
credit$V16 = as.factor(credit$V16)
credit$V16 = ifelse(credit$V16==1, "No", "Yes")

# Logit com Cross Validation 
# dica: empregue o codigo da tarefa 2 como exemplo  

# Crie o arquivo de controle para 20 particoes dos dados e 5 repeticoes 
ctrl <- trainControl(method="repeatedcv", number= 20, repeats=5)

# Faca o treinamento logistico, nao esquecer de empregar preProcess = 
c("center", "scale")
fit <- train(V16~., data=credit,
            method="glm", 
            family="binomial",
            trControl=ctrl, 
            preProcess = c("center", "scale"))
fit

# conjunto de treinamento e teste
T = sample(1:nrow(credit), round(0.3*nrow(credit)))

credit_test=credit[T,]
credit_train=credit[-T,]

# Faca a predicao para todos os valores de credit
predict_test = predict(fit, newdata=credit_test, type="raw")
predict_test

# Converte predict_test para valores 0 e 1 
predict_test = as.numeric(predict_test)
predict_test = predict_test=replace(predict_test,predict_test==2,0)

# Construa a matriz de confusao 
c_matrix=table(credit_test$V16, predict_test)
print(c_matrix)

# Calcula a acuracidade 
cat('Accuracy: ', sum(diag(c_matrix))/sum(c_matrix)*100, ' %', "\n")

# Plot da curva ROC
pr=prediction(as.numeric(predict_test),credit_test$V16) 
prf=performance(pr, measure="tpr", x.measure="fpr")
plot(prf,colorize=TRUE)

# Calcule a area sob a curva ROC
auc=performance(pr, measure="auc")
auc=auc@y.values[[1]]
auc

第1项中描述的两个错误篮子。

0 个答案:

没有答案