Question

有问题：

状态具有分类变量A，B，C和D，并使用cbind使用逻辑回归模型。请确认这种方法是否正确？
运行混淆矩阵时出现以下错误（错误：数据和引用应该是具有相同级别的因子）。

R-代码： -

loan <- read.csv("C:/Users/sao/Downloads/banking_data/Banking_Data/loan.txt", sep=';')
trans <- read.csv("C:/Users/sao/Downloads/banking_data/Banking_Data/trans.txt", sep=';') 
trans <- subset(trans, select = c(account_id,balance,k_symbol))

loanaccount <- merge(trans, loan, by="account_id")
loanaccount <- subset(loanaccount,select = -c(loan_id))

##checking missing value
is.na(loanaccount)
which(is.na(loanaccount))

##duplicated values
unique(loanaccount)
distinct(loanaccount) 

## create training and test data
install.packages("DMwR")
library(DMwR)
str(loanaccount)

##data split
datasplit <- sample(nrow(loanaccount), round(nrow(loanaccount)*0.8))
trainigdata <- loanaccount[datasplit,]
testdata <- loanaccount[-datasplit,]
unique(trainigdata)


## loan amount distribution and box plot
library(ggplot2)

give_count <-  stat_summary(fun.data = function(x) return(c(y = median(x)*1.06,                                             label = length(x))),
               geom = "text")

give_mean <- 
  stat_summary(fun.y = mean, colour = "darkgreen", geom = "point", 
               shape = 18, size = 3, show.legend = FALSE)

ggplot(trainigdata, aes(x=k_symbol, y=amount))+ +
  geom_boxplot(outlier.colour="black", outlier.shape=16,outlier.size=2, notch=FALSE) +
  give_count +
  give_mean +
  scale_y_continuous(labels = comma) +
  labs(title="Loan Amount by status", x = "loan purpose", y = "Loan Amount \n")



## summary on training dataset
summary(trainigdata)
summary(trainigdata$status)
summary(trainigdata$k_symbol)

## t-test result
install.packages("graphics")
library(graphics)
install.packages("pwr")
library(pwr)
install.packages("nparcomp")
library(nparcomp)
t.test(trainigdata$amount, testdata$amount)

t.test(trainigdata$amount, loanaccount$amount)

## making tree model from train data
install.packages("tree")
library(tree)
train.loan <- tree(status~.-duration-date-payments-account_id, testdata)
plot(train.loan)
text(train.loan, pretty=0)
summary(train.loan)

## tree data prediction
treeloanprediction <- predict(train.loan,trainigdata, type = "class")


##logistic regression

lmloan <- glm(cbind(account_id,status)~.-payments,family="binomial", trainigdata)

summary(lmloan)$coeff
plot(lmloan)

##predict

predictlm <- predict(lmloan,newdata = testdata, type="response")
predictlm
## confufusion matrix sensitivy, secifity
library(heuristica)
library(caret)
library(ROCR)
library(stringi)
model_glm <- predict.glm(lmloan, testdata, type = "response", na.action = na.pass)
model_predict <- function(pred, t) ifelse (pred>t, TRUE, FALSE)
testdata <- testdata[complete.cases(testdata),]
caret::confusionMatrix(model_predict(model_glm, 0.5), reference = testdata, positive="TRUE")


## test set area under the curve
library(ROCR)

rocrpred <- prediction(model_glm, trainigdata$status)

pred <- prediction(predicttestdata,testdata$status)

as.numeric(performance(pred, "auc")@y.values)

关于矩阵误差和逻辑回归的困惑

0 个答案: