关于矩阵误差和逻辑回归的困惑

时间:2018-06-03 17:45:55

标签: logistic-regression confusion-matrix

有问题:

  1. 状态具有分类变量A,B,C和D,并使用cbind使用逻辑回归模型。请确认这种方法是否正确?

  2. 运行混淆矩阵时出现以下错误(错误:数据和引用应该是具有相同级别的因子)。

  3. R-代码: -

    loan <- read.csv("C:/Users/sao/Downloads/banking_data/Banking_Data/loan.txt", sep=';')
    trans <- read.csv("C:/Users/sao/Downloads/banking_data/Banking_Data/trans.txt", sep=';') 
    trans <- subset(trans, select = c(account_id,balance,k_symbol))
    
    loanaccount <- merge(trans, loan, by="account_id")
    loanaccount <- subset(loanaccount,select = -c(loan_id))
    
    ##checking missing value
    is.na(loanaccount)
    which(is.na(loanaccount))
    
    ##duplicated values
    unique(loanaccount)
    distinct(loanaccount) 
    
    ## create training and test data
    install.packages("DMwR")
    library(DMwR)
    str(loanaccount)
    
    ##data split
    datasplit <- sample(nrow(loanaccount), round(nrow(loanaccount)*0.8))
    trainigdata <- loanaccount[datasplit,]
    testdata <- loanaccount[-datasplit,]
    unique(trainigdata)
    
    
    ## loan amount distribution and box plot
    library(ggplot2)
    
    give_count <-  stat_summary(fun.data = function(x) return(c(y = median(x)*1.06,                                             label = length(x))),
                   geom = "text")
    
    give_mean <- 
      stat_summary(fun.y = mean, colour = "darkgreen", geom = "point", 
                   shape = 18, size = 3, show.legend = FALSE)
    
    ggplot(trainigdata, aes(x=k_symbol, y=amount))+ +
      geom_boxplot(outlier.colour="black", outlier.shape=16,outlier.size=2, notch=FALSE) +
      give_count +
      give_mean +
      scale_y_continuous(labels = comma) +
      labs(title="Loan Amount by status", x = "loan purpose", y = "Loan Amount \n")
    
    
    
    ## summary on training dataset
    summary(trainigdata)
    summary(trainigdata$status)
    summary(trainigdata$k_symbol)
    
    ## t-test result
    install.packages("graphics")
    library(graphics)
    install.packages("pwr")
    library(pwr)
    install.packages("nparcomp")
    library(nparcomp)
    t.test(trainigdata$amount, testdata$amount)
    
    t.test(trainigdata$amount, loanaccount$amount)
    
    ## making tree model from train data
    install.packages("tree")
    library(tree)
    train.loan <- tree(status~.-duration-date-payments-account_id, testdata)
    plot(train.loan)
    text(train.loan, pretty=0)
    summary(train.loan)
    
    ## tree data prediction
    treeloanprediction <- predict(train.loan,trainigdata, type = "class")
    
    
    ##logistic regression
    
    lmloan <- glm(cbind(account_id,status)~.-payments,family="binomial", trainigdata)
    
    summary(lmloan)$coeff
    plot(lmloan)
    
    ##predict
    
    predictlm <- predict(lmloan,newdata = testdata, type="response")
    predictlm
    ## confufusion matrix sensitivy, secifity
    library(heuristica)
    library(caret)
    library(ROCR)
    library(stringi)
    model_glm <- predict.glm(lmloan, testdata, type = "response", na.action = na.pass)
    model_predict <- function(pred, t) ifelse (pred>t, TRUE, FALSE)
    testdata <- testdata[complete.cases(testdata),]
    caret::confusionMatrix(model_predict(model_glm, 0.5), reference = testdata, positive="TRUE")
    
    
    ## test set area under the curve
    library(ROCR)
    
    rocrpred <- prediction(model_glm, trainigdata$status)
    
    pred <- prediction(predicttestdata,testdata$status)
    
    as.numeric(performance(pred, "auc")@y.values)
    

0 个答案:

没有答案