我正在使用 LendingClub 公开的 2012–2015 年已批准贷款数据:https://www.lendingclub.com/info/download-data.action
它看起来是这样的:
library(C50)
library(dplyr)
library(ggplot2)
# Input was preprocessed in a terminal with
# 'tail -n +2 LoanStats3c.csv > approved.csv'
setwd("~/Dropbox/dataprojects/Lending Club/")
approval <- read.csv(file = "approved.csv")
approved_2014 <- read.csv(file = "approved_2014.csv")
approved_2013 <- read.csv(file = "approved_2013.csv")

# Stack the three extracts into one big data frame
approved <- do.call(rbind, list(approval, approved_2014, approved_2013))

# Keep only individual applications taken out for debt consolidation
approved <- approved %>%
  filter(application_type == "INDIVIDUAL") %>%
  filter(purpose == "debt_consolidation")

# Columns retained for modelling; loan_status (the class) is listed first
keep <- c(
  "loan_status", "addr_state", "annual_inc", "delinq_2yrs", "dti",
  "grade", "sub_grade", "home_ownership", "emp_length", "loan_amnt",
  "installment", "int_rate", "open_acc", "zip_code", "inq_last_6mths",
  "verification_status", "pub_rec", "term", "revol_bal", "revol_util"
)
approved <- approved[, keep]
# Shuffle the rows reproducibly.
# The permutation length comes from the data itself: a hard-coded count that
# is larger than nrow(approved) produces all-NA rows for the out-of-range
# indices, and a smaller one silently drops rows.
set.seed(12345)
approved <- approved[order(runif(nrow(approved))), ]
# Drop unused factor levels -- necessary for C5.0
approved <- droplevels(approved)
# Row count after filtering and shuffling; used later for the train/test split
num_examples <- nrow(approved)
# Prepare the class variable: collapse the raw loan statuses into
# "0" (bad), "1" (good) and "missing" (empty string); any other status is
# left untouched.
bad_statuses <- c("Charged Off", "Default", "Late (31-120 days)")
good_statuses <- c(
  "Current", "Fully Paid", "Issued", "In Grace Period", "Late (16-30 days)"
)
approved$loan_status <- as.character(approved$loan_status)
approved$loan_status[approved$loan_status %in% bad_statuses] <- "0"
approved$loan_status[approved$loan_status %in% good_statuses] <- "1"
approved$loan_status[approved$loan_status %in% ""] <- "missing"
# Back to a factor so it can serve as the classification target
approved$loan_status <- factor(approved$loan_status)
summary(approved$loan_status)
# Strip the unit suffixes so these columns become numeric and can be
# normalized
approved$int_rate <- as.numeric(
  sub("%", "", approved$int_rate, fixed = TRUE)
)
approved$revol_util <- as.numeric(
  sub("%", "", approved$revol_util, fixed = TRUE)
)
approved$term <- as.numeric(
  sub("months", "", approved$term, fixed = TRUE)
)

# Normalize numeric columns.
# scale() returns a one-column matrix carrying "scaled:center" and
# "scaled:scale" attributes; as.numeric() turns it back into a plain vector
# so the data frame keeps ordinary numeric columns.
ind <- vapply(approved, is.numeric, logical(1))
approved[ind] <- lapply(approved[ind], function(x) as.numeric(scale(x)))

# Rename the empty-string level to "missing" in every factor column.
# levels() of a data frame is NULL, so calling it on the whole data frame
# never touches the columns; the rename has to be done column by column.
for (col in names(approved)[vapply(approved, is.factor, logical(1))]) {
  levels(approved[[col]])[levels(approved[[col]]) == ""] <- "missing"
}
# Create a 70/30 train/test split over the row order of `approved`.
# The test set starts on the row right after the last training row, so the
# two sets never share a row. (floor() and ceiling() of the same product
# coincide whenever num_examples * 0.7 is a whole number, which would put
# that row in both sets.)
split_pt1 <- floor(num_examples * 0.7)
split_pt2 <- split_pt1 + 1
train_rows <- seq_len(split_pt1)
# seq(..., length.out = 0) is empty, so this is safe even with no test rows
test_rows <- seq(from = split_pt2, length.out = num_examples - split_pt1)
approved_train <- approved[train_rows, ]
approved_test <- approved[test_rows, ]
train_class <- approved_train$loan_status
test_class <- approved_test$loan_status
# Remove the class from the training/testing data by name rather than by
# position, so the split does not depend on the column order
predictors <- setdiff(names(approved), "loan_status")
approved_train <- approved_train[, predictors, drop = FALSE]
approved_test <- approved_test[, predictors, drop = FALSE]
# Fit a C5.0 model on the training predictors, requesting 10 boosting trials,
# and print its summary
m <- C5.0(x = approved_train, y = train_class, trials = 10)
summary(m)
# Predict on the held-out rows and summarize the predictions
p <- predict(object = m, newdata = approved_test)
summary(p)
我是 R 和 C5.0 的新手,所以我可能忽略了一些非常明显的东西。我不知道这里哪里出了问题。感谢您提供的任何见解。
Call:
C5.0.default(x = approved_train, y = train_class, trials = 10)
C5.0 [Release 2.07 GPL Edition] Tue Nov 24 16:38:41 2015
-------------------------------
Class specified by attribute `outcome'
Read 128985 cases (20 attributes) from undefined.data
----- Trial 0: -----
Decision tree:
1 (128985/8250)
----- Trial 1: -----
Decision tree:
1 (128985/36371.2)
*** boosting reduced to 1 trial since last classifier is very inaccurate
*** boosting abandoned (too few classifiers)
Evaluation on training data (128985 cases):
Decision Tree
----------------
Size Errors
1 8250( 6.4%) <<
(a) (b) <-classified as
----- -----
8250 (a): class 0
120735 (b): class 1
答案 0 :(得分:0)
我很确定原因是没有任何预测因子足以创建决策分支。请记住,您数据集中的贷款已经过 Lending Club 的筛选:如果其中任何一笔贷款看起来可能违约,Lending Club 就不会发放它。
以下是 `class` 与 `x1`、`x2` 之间没有关系的示例。正如所料,`C5.0` 返回的决策树大小为 1。
library(C50)
# Three independent uniform columns; the label is derived from y only, and y
# is excluded from the predictors below, leaving x1 and x2
data <- data.frame(
  y = runif(50),
  x1 = runif(50),
  x2 = runif(50)
)
class <- factor(ifelse(data$y < 0.5, "yes", "no"))
# Fit on every column except the first (y)
tree <- C5.0(x = data[, -1], y = class)
tree
## Tree size: 1
以下是 `class` 与 `x1` 之间存在关系的示例。正如所料,`C5.0` 返回的决策树大小为 2。
# This time the label is derived from x1, which is one of the predictors
class <- factor(ifelse(data$x1 < 0.5, "yes", "no"))
# Fit on every column except the first (y)
tree <- C5.0(x = data[, -1], y = class)
tree
## Tree size: 2