我正在使用party包在R中构建CART回归树模型,但是当我尝试将模型应用于测试数据集时,我收到错误消息,指出级别不匹配。
过去一周我一直在阅读论坛上的主题,但仍无法找到解决问题的正确方法。所以我在这里使用我编写的假例子重新发布这个问题。有人可以帮助解释错误信息并提供解决方案吗?
我的训练数据集有大约1000条记录,测试数据集大约有150条。两个数据集中都没有NA或空白字段。
我使用party包中的ctree构建的CART模型是:
# Fit a regression tree for Rate on the training data with party::ctree.
# The assignment arrow must be written `<-` without a space: `< -` is parsed
# as the comparison `mytree < (-ctree(...))` and nothing gets assigned.
mytree <- ctree(Rate ~ Bank + Product + Salary, data = data_train)
data_train示例:
# Sample input: a leading 1, a run of zeros, then the values to be matched.
x <- c(1, rep(0, 3), 1, -1, -1, 0, 1)
# Show the matched index pair for the sample input.
matchVals(x)
# Finds the first non-zero element of x and the first later element holding
# the opposite value.
#
# x: numeric vector.
# Returns an integer vector c(start.index, match.index) of positions in x.
# Either entry is NA when there is no non-zero element or no opposite value.
matchVals <- function(x) {
  current.index <- getStart(x)
  if (is.na(current.index)) {
    # nothing but zeros (or empty input): there is no pair to report
    return(c(NA_integer_, NA_integer_))
  }
  # next VALUE to search for is the negation of the starting value
  next.val <- -x[current.index]
  remaining <- x[current.index:length(x)]
  # getNextVal() indexes into the truncated vector, whose element 1 is
  # x[current.index]; hence the offset is current.index - 1
  next.index <- getNextVal(remaining, next.val) + current.index - 1L
  # R's return() takes a single value, so the pair is combined with c()
  c(current.index, next.index)
}
# Gets the index of the first value that isn't a 0.
#
# x: numeric vector.
# Returns the 1-based position of the first non-zero element, or NA when
# every element is zero or x is empty.
getStart <- function(x) {
  # A return() inside an lapply() closure only leaves the closure, so the
  # search is done with which(); indexing [1] yields NA when nothing matches.
  which(x != 0)[1]
}
# Gets the FIRST index of the value specified.
#
# x: numeric vector to search (callers may pass a truncated tail of a longer
#    vector; the caller must then add the offset of the truncated portion).
# v: value to look for.
# Returns the 1-based position of the first element equal to v, or NA when
# v does not occur in x.
getNextVal <- function(x, v) {
  # A return() inside an lapply() closure only leaves the closure, so the
  # search is done with which(); indexing [1] yields NA when nothing matches.
  which(x == v)[1]
}
data_test示例:
Rate Bank Product Salary
1.5 A aaa 100000
0.6 B abc 60000
3 C bac 10000
2.1 D cba 50000
1.1 E cca 80000
我尝试使用以下代码设置为相同级别:
Rate Bank Product Salary
2.0 A cba 80000
0.5 D cca 250000
0.8 E cba 120000
2.1 C abc 65000
levels(data_train$Bank) : A, B, C, D, E
levels(data_test$Bank): A,D,E,C
但是,当我尝试在测试数据集上运行预测时,我收到以下错误:
>is.factor(data_test$Bank)
TRUE
(Made sure Bank and Products are factors in both datasets)
>levels(data_test$Bank) <-union(levels(data_test$Bank), levels(data_train$Bank))
> levels(data_test$product)<-union(levels(data_test$product),levels(data_train$product))
我也尝试了以下方法,但它改变了我的测试数据集的字段......:
levels(data_test$Bank) <- levels(data_train$Bank)
data_test表被更改:
> fit1<- predict(mytree,newdata=data_test)
Error in checkData(oldData, RET) :
Levels in factors of new data do not match original data
答案 0 :(得分:1)
您可以尝试使用一致的水平重新构建因子,而不是为现有因子指定新的水平。这是一个例子:
# start the party
library(party)

# create training data sample
data_train <- data.frame(
  Rate = c(1.5, 0.6, 3, 2.1, 1.1),
  Bank = c("A", "B", "C", "D", "E"),
  Product = c("aaa", "abc", "bac", "cba", "cca"),
  Salary = c(100000, 60000, 10000, 50000, 80000)
)

# create testing data sample
data_test <- data.frame(
  Rate = c(2.0, 0.5, 0.8, 2.1),
  Bank = c("A", "D", "E", "C"),
  Product = c("cba", "cca", "cba", "abc"),
  Salary = c(80000, 250000, 120000, 65000)
)

# Candidate levels of a column, whether it is stored as factor or character.
# Since R 4.0.0 data.frame() keeps strings as character, for which levels()
# is NULL; rebuilding a factor with NULL levels would turn every value into NA.
column_levels <- function(x) {
  if (is.factor(x)) levels(x) else sort(unique(as.character(x)))
}

# get the union of levels between train and test for Bank and Product
bank_levels <- union(
  column_levels(data_test$Bank),
  column_levels(data_train$Bank)
)
product_levels <- union(
  column_levels(data_test$Product),
  column_levels(data_train$Product)
)

# rebuild Bank with the union of levels (same levels, same order in both sets)
data_test$Bank <- factor(data_test$Bank, levels = bank_levels)
data_train$Bank <- factor(data_train$Bank, levels = bank_levels)

# rebuild Product with the union of levels
data_test$Product <- factor(data_test$Product, levels = product_levels)
data_train$Product <- factor(data_train$Product, levels = product_levels)

# fit the model
mytree <- ctree(Rate ~ Bank + Product + Salary, data = data_train)

# generate predictions
fit1 <- predict(mytree, newdata = data_test)
> fit1
Rate
[1,] 1.66
[2,] 1.66
[3,] 1.66
[4,] 1.66
答案 1 :(得分:0)
我正在使用ctree的示例,但这基本上是聪明地使用因子,因此可以在严格依赖因子水平的任何算法(RandomForest等)中使用
关键在于了解R如何存储和使用因子水平。只要我们使用与训练数据相同的因子水平(并且顺序相同)(即使不与测试数据合并也可以),就可以使用预先训练的ctree模型进行预测。
实际上,使用ctree(party包)进行预测时,不需要把训练数据和测试数据合并在一起。这是因为在生产环境中运行预训练模型时,可能没有那么多的内存和处理器资源。预先训练的模型使我们不必在生产环境中基于大量训练数据重新构建模型。
第1步:在构建模型时,您可以把训练数据中每一列的因子水平存储下来(只要适用)
# Training phase: turn every character column of dtrain into an ordered
# factor with sorted levels and save those levels to
# "<working dir>/Output CSVs/<column>_levels.csv" for the prediction phase.
levels_dir <- file.path(getwd(), "Output CSVs")
# write.csv() does not create missing directories
dir.create(levels_dir, showWarnings = FALSE, recursive = TRUE)

var_list <- colnames(dtrain)
for (var in var_list) {
  # is.character() stays a single TRUE/FALSE even for multi-class columns
  if (is.character(dtrain[[var]])) {
    print(var)
    # Fill blanks with "None" to keep the factor levels consistent.
    # %in% never yields NA, so NA entries do not break the assignment.
    dtrain[dtrain[[var]] %in% "", var] <- "None"
    # Make sure the column levels are sorted (sort() also drops NA)
    col_name_levels <- sort(unique(dtrain[[var]]), decreasing = FALSE)
    # Make as factors
    dtrain[[var]] <- factor(
      dtrain[[var]],
      levels = col_name_levels,
      ordered = TRUE
    )
    print(levels(dtrain[[var]]))
    # This is the trick: store the exact levels in a CSV, which is much
    # easier to load than the whole train data later in the prediction phase
    write.csv(
      levels(dtrain[[var]]),
      file.path(levels_dir, paste0(var, "_levels.csv")),
      row.names = FALSE
    )
  }
}
# Also store the column names and data types for detecting later.
# The table is built in one step instead of growing it with rbind() behind an
# exists() check, so re-running the script cannot append to a stale table.
# Multi-class columns are recorded space-separated, e.g. "ordered factor".
col_name_type_list <- data.frame(
  col_name = colnames(dtrain),
  class_colname = vapply(
    dtrain,
    function(column) paste(class(column), collapse = " "),
    character(1),
    USE.NAMES = FALSE
  ),
  stringsAsFactors = FALSE
)
# Store for checking later
write.csv(col_name_type_list, filepath, row.names = FALSE)
然后在预测阶段(在生产环境中),只需为测试数据的每一列读取这些已保存的水平,丢弃含有训练数据中未出现的新水平的行(ctree本来就无法对它们进行预测),然后用剩余的行进行预测。
###############Now in test prediction ###########################
# Prediction phase: coerce every column of dtest to the type recorded at
# training time and rebuild factor columns with the exact stored levels.
# Read the column list of train data (stored earlier)
col_name_type_list_dtrain <- read.csv(
  filepath,
  header = TRUE,
  stringsAsFactors = FALSE
)
for (i in seq_len(nrow(col_name_type_list_dtrain))) {
  col_name <- col_name_type_list_dtrain$col_name[i]
  class_colname <- col_name_type_list_dtrain$class_colname[i]
  # Columns recorded at training time (e.g. the response) may be absent here
  if (!col_name %in% colnames(dtest)) {
    next
  }
  if (class_colname == "numeric") {
    column <- dtest[[col_name]]
    # as.numeric() on a factor returns the integer codes, not the values
    if (is.factor(column)) {
      column <- as.character(column)
    }
    dtest[[col_name]] <- as.numeric(column)
  }
  if (class_colname == "ordered factor") {
    # Now use the column factor levels from train. The file is keyed by the
    # current column name, not by a variable left over from the training loop.
    col_name_levels <- read.csv(
      file.path(getwd(), "Output CSVs", paste0(col_name, "_levels.csv")),
      header = TRUE,
      stringsAsFactors = FALSE
    )
    factor_check_flag <- TRUE
    col_name_levels <- as.character(col_name_levels$x)
    print(col_name)
    print('Pre-Existing levels detected')
    print(NROW(col_name_levels))
    values <- as.character(dtest[[col_name]])
    # Training replaced blanks with "None"; mirror that so blank rows map to
    # the same level instead of being dropped as unseen.
    if ("None" %in% col_name_levels) {
      values[values %in% ""] <- "None"
    }
    # Drop new rows which are not in train; the model cant predict for them
    rows_before_dropping <- nrow(dtest)
    print('Adjusting levels to train......')
    keep <- values %in% col_name_levels
    # drop = FALSE keeps a data frame even if dtest has a single column
    dtest <- dtest[keep, , drop = FALSE]
    rows_after_dropping <- nrow(dtest)
    cat('\nDropped Rows for adjusting ',col_name,': ',(rows_before_dropping - rows_after_dropping),'\n')
    # Convert to factors
    dtest[[col_name]] <- factor(
      values[keep],
      levels = col_name_levels,
      ordered = TRUE
    )
    print(dtest[[col_name]])
  }
}