Saving and loading a model in R
How to save my trained Random Forest model and apply it to test data files one by one?
我已经问过这个问题并阅读了上面提到的帖子。现在我在这里发布我的尝试和最少数据,试图解决我的问题。
示例数据的链接在这里:
https://drive.google.com/drive/folders/11GttkeW2A1peRIE2DmiB9WJxTp8wPlEo?usp=sharing
在这个文件夹中,有 16 个控制训练数据文件,子文件夹“testset”包含完全不同的新测试数据,我想将我训练的随机森林 (RF) 分类器应用于这些数据。
我想根据少数感兴趣的药物治疗选择一些文件(不同生物样本上的数百个文件)作为训练集来构建模型(RF 分类器),保存它,并在以下情况下将其用于新数据我需要它。
代码:
library(data.table)
library(caret)
library(caTools)
library(randomForest)
library(e1071)
library(FNN)
library(tidyverse)
#####read data files in the dir and random sample a percentage of rows in each file
txt_files_ls = list.files(pattern=c("*.txt") )
txt_files_df <- lapply(txt_files_ls, fread)
sampled_txt_files_df <- lapply(txt_files_df,function(x){
x[sample(1:nrow(x), ceiling(nrow(x) * 0.4)), 1:131]
})
combined_df <- rbindlist(sampled_txt_files_df)
fwrite(combined_df,"Sampled40percent.csv",row.names = FALSE)
##### I closed off R after the first piece to save memory space
#read in the merged file adn tidy it up for classifier buiding
#The idea later is to use "treatmentsum" as class labels to differentiate different treatment conditions
#Pid_treatmentsum contains finer details in each individual patient cell responses
df.raw <-fread("Sampled40percent.csv", header =TRUE,data.table = FALSE) %>%
select(c(18:131)) %>%
select(-c(3:4)) %>%
mutate(treatmentsum = factor(treatmentsum)) %>%
mutate(Pid_treatmentsum=factor(Pid_treatmentsum))->df
str(df)
###################################feature cleaning
#find nearZerovarance features
nzv <- nearZeroVar(df[-c(1,2)], saveMetrics= TRUE)
nzv.flagged <- nzv[nzv$nzv=="TRUE",]
#remove nzv features
df %>%
select(-row.names(nzv.flagged)) -> df.sanitized
#identify correlated features
df.Cor <- cor(df.sanitized[-c(1:2)])
highlyCor <- findCorrelation(df.Cor, cutoff = .95,verbose = TRUE)
corF <- names(df[highlyCor])
df.sanitized<- df.sanitized[ ,-highlyCor]
#define control groups for kmeans based on my own interest
unique(df.sanitized$treatmentsum)
predead <- subset(df.sanitized,df.sanitized$treatmentsum== "3S_DMSO_00_predead")
live <- subset(df.sanitized,df.sanitized$treatmentsum== "3S_DMSO_00_live")
lena <- subset(df.sanitized,df.sanitized$treatmentsum== "3S_Lenalidomide_1uM_none")
STS <- subset(df.sanitized,df.sanitized$treatmentsum== "3S_STSVEN_100nM+10nM_none")
#bind to list
mylist <- list(predead,live,lena,STS)
# use kmeans to subsample data points by over-clustering
Kmeans.list <- map(.x = mylist,
.f = ~kmeans(scale(.x[,-c(1:2)]),
centers =100,
nstart=25,
iter.max = 100)) %>%
purrr::set_names(c("predead", "live", "Lena", "STS"))
#bind all the centers into a list for mapping
Kmeans_centers <- map(Kmeans.list, ~.x$centers)
#use map2 to select 10 cells that are close to the centers in each cluster
y <- map2(.x = mylist,.y=Kmeans_centers,
.f=~get.knnx(scale(.x[,-c(1:2)]),.y, 10)) %>%
purrr::set_names(c("predead", "live", "Lena", "STS"))
#get the nn.index and select each datapoint out to form a lits
#bind all the centers into a list for mapping
y.nnindex <- map(y, ~.x$nn.index)%>%
purrr::set_names(c("predead", "live", "Lena", "STS"))
#subset by knn index
cl.list <- list()
for (i in 1:4) {
idx <- sort(y.nnindex[[i]])
cl.list[[i]] <- as.data.frame(cbind(idx, mylist[[i]][idx,]))
}
df.sampled.by.KNN <- do.call("rbind", cl.list)
write.csv(df.sampled.by.KNN, "4k representative samples each from 100 clusters .csv", row.names = F)
###### from here is the model training
df.sampled.by.KNN <- read.csv("70k representative samples each from 1000 clusters each phenotype by Kmeans then KNN.csv", header = T)
#going to train random forest
#splt datainto training and test
set.seed(123)
split <- sample.split(df.sampled.by.KNN$treatmentsum, SplitRatio = 0.8)
rftraining_set <- subset(df.sampled.by.KNN, split==TRUE)
rftest_set <- subset(df.sampled.by.KNN, split==FALSE)
#get feature importance
unique(as.factor(rftraining_set[,2]))
control <- trainControl(method="cv", number=10,verboseIter = TRUE, search = 'grid')
mtry <- sqrt(ncol(training_set))
tunegrid <- expand.grid(.mtry = mtry)
model <- train(x=scale(rftraining_set[,-c(1:3)]), y=as.factor(rftraining_set[,2]), data=rftraining_set, method = "rf", trControl = control,metric= "Accuracy", maximize = TRUE ,importance = TRUE, type="classification", ntree =500,tuneGrid=tunegrid,na.action = na.omit)
#upto here, the model works just fine
print(model)
prediction2 <- predict(model, rftest_set)
cm<-confusionMatrix(prediction2, as.factor(rftest_set[,2]), positive = "1")
cm <- as.table(cm)
cm<-as.data.frame(cm)
write.csv(cm, "Confusion matrix.csv", row.names = TRUE)
从这里开始,我认为我的模型经过训练并且工作正常,我应该将其保存/复制到测试集数据所在的位置,然后加载它,然后只需调用预测函数即可。
#problem starts here
saveRDS(model, "myrf.RData")
getwd()
setwd("C:/Users/mli/Desktop/Stack project/testset")
myrf <-load("myrf.RData")
我在加载模型时出错
Error in load("myrf.RData") :
bad restore file magic number (file may be corrupted) -- no data loaded
In addition: Warning messages:
1: In readChar(con, 5L, useBytes = TRUE) :
truncating string with embedded nuls
2: file ‘myrf.RData’ has magic number 'X'
Use of save versions prior to 2 is deprecated > )