Hi, I'm working on interpreting a model with LIME in R. When I run this part, everything works fine.
# Library
library(tm)
library(SnowballC)
library(caTools)
library(RWeka)
library(caret)
library(text2vec)
library(lime)
# Importing the dataset
dataset_original = read.delim('Restaurant_Reviews.tsv', quote = '', stringsAsFactors = FALSE)
dataset_original$Liked = as.factor(dataset_original$Liked)
# Splitting the dataset into the Training set and Test set
set.seed(123)
split = sample.split(dataset_original$Liked, SplitRatio = 0.8)
training_set = subset(dataset_original, split == TRUE)
test_set = subset(dataset_original, split == FALSE)
#Create & clean corpus
#clean corpus function
clean_text <- function(text) {
  corpus = VCorpus(VectorSource(text))
  corpus = tm_map(corpus, content_transformer(tolower))
  corpus = tm_map(corpus, removeNumbers)
  corpus = tm_map(corpus, removePunctuation)
  corpus = tm_map(corpus, removeWords, stopwords())
  corpus = tm_map(corpus, stemDocument)
  corpus = tm_map(corpus, stripWhitespace)
  return(corpus)
}
#ngram function
BigramTokenizer <- function(x){NGramTokenizer(x, Weka_control(min=1,max=2))}
#create dtm
dtm <- function(text){
  corpus = VCorpus(VectorSource(text))
  dtm = DocumentTermMatrix(corpus, control = list(weighting = weightTfIdf, tokenize = BigramTokenizer))
  dataset = as.data.frame(as.matrix(dtm))
  dataset = dataset[, order(names(dataset))]
  return(dataset)
}
#cleaning train & test text
for (i in seq(nrow(training_set))) {
  training_set$clean_text[i] = as.character(clean_text(training_set$Review)[[i]])
  print(i)
}
for (i in seq(nrow(test_set))) {
  test_set$clean_text[i] = as.character(clean_text(test_set$Review)[[i]])
  print(i)
}
#Create document term matrix
dataset_train <- dtm(training_set$clean_text)
dataset_test <- dtm(test_set$clean_text)
#Drop new words in test set & ensure same number of columns as train set
test_colname <- colnames(dataset_test)[colnames(dataset_test) %in% colnames(dataset_train)]
test_colname <- test_colname[!is.na(test_colname)] #Remove NA
new_test_colname <- colnames(dataset_train)[!(colnames(dataset_train) %in% test_colname)] #Columns in train not in test
dataset_test <- dataset_test[,test_colname]
dataset_test[new_test_colname] <- 0
dataset_test = dataset_test[,order(names(dataset_test))]
dataset_train = as.matrix(dataset_train)
dataset_test = as.matrix(dataset_test)
#xgboost caret model
set.seed(123)
model <- train(dataset_train, training_set$Liked, method="xgbTree")
predict(model, newdata=dataset_test)
However, when I run this part:
######
#LIME#
######
explainer <- lime(training_set$Review, model, preprocess = dtm)
explanation <- explain(training_set$Review[1], explainer, n_labels = 1, n_features = 5)
plot_features(explanation)
it says:
Error in predict.xgb.Booster(modelFit, newdata) :
Feature names stored in `object` and `newdata` are different!
Before running this, I made sure my training and test data have the same column names and the same number of columns. I also looked around and found that my problem is similar to this post, but I still can't make sense of the answer there: R: LIME returns error on different feature numbers when it's not the case
I have spent weeks working on this and searching online to no avail, so any help or guidance on what I should do would be greatly appreciated!
My data:
数据集:https://drive.google.com/file/d/1-pzY7IQVyB_GmT5dT0yRx3hYzOFGrZSr/view?usp=sharing
Answer 0 (score: 2)
I ran into the same problem when updating the xgboost package from v0.6.xxx to v0.7.xxx.
I solved it by making sure not only that the training and test sets have the same column names, but also that the columns are in the same order.
Hope this works for you.
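For illustration, here is a minimal sketch of that alignment, assuming dataset_train and dataset_test are the document-term matrices built in the question (the helper name align_to_train is mine, not part of the original post):
# Pad missing terms with zero columns and reorder the test matrix
# so it has exactly the training columns, in the training order.
align_to_train <- function(train_m, test_m) {
  aligned <- matrix(0, nrow = nrow(test_m), ncol = ncol(train_m),
                    dimnames = list(rownames(test_m), colnames(train_m)))
  common <- intersect(colnames(train_m), colnames(test_m))
  aligned[, common] <- test_m[, common]
  aligned
}
dataset_test <- align_to_train(dataset_train, dataset_test)
# sanity check: names and order now match exactly
stopifnot(identical(colnames(dataset_train), colnames(dataset_test)))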
Answer 1 (score: 0)
Here is the code that worked for me on the same problem. There is a small issue in your clean_text and dtm functions: you need to pass the corpus to DocumentTermMatrix, not the raw text, so I merged the two functions into one.
# assumes the libraries and the Restaurant_Reviews.tsv import from the question above
dataset_original$Liked = as.factor(dataset_original$Liked)
# Splitting the dataset into the Training set and Test set
set.seed(123)
split = sample.split(dataset_original$Liked, SplitRatio = 0.8)
training_set = subset(dataset_original, split == TRUE)
test_set = subset(dataset_original, split == FALSE)
#ngram function
BigramTokenizer <- function(x){NGramTokenizer(x, Weka_control(min=1,max=2))}
#create dtm
dtm <- function(text){
  corpus = VCorpus(VectorSource(text))
  corpus = tm_map(corpus, content_transformer(tolower))
  corpus = tm_map(corpus, removeNumbers)
  corpus = tm_map(corpus, removePunctuation)
  corpus = tm_map(corpus, removeWords, stopwords())
  corpus = tm_map(corpus, stemDocument)
  corpus = tm_map(corpus, stripWhitespace)
  # pass the corpus (not the raw text) to DocumentTermMatrix
  dtm = DocumentTermMatrix(corpus, control = list(weighting = weightTfIdf,
                                                  tokenize = BigramTokenizer))
  return(as.matrix(dtm))
}
#Create document term matrix
dataset_train <- dtm(training_set$Review)
dataset_test <- dtm(test_set$Review)
# same columns and same order for both data sets
matrix_columns_same <- function(a, b) {
  # a and b: two matrices; returns b aligned to a's columns
  intersect_cols12 <- intersect(colnames(a), colnames(b))
  result_matrix <- matrix(0, nrow = nrow(b), ncol = ncol(a))
  rownames(result_matrix) <- rownames(b)
  colnames(result_matrix) <- colnames(a)
  result_matrix[, intersect_cols12] <- b[, intersect_cols12]
  return(result_matrix)
}
dataset_test <- matrix_columns_same(dataset_train,dataset_test)
# from xgboost package
param <- list(max_depth = 3,
              eta = 0.1,
              objective = "binary:logistic",
              eval_metric = "error",
              nthread = 1)
model <- xgboost::xgb.train(
  param,
  xgb.DMatrix(dataset_train, label = as.numeric(training_set$Liked) - 1),
  nrounds = 50
)
predictions <- predict(model, dataset_test)
# text to explain
text_to_explain <- test_set$Review[1:4]
explainer <- lime(text_to_explain, model, preprocess = dtm)
explanation <- explain(text_to_explain, explainer, n_labels = 1, n_features = 3)
plot_features(explanation)
See also the similar discussion in R Lime package for text data.
# This is the code using your data. It works for me; let me know if you hit the error again.
library(tm)
library(SnowballC)  # for stemDocument
library(RWeka)      # for NGramTokenizer
library(caTools)    # for sample.split
library(lime)
library(xgboost)
# read data
dataset_original = read.delim('./data/Restaurant_Reviews.tsv', quote = '',
                              stringsAsFactors = FALSE)
dataset_original$Liked = as.factor(dataset_original$Liked)
# remove docs with fewer than `nwords` words
nwords <- 5
docs_split <- lapply(strsplit(dataset_original$Review, " "),
                     function(x){ x[!x == ""] })  # docs to list of tokens
ind_len <- unlist(lapply(docs_split, function(d) length(d)))
ind_len <- which(ind_len > nwords)
dataset_original <- dataset_original[ind_len,]
groups <- levels(dataset_original$Liked)
# Splitting the dataset into the Training set and Test set
set.seed(123)
split = sample.split(dataset_original$Liked, SplitRatio = 0.8)
training_set = subset(dataset_original, split == TRUE)
test_set = subset(dataset_original, split == FALSE)
########################################
#ngram function
BigramTokenizer <- function(x){NGramTokenizer(x, Weka_control(min=1,max=2))}
#create dtm
dtm <- function(text){
  corpus = VCorpus(VectorSource(text))
  corpus = tm_map(corpus, content_transformer(tolower))
  corpus = tm_map(corpus, removeNumbers)
  corpus = tm_map(corpus, removePunctuation)
  corpus = tm_map(corpus, removeWords, stopwords())
  corpus = tm_map(corpus, stemDocument)
  corpus = tm_map(corpus, stripWhitespace)
  dtm = DocumentTermMatrix(corpus, control = list(weighting = weightTf,
                                                  tokenize = BigramTokenizer))
  dtm = removeSparseTerms(dtm, 0.99)
  dtm <- as.matrix(dtm)
  dtm <- as.data.frame(dtm)
  return(dtm)
}
#Create document term matrix
dataset_train <- dtm(training_set$Review)
dataset_test <- dtm(test_set$Review)
colnames(dataset_train) <- gsub(" ","_",colnames(dataset_train))
colnames(dataset_test) <- gsub(" ","_",colnames(dataset_test))
########################################
matrix_columns_same <- function(a, b) {
  # a and b: two matrices; returns b aligned to a's columns
  intersect_cols12 <- intersect(colnames(a), colnames(b))
  result_matrix <- matrix(0, nrow = nrow(b), ncol = ncol(a))
  rownames(result_matrix) <- rownames(b)
  colnames(result_matrix) <- colnames(a)
  result_matrix[, intersect_cols12] <- b[, intersect_cols12]
  return(result_matrix)
}
dataset_train <- as.matrix(dataset_train)
dataset_test <- as.matrix(dataset_test)
dataset_test <- matrix_columns_same(dataset_train,dataset_test)
# filter docs; make sure documents have at least one word
nword <- 0
ind <- which(rowSums(dataset_train)>nword)
dataset_train <- dataset_train[ind,]
training_set <- training_set[ind,]
ind <- which(rowSums(dataset_test)>nword)
dataset_test <- dataset_test[ind,]
test_set <- test_set[ind,]
########################################
# using xgboost package
param <- list(max_depth = 3,
              eta = 0.1,
              objective = "binary:logistic",
              eval_metric = "error",
              nthread = 1)
model <- xgboost::xgb.train(
  param,
  xgb.DMatrix(as.matrix(dataset_train), label = as.numeric(training_set$Liked) - 1),
  nrounds = 50
)
predictions <- predict(model, as.matrix(dataset_test)) > 0.5
test_labels <- test_set$Liked==groups[2]
# Accuracy
caret::confusionMatrix(table(predictions,test_labels))
########################################
# lime
ind_tr <- sample(1:nrow(test_set),4,replace = F)
text_to_explain <- test_set$Review[ind_tr]
explainer <- lime(text_to_explain, model, preprocess = dtm,
                  bin_continuous = T, n_bins = 4, n_permutations = 5000)
explanation <- lime::explain(text_to_explain, explainer, n_labels = 1, n_features = 3)
plot_features(explanation, ncol=2)
Answer 2 (score: 0)
I had the same problem when predicting from an xgboost model.
In my case, I had applied a sparse.model.matrix transformation before training.
# sparse.model.matrix comes from the Matrix package; rstac and rV are my data
varX = c('l8','l21','v8','v21','fa','fb')
f1 = as.formula(paste0('rV','~',paste(varX, collapse='+')))
sparse_matrix = sparse.model.matrix(f1, data = rstac)
mod = xgboost(data=sparse_matrix, label=rV, ...)
I got the error in:
y=predict(mod,newdata=as.matrix(rstac[1:10,varX]))
Error in predict.xgb.Booster(mod, newdata = as.matrix(rstac[1:10, varX])) :
  Feature names stored in `object` and `newdata` are different!
I can see the features used in the model in mod[[8]]:
mod[[8]]
[1] "(Intercept)" "l8"          "l21"         "v8"
[5] "v21"         "fa"          "fb"
(Intercept) is missing. Applying sparse.model.matrix before predicting, just as before training, makes it work:
y=predict(mod,newdata=sparse.model.matrix(~.,rstac[1:10,varX]))
y
[1] 0.3290127 0.3290127 0.6757481 0.6667279 0.6668081 0.6668081 0.3290127 0.2944945 0.2944945 0.2944945
Answer 3 (score: 0)
I had exactly the same problem. For me, the solution was to make sure that lime::lime includes only the predictor columns and NO RESPONSE COLUMN, and the same for the lime::explain function.
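As a minimal sketch of that idea (the names train_df and Liked are illustrative, assuming a data frame of predictors plus a response column, not taken from the original post):
# Drop the response column before handing the data to lime::lime,
# and explain rows drawn from the same predictor-only frame.
train_x <- train_df[, setdiff(colnames(train_df), "Liked"), drop = FALSE]
explainer <- lime::lime(train_x, model)
explanation <- lime::explain(train_x[1:2, , drop = FALSE], explainer,
                             n_labels = 1, n_features = 5)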
Answer 4 (score: 0)
Two things I ran into using the latest xgboost version, 1.1.1.1.
Also see the GitHub source link for more information: https://github.com/dmlc/xgboost/pull/5940