使用石灰对Keras模型的特征/变量重要性

时间:2018-08-26 22:32:00

标签: r machine-learning keras lime

我有一个具有多类型输入数据的两类分类Keras模型,其中我根据1个连续输入和3个分类输入数据预测A和B类。在下面的虚拟示例中,continuous1,categorical1和categorical2是一维张量,categorical3是长度为num_index = 20的形状(样本,索引)的2D张量,并且是一维编码的。然后我想使用例如Lime来分析哪些数据输入对预测的贡献超过其他数据。我正在关注this教程,但是由于我有多种类型的输入数据,因此遇到了一些困难。

enter image description here

set.seed(666)

#Dummy data
dat <- data.frame(samples=c(rep(paste0("A_",1:9800)),rep(paste0("B_",9801:10000))))
dat$label <- c(rep("A",9800),rep("B",200))
dat$continuous1 <- c(rnorm(9800, 100, 45),rnorm(200, 0, 25))
dat$continuous1[dat$continuous1<0] <- 0
dat$categorical1 <- c(rep(1:100,98),rep(1:100,2))
dat$categorical2 <- c(rep(1:98,100),rep(99:100,100))

pool_A <- factor(1:15,levels=1:20)
pool_B <- factor(16:20,levels=1:20)
dat_categorical3 <- vector("list",10000)
names(dat_categorical3) <- c(as.character(dat[dat$label=="A",]$samples),as.character(dat[dat$label=="B",]$samples))
for(i in 1:9800){
  dat_categorical3[[i]] <- (as.numeric(table(sample(pool_A, 20, replace=T))) > 0) + 0L
}
for(i in 9801:10000){
  dat_categorical3[[i]] <- (as.numeric(table(sample(pool_B, 20, replace=T))) > 0) + 0L
}
dat_categorical3_tensor <- do.call(rbind,dat_categorical3)


## Split data for training-validating-testing

# As we have much fewer Bs than As, each partition has to have a more or less equal number of Bs(?)
training_Bs <- sample(x=as.character(dat[dat$label=="B",]$samples), size=67, replace=F)
validating_Bs <- sample(x=setdiff(as.character(dat[dat$label=="B",]$samples),training_Bs), size=67, replace=F)
testing_Bs <- setdiff(as.character(dat[dat$label=="B",]$samples),c(training_Bs,validating_Bs))

# We also parition the data containing As equally, though this could probably be done in e.g., 80-10-10
training_As <- sample(x=as.character(dat[dat$label=="A",]$samples), size=3267, replace=F)
validating_As <- sample(x=setdiff(as.character(dat[dat$label=="A",]$samples),training_As), size=3267, replace=F)
testing_As <- setdiff(as.character(dat[dat$label=="A",]$samples),c(training_As,validating_As))

# Put together
training_dat <- dat[dat$samples %in% c(training_As,training_Bs),]
training_categorical3_tensor <- dat_categorical3_tensor[rownames(dat_categorical3_tensor) %in% c(training_As,training_Bs),]
#training_categorical3_tensor <- as.matrix(data.frame(dat_categorical3_tensor[rownames(dat_categorical3_tensor) %in% c(training_As,training_Bs),]))
training_labels <- ifelse(training_dat$label=="A",0,1)
training_dat2 <- as.matrix(training_dat[,c(3:5)]) #use this

validating_dat <- dat[dat$samples %in% c(validating_As,validating_Bs),]
validating_categorical3_tensor <- dat_categorical3_tensor[rownames(dat_categorical3_tensor) %in% c(validating_As,validating_Bs),]
#validating_categorical3_tensor <- as.matrix(data.frame(dat_categorical3_tensor[rownames(dat_categorical3_tensor) %in% c(validating_As,validating_Bs),]))
validating_labels <- ifelse(validating_dat$label=="A",0,1) 
validating_dat2 <- as.matrix(validating_dat[,c(3:5)]) #use this

testing_dat <- dat[dat$samples %in% c(testing_As,testing_Bs),]
testing_categorical3_tensor <- dat_categorical3_tensor[rownames(dat_categorical3_tensor) %in% c(testing_As,testing_Bs),]
#testing_categorical3_tensor <- as.matrix(data.frame(dat_categorical3_tensor[rownames(dat_categorical3_tensor) %in% c(testing_As,testing_Bs),]))
testing_labels <- ifelse(testing_dat$label=="A",0,1)
testing_dat2 <- as.matrix(testing_dat[,c(3:5)]) #use this


## Keras model
library(keras)    

# Input layers
all_dat_input <- layer_input(shape = 3, dtype = 'float32', name = 'all_dat_input')
categorical3_indices_input <- layer_input(shape = 20, dtype = 'float32', name = 'categorical3_input')
input_tensor <- c(all_dat_input, categorical3_indices_input)

# Output layers
all_dat_out <- all_dat_input %>%
  layer_dense(units = 128, activation = 'relu') %>%
  layer_dropout(rate = 0.5) %>%
  layer_dense(units = 128, activation = 'relu') %>%
  layer_dropout(rate = 0.5)

categorical3_indices_out <- categorical3_indices_input %>%
  layer_dense(units = 128, activation = 'relu') %>%
  layer_dropout(rate = 0.5) %>%
  layer_dense(units = 128, activation = 'relu') %>%
  layer_dropout(rate = 0.5)

output_tensor <- layer_concatenate(c(all_dat_out, categorical3_indices_out)) %>%
  layer_dense(units = 128, activation = 'relu') %>%
  layer_dropout(rate = 0.5) %>%
  layer_dense(units = 128, activation = 'relu') %>%
  layer_dropout(rate = 0.5) %>%
  layer_dense(units=1, activation="sigmoid")

model <- keras_model(inputs=input_tensor, outputs=output_tensor)

# Compile
model %>% compile(
  optimizer = "rmsprop",
  loss = "binary_crossentropy",
  metrics = "accuracy"
)

# Fit
history <- model %>% fit(
  x=list(training_dat2,training_categorical3_tensor),
  y=training_labels,
  batch_size=256,
  epochs=20,
  validation_data=list(list(validating_dat2,validating_categorical3_tensor),validating_labels)
)

# Compare with put aside testing data
results <- model %>% evaluate(list(testing_dat2,testing_categorical3_tensor),testing_labels)

results
$loss
[2] 1.984114e-07
$acc
[2] 1

# Using the trained network to generate predictions on new data
predictions <- model %>% predict(list(testing_dat2,testing_categorical3_tensor))

head(predictions[3267:3332])
[2] 0.9999994 0.9999999 1.0000000 0.9999999 1.0000000 0.9999832    
# the network correctly identifies Bs as Bs with a confidence above >99%

由于我从lime::explain()收到的错误消息似乎表明某些不匹配/缺少变量名/标签,因此我创建并尝试了不同的输入(均产生相同的错误);

input_test1 <- data.frame(training_dat2,training_categorical3_tensor)
#input_test2 <- data.frame(all_dat_input=training_dat2,categorical3_indices_input=training_categorical3_tensor)
#input_test3 <- data.frame(all_dat_input=training_dat2, categorical3_input=training_categorical3_tensor)

## Feature/variable importance using Lime  
library(lime)

explainer <- lime(input_test1, model, bin_continuous = F) #try different input_test1/input_test2/input_test3

explanation <- explain(input_test1, explainer=explainer, n_labels=2, n_features = 4) ##try different input_test1/input_test2/input_test3

Error in py_call_impl(callable, dots$args, dots$keywords) : 
      ValueError: No data provided for "all_dat_input". Need data for each key in: ['all_dat_input', 'categorical3_input'] 

所有三个input_test都导致上述错误。我还尝试确保实际数据,特别是categorical3_tensor列名与input_test的名称(即X1, X2, X3等)相匹配,但这也无济于事。

training_categorical3_tensor <- as.matrix(data.frame(dat_categorical3_tensor[rownames(dat_categorical3_tensor) %in% c(training_As,training_Bs),]))
validating_categorical3_tensor <- as.matrix(data.frame(dat_categorical3_tensor[rownames(dat_categorical3_tensor) %in% c(validating_As,validating_Bs),]))
testing_categorical3_tensor <- as.matrix(data.frame(dat_categorical3_tensor[rownames(dat_categorical3_tensor) %in% c(testing_As,testing_Bs),]))

任何建议/帮助都将受到高度赞赏!

0 个答案:

没有答案