也许我只是不理解这个概念,但是我认为在对缺少数据的数据进行训练时,屏蔽层应该可以提高模型的性能。
作为学习不规则时间序列的实验,我构建了一个数据集(无需深入研究数据的具体构造方式,但可以查看生成的 Xtrain、Ytrain、Xtest、Ytest):
# some parameters to adjust data generation process
seqSize <- 5                 # number of observed time steps per sample
Nfeature <- 2                # number of signal features (sin and cos)
N <- 50000                   # total number of samples
pTrain <- .8                 # fraction of samples used for training
missValueMarker <- -1000000  # sentinel value written into "missing" cells
p.miss.samples <- 0.5        # fraction of samples that receive missing values
p.miss.rate <- 0.0001        # rate of the dexp over missing-cell counts (near uniform at this rate)
minValues <- 3               # minimum number of cells left observed per sample
epochs <- 20                 # number of training epochs
# Generate one toy sequence for a single random frequency `xInit`.
#
# Draws `seqSize` irregularly spaced time points from [t0, tn], evaluates
# sin(xInit * t) and cos(xInit * t) on them, and appends one target point
# a regular step past tn. Returns a 3 x (seqSize + 1) matrix:
#   row 1: sin values, last column = target y1
#   row 2: cos values, last column = target y2
#   row 3: time deltas between consecutive points, last column = NaN filler
createSeq <- function(xInit, t0 = 0, tn = 2 * pi, seqSize = 5) {
  step <- (tn - t0) / seqSize
  grid <- seq(from = t0, to = tn, length.out = seqSize * 200)
  tseq <- sort(sample(grid, seqSize))
  tTarget <- tn + step
  rbind(
    c(sin(xInit * tseq), sin(xInit * tTarget)),
    c(cos(xInit * tseq), cos(xInit * tTarget)),
    c(diff(c(tseq, tTarget)), NaN)
  )
}
dx <- seqSize  # NOTE(review): appears unused below — possibly leftover
trainIndex <- sample(1:N,round(pTrain*N))  # row indices of the training split
xInits <- rnorm(N)  # one random frequency per sample
xInitTrain <- xInits[trainIndex]
xInitTest <- xInits[-trainIndex]
# Each createSeq() result (3 x (seqSize+1)) is flattened column-major by
# sapply; t() gives one sample per row, and the dim<- below undoes that
# flattening so D[i, j, k] is row j, column k of sample i's matrix.
D <- t(sapply(xInits,function(x)createSeq(x,seqSize=seqSize)))
dim(D) <- c(N,Nfeature+1,seqSize+1)
# insert some missing values
#
# For a random fraction of the samples, overwrite a few feature cells with
# `missValueMarker`. Only the feature rows (1:Nfeature) over the observed
# time steps (1:seqSize) are eligible; the time-delta row and the target
# column are never touched.
nValues <- seqSize * Nfeature  # maskable cells per sample (prod() was redundant on a scalar)
# Distribution over how many cells to knock out; with rate = 1e-4 the
# exponential density is nearly uniform over 1:nValues.
p.miss.seq <- dexp(1:nValues, p.miss.rate) / sum(dexp(1:nValues, p.miss.rate))
p.miss.seq
missIdx <- sample(1:N, round(p.miss.samples * N))
for (i in missIdx) {
  # Cap the count so that at least `minValues` cells stay observed.
  nMiss <- min(sample(1:nValues, size = 1, prob = p.miss.seq), nValues - minValues)
  if (nMiss < 1) {
    next
  }
  # BUG FIX: the original drew (feature, step) pairs independently WITH
  # replacement, so duplicate draws made the true number of masked cells
  # smaller than nMiss. Draw nMiss DISTINCT flat positions instead.
  pos <- sample(seq_len(nValues), nMiss)
  rows <- (pos - 1) %% Nfeature + 1   # feature index (1:Nfeature)
  cols <- (pos - 1) %/% Nfeature + 1  # time-step index (1:seqSize)
  D[cbind(i, rows, cols)] <- missValueMarker
}
# get some overview on the missing data
# Count marker cells per sample; na.rm guards against the NaN filler cell.
naCnt <- apply(D,1,function(x)sum(x==missValueMarker,na.rm = TRUE))
table(naCnt)
# divide into feature and labels and train and test part
# X keeps all three rows (sin, cos, time deltas) over the observed steps;
# Y holds the two target values (sin, cos) at the extra final step.
X <- D[,1:3,1:seqSize]
Y <- D[,1:2,seqSize+1]
Xtrain <- X[trainIndex,,]
Ytrain <- Y[trainIndex,]
Xtest <- X[-trainIndex,,]
Ytest <- Y[-trainIndex,]
如上所述,不需要深入细节,但通过检查数据集可以看出,数据的输入是 X(3×5 矩阵),输出是两个标签。数据中包含缺失值,由指定的标记值(上例中为 -1000000)标识。现在我建立两个除遮罩层外完全相同的模型,一个带遮罩层,一个不带,如下:
# build model with and without masking layer
library(keras)  # library() errors loudly if keras is absent; require() only warns

# Baseline: stacked LSTMs, no masking.
m1 <- keras_model_sequential()
m1 %>%
  layer_lstm(units = 50, return_sequences = TRUE, stateful = FALSE,
             input_shape = dim(X)[-1]) %>%
  layer_lstm(units = 50, return_sequences = FALSE, stateful = FALSE) %>%
  layer_dense(units = 2)
# "accuracy" is meaningless for a regression target, so only mse is tracked.
m1 %>% compile(loss = 'mse', optimizer = 'adam')

# Same architecture preceded by a masking layer.
#
# NOTE(review): layer_masking skips a TIMESTEP only when ALL features at
# that step equal mask_value. The data above marks individual cells, and
# the third input row (time deltas) never equals the marker, so no
# timestep is ever fully masked — which would explain why m1 and m2
# perform identically. To make masking effective, mark whole timesteps
# (every row of a column) as missing.
m2 <- keras_model_sequential()
m2 %>%
  layer_masking(input_shape = dim(X)[-1], mask_value = missValueMarker) %>%
  layer_lstm(units = 50, return_sequences = TRUE, stateful = FALSE) %>%
  layer_lstm(units = 50, return_sequences = FALSE, stateful = FALSE) %>%
  layer_dense(units = 2)
m2 %>% compile(loss = 'mse', optimizer = 'adam')
学习数据时,我得到的结果大致相同:
cat('Training\n')
testLoss1 <- numeric(epochs)  # per-epoch test MSE, model without masking
testLoss2 <- numeric(epochs)  # per-epoch test MSE, model with masking
# Preallocate the history lists instead of growing them inside the loop.
hist1 <- vector("list", epochs)
hist2 <- vector("list", epochs)
batch_size <- 100
# One fit() call per epoch so the test loss can be tracked after every epoch.
for (i in seq_len(epochs)) {
  hist1[[i]] <- m1 %>% fit(Xtrain, Ytrain, batch_size = batch_size,
                           epochs = 1, verbose = 1, shuffle = FALSE)
  hist2[[i]] <- m2 %>% fit(Xtrain, Ytrain, batch_size = batch_size,
                           epochs = 1, verbose = 1, shuffle = FALSE)
  # Both models are stateful = FALSE, so reset_states() is a no-op here;
  # kept only in case the models are ever made stateful.
  m1 %>% reset_states()
  m2 %>% reset_states()
  predX1 <- m1 %>% predict(Xtest, batch_size = batch_size)
  predX2 <- m2 %>% predict(Xtest, batch_size = batch_size)
  testLoss1[i] <- mean((Ytest - predX1)^2)
  testLoss2[i] <- mean((Ytest - predX2)^2)
  #batch_size <- batch_size +5
}
# Compare train (black) vs test (red) loss curves for both models.
dev.new()  # portable; windows() fails on non-Windows platforms
oldPar <- par(mfrow = c(2, 1))
# Each history covers a single epoch, so its loss vector has length 1;
# vapply enforces that, unlike sapply.
trainLosses1 <- vapply(hist1, function(h) h$metrics$loss, numeric(1))
trainLosses2 <- vapply(hist2, function(h) h$metrics$loss, numeric(1))
allLosses <- c(trainLosses1, testLoss1, trainLosses2, testLoss2)
plot(trainLosses1, ylim = range(allLosses), type = "b", main = "w.o. mask layer")
points(testLoss1, type = "b", col = "red")
plot(trainLosses2, ylim = range(allLosses), type = "b", main = "w mask layer")
points(testLoss2, type = "b", col = "red")
par(oldPar)  # restore previous plotting parameters
这种行为是否在预期之内?我的代码有问题吗?还是说遮罩层在这里根本不起作用?应该如何解释?关于这个话题的任何想法都会对我有帮助。