使用H2O R包中的h2o.anomaly函数计算重建MSE

时间:2019-03-04 01:23:45

标签: r h2o autoencoder anomaly-detection

我正在尝试使用自动编码器进行异常检测。我使用H2O R包中的h2o.anomaly函数为示例数据生成了重建MSE。此外,我还尝试根据以下文档链接中的MSE公式自行手动计算: http://docs.h2o.ai/h2o/latest-stable/h2o-docs/performance-and-prediction.html#mse-mean-squared-error

我用来构建模型的包含三个特征和5行的训练数据如下:

head(train_dat)

  Feature1  Feature2 Feature3
1    68.18 0.1806535 3.871201
2    71.51 0.3987761 2.484907
3    67.77 0.4285304 3.332205
4    69.58 0.1823216 2.890372
5    70.98 0.4134333 1.791759

我用于预测的包含三个特征和5行的测试数据如下:

head(test_dat)

  Feature1  Feature2 Feature3
1 68.33000 0.4350239 2.708050
2 73.98000 0.5550339 3.044522
3 67.11000 0.7323679 2.639057
4 69.90395 0.9999787 4.499810
5 71.28867 0.4882539 3.091042

经过训练和预测后,重建后的特征如下:

head(mod.out)

  reconstr_Feature1 reconstr_Feature2 reconstr_Feature3
1          69.66297         0.4239244          2.346250
2          69.88329         0.3963843          2.381598
3          69.46544         0.4610502          2.233164
4          68.96117         0.4229165          2.676295
5          69.63208         0.3895452          2.530025

当我使用h2o.anomaly函数进行MSE计算时,收到的MSE输出如下:

head(mse.list)

  Reconstruction.MSE
1         0.05310159
2         0.57037600
3         0.54427385
4         2.08407248
5         0.14251951

但是,当我尝试通过应用下面的函数来计算MSE时,我获得了不同的MSE输出:

# Hand-computed MSE: for every row, average the squared differences between
# the raw test features and their reconstructed counterparts.
mod.anon.validate <- apply(
  X = (test_dat - mod.out)^2,
  MARGIN = 1,
  FUN = mean
)
mse.list.validate <- as.data.frame(mod.anon.validate)
head(mse.list.validate)

  mod.anon.validate
1         0.6359438
2         5.7492281
3         1.9288268
4         1.5156829
5         1.0229217

我想知道我的手动MSE计算哪里出了问题?所谓的“重建MSE”与一般的MSE有何不同?完整的R脚本如下:

### H2O autoencoder test run ###

# Read the training and test samples (whitespace-delimited, with a header row).
train_dat <- read.table("sample.train.dat", header = TRUE)
test_dat <- read.table("sample.test.dat", header = TRUE)

# Start H2O on port 54321.
library(h2o)
localH2O <- h2o.init(port = 54321)

# Training and deep learning ----

# The model is trained on the first three columns of the training data.
feature_names <- names(train_dat[1:3])

# Upload both data frames to H2O under fixed frame ids.
unmod.hex <- as.h2o(train_dat, destination_frame = "train.hex")
mod.hex <- as.h2o(test_dat, destination_frame = "test.hex")

# Tanh autoencoder with hidden layers of 3, 2 and 3 units, 50 epochs.
unmod.dl <- h2o.deeplearning(
  x = feature_names,
  training_frame = unmod.hex,
  autoencoder = TRUE,
  reproducible = TRUE,
  hidden = c(3, 2, 3),
  epochs = 50,
  activation = "Tanh"
)

# Output result ----

# Reconstructed features for the test frame.
# NOTE(review): `response` is an unquoted symbol that this script never
# defines; confirm whether the `type` argument is needed at all.
mod.out <- as.data.frame(h2o.predict(unmod.dl, mod.hex, type = response))

# Per-row reconstruction MSE as reported by H2O (one value per row because
# per_feature = FALSE).
mod.anon <- h2o.anomaly(unmod.dl, mod.hex, per_feature = FALSE)
mse.list <- as.data.frame(mod.anon)

# Hand-computed per-row MSE on the raw feature values, for comparison with
# mse.list.
mod.anon.validate <- apply((test_dat - mod.out)^2, 1, mean)
mse.list.validate <- as.data.frame(mod.anon.validate)

感谢您的帮助。

2 个答案:

答案 0 :(得分:2)

计算结果不匹配,是因为MSE是在归一化空间中计算的。如果您在h2o.deeplearning()中设置参数standardize=FALSE,两者就会一致:

# Retrain the autoencoder with standardization switched off, so that the MSE
# reported by h2o.anomaly() can be compared with one computed on raw values.
unmod.dl <- h2o.deeplearning(
  x = feature_names,
  standardize = FALSE,
  training_frame = unmod.hex,
  autoencoder = TRUE,
  reproducible = TRUE,
  hidden = c(3, 2, 3),
  epochs = 50,
  activation = "Tanh"
)

# Reconstructed features for the test frame.
# NOTE(review): `response` is an unquoted symbol not defined in this snippet;
# confirm whether the `type` argument is needed at all.
mod.out <- as.data.frame(h2o.predict(unmod.dl, mod.hex, type = response))

# Per-row reconstruction MSE reported by H2O.
mod.anon <- h2o.anomaly(unmod.dl, mod.hex, per_feature = FALSE)
mse.list <- as.data.frame(mod.anon)
mse.list

> mse.list
  Reconstruction.MSE
1           1512.740
2           1777.491
3           1458.438
4           1587.593
5           1648.999

> mod.anon.validate <- apply((test_dat - mod.out)^2, 1, mean)
> mse.list.validate <- as.data.frame(mod.anon.validate)
> mse.list.validate
  mod.anon.validate
1          1512.740
2          1777.491
3          1458.438
4          1587.593
5          1648.999

答案 1 :(得分:2)

Here's an example of how to normalize:

# Reproduce the per-row reconstruction MSE from h2o.anomaly() by hand: rescale
# the test data and its reconstruction with statistics taken from the training
# data, then average the squared differences per row.

# Load test and training data.
# NOTE(review): `sample.test` and `sample.train` are not created in this
# script; they must already exist in the session as data frames.
test_dat <- sample.test
train_dat <- sample.train

# Start H2O
library(h2o)
localH2O <- h2o.init(port = 54321, strict_version_check = FALSE)

# Training and deep learning
feature_names <- names(train_dat[1:3])
unmod.hex <- as.h2o(train_dat, destination_frame = "train.hex")
mod.hex <- as.h2o(test_dat, destination_frame = "test.hex")
unmod.dl <- h2o.deeplearning(
  x = feature_names,
  training_frame = unmod.hex,
  autoencoder = TRUE,
  reproducible = TRUE,
  hidden = c(3, 2, 3),
  epochs = 50,
  activation = "Tanh"
)

# Anomaly detection: per-row reconstruction MSE as reported by H2O.
mod.anon <- h2o.anomaly(unmod.dl, mod.hex, per_feature = FALSE)
mse.list <- as.data.frame(mod.anon)

# Manual MSE
# The former `type = response` argument is dropped: `response` is an unquoted
# symbol that is never defined in this script.
mod.out <- as.data.frame(h2o.predict(unmod.dl, mod.hex))

# Scale output
# Only the columns the model was trained on take part in the scaling, so extra
# or non-numeric columns in train_dat / test_dat cannot misalign the
# statistics with the reconstructed columns.
train_mat <- as.matrix(train_dat[feature_names])
test_mat <- as.matrix(test_dat[feature_names])
recon_mat <- as.matrix(mod.out)
stopifnot(ncol(recon_mat) == length(feature_names))

# Per-feature range (max - min) and mean of the training data.
# NOTE(review): presumably this mirrors the normalization H2O applies
# internally for autoencoders -- verify against the H2O documentation.
s <- apply(train_mat, 2, max) - apply(train_mat, 2, min)
m <- apply(train_mat, 2, mean)

# Column-wise (x - mean) / range for the original and reconstructed features.
original_scaled <- sweep(sweep(test_mat, 2, m, "-"), 2, s, "/")
recreate_scaled <- sweep(sweep(recon_mat, 2, m, "-"), 2, s, "/")

# Per-row mean squared error in the scaled space.
mod.anon.validate <- rowMeans((original_scaled - recreate_scaled)^2)
mse.list.validate <- as.data.frame(mod.anon.validate)

# Compare outputs
print(mse.list)
print(mse.list.validate)