Unexpected output while using 'neuralnet' in R

Asked: 2014-02-17 10:57:34

Tags: r neural-network

I am using the neuralnet R package to predict handwritten digits. The MNIST database is used for training and testing the algorithm. Here is the R code I use:

# Importing the data into R
path <- "path_to_data_folder/MNIST_database_of_handwritten_digits/"  # Data can be downloaded from: http://yann.lecun.com/exdb/mnist/
to.read = file(paste0(path, "train-images-idx3-ubyte"), "rb")
to.read_Label = file(paste0(path, "train-labels-idx1-ubyte"), "rb")
magicNumber <- readBin(to.read, integer(), n=1, endian="big")
magicNumber_Label <- readBin(to.read_Label, integer(), n=1, endian="big")
numberOfImages <- readBin(to.read, integer(), n=1, endian="big")
numberOfImages_Label <- readBin(to.read_Label, integer(), n=1, endian="big")
rowPixels <- readBin(to.read, integer(), n=1, endian="big")
columnPixels <- readBin(to.read, integer(), n=1, endian="big")

# image(1:rowPixels, 1:columnPixels, matrix(readBin(to.read, integer(), n=(rowPixels*columnPixels), size=1, endian="big"), rowPixels, columnPixels)[,columnPixels:1], col=gray((0:255)/255))

trainDigits <- NULL
trainDigits <- vector(mode="list", length=numberOfImages)
for(i in 1:numberOfImages)
  trainDigits[[i]] <- as.vector(matrix(readBin(to.read, integer(), n=(rowPixels*columnPixels), size=1, endian="big"), rowPixels, columnPixels)[,columnPixels:1])

trainDigits <- t(data.frame(trainDigits))  # Takes a minute
trainDigits <- data.frame(trainDigits, row.names=NULL)

# i <- 1  # Specify the image number to visualize the image
# image(1:rowPixels, 1:columnPixels, matrix(trainDigits[i,], rowPixels, columnPixels), col=gray((0:255)/255))

trainDigits_Label <- NULL
for(i in 1:numberOfImages_Label)
  trainDigits_Label <- c(trainDigits_Label, readBin(to.read_Label, integer(), n=1, size=1, endian="big"))

# appending the labels to the training data
trainDigits <- cbind(trainDigits, trainDigits_Label)

#################### Modelling ####################

library(neuralnet)
# Considering only 500 rows for training due to time and memory constraints
myNnet <- neuralnet(formula = as.formula(paste0("trainDigits_Label ~ ", paste0("X", 1:(ncol(trainDigits)-1), collapse="+"))),
                    data = trainDigits[1:500,], hidden = 10, algorithm='rprop+', learningrate=0.01)

#################### Test Data ####################

to.read_test = file(paste0(path, "t10k-images-idx3-ubyte"), "rb")
to.read_Label_test = file(paste0(path, "t10k-labels-idx1-ubyte"), "rb")
magicNumber <- readBin(to.read_test, integer(), n=1, endian="big")
magicNumber_Label <- readBin(to.read_Label_test, integer(), n=1, endian="big")
numberOfImages_test <- readBin(to.read_test, integer(), n=1, endian="big")
numberOfImages_Label_test <- readBin(to.read_Label_test, integer(), n=1, endian="big")
rowPixels <- readBin(to.read_test, integer(), n=1, endian="big")
columnPixels <- readBin(to.read_test, integer(), n=1, endian="big")

testDigits <- NULL
testDigits <- vector(mode="list", length=numberOfImages_test)
for(i in 1:numberOfImages_test)
  testDigits[[i]] <- as.vector(matrix(readBin(to.read_test, integer(), n=(rowPixels*columnPixels), size=1, endian="big"), rowPixels, columnPixels)[,columnPixels:1])

testDigits <- t(data.frame(testDigits))  # Takes a minute
testDigits <- data.frame(testDigits, row.names=NULL)

testDigits_Label <- NULL
for(i in 1:numberOfImages_Label_test)
  testDigits_Label <- c(testDigits_Label, readBin(to.read_Label_test, integer(), n=1, size=1, endian="big"))

#################### 'neuralnet' Predictions ####################

predictOut <- compute(myNnet, testDigits)
table(round(predictOut$net.result), testDigits_Label)

#################### Random Forest ####################
# Cross-validating NN results with Random Forest

library(randomForest)
myRF <- randomForest(x=trainDigits[,-ncol(trainDigits)], y=as.factor(trainDigits_Label), ntree=100)

predRF <- predict(myRF, newdata=testDigits)
table(predRF, testDigits_Label)  # Confusion Matrix
sum(diag(table(predRF, testDigits_Label)))/sum(table(predRF, testDigits_Label))  # % of correct predictions

There are 60,000 training images (28 x 28 pixel images), with the digits 0 to 9 distributed (almost) uniformly across the dataset. Unlike the 'Modelling' section above, which uses only 500 images, I trained the myNnet model (28 x 28 = 784 inputs and 10 outputs) on the entire training dataset and then predicted the output for the 10,000 images in the test dataset. (I used only 10 neurons in the hidden layer because of memory constraints.)

The results I get from this prediction are strange: the output looks like a Gaussian distribution, where 4 is predicted most of the time and the predictions fall off (sort of) exponentially from 4 towards 0 or 9. You can see the confusion matrix below (I rounded the outputs because they are not integers):

> table(round(predictOut$net.result), testDigits_Label)
    testDigits_Label
       0   1   2   3   4   5   6   7   8   9
  -2   1   1   4   1   1   3   0   4   1   2
  -1   8  17  12   9   7   8   8  12   7  10
  0   38  50  44  45  35  28  36  40  30  39
  1   77 105  86  80  71  69  68  75  67  77
  2  116 163 126 129 101  97 111 101  99 117
  3  159 205 196 174 142 140 153 159 168 130
  4  216 223 212 183 178 170 177 169 181 196
  5  159 188 150 183 183 157 174 176 172 155
  6  119 111 129 125 143 124 144 147 129 149
  7   59  53  52  60  74  52  51  91  76  77
  8   22  14  18  14  32  36  28  38  35  41
  9    6   5   3   7  15   8   8  16   9  16

I thought there must be something wrong with my approach, so I tried predicting with the randomForest R package. randomForest, however, works fine and gives an accuracy of over 95%. Here is the confusion matrix for the randomForest predictions:

> table(predRF, testDigits_Label)
      testDigits_Label
predRF    0    1    2    3    4    5    6    7    8    9
     0  967    0    6    1    1    7   11    2    5    5
     1    0 1123    0    0    0    1    3    7    0    5
     2    1    2  974    9    3    1    3   25    4    2
     3    0    3    5  963    0   21    0    0    9   10
     4    0    0   12    0  940    1    4    2    7   15
     5    4    0    2   16    0  832    6    0   11    4
     6    6    5    5    0    7   11  929    0    3    2
     7    1    1   14    7    2    2    0  979    4    6
     8    1    1   12    7    5   11    2    1  917   10
     9    0    0    2    7   24    5    0   12   14  950
  • Question 1: Can anyone explain why neuralnet behaves so strangely on this dataset? (BTW, when I checked, neuralnet works fine on the iris dataset.)

    • EDIT: I think I understand the reason for the Gaussian-like distribution of the neuralnet output. With neuralnet there is only a single output node (or is it neuron?) instead of one node per output class (10 classes here). Therefore, while calculating the delta for back-propagation, the algorithm takes the difference between the 'expected output' and the 'calculated output', which, aggregated over all instances, is smallest for the instances whose output is 4 or 5. The weights are therefore adjusted during back-propagation so that this aggregate output error is minimized. This could be the reason for the Gaussian kind of output given by neuralnet; the toy sketch after this list illustrates the effect.
  • Question 2: I also want to know how to correct this behaviour of neuralnet and get predictions on par with the randomForest results.
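
A toy illustration of the point in the EDIT above (my own sketch, not part of the original question): with a single numeric output and a squared-error criterion, a network that cannot separate the classes is pushed towards the constant prediction that minimizes the aggregate error, which for balanced labels 0-9 is the mean label (about 4.5):

# Toy sketch (illustration only): the squared-error-optimal constant prediction for balanced 0-9 labels
labels <- rep(0:9, each = 1000)                               # balanced labels, like MNIST
constant_loss <- sapply(0:9, function(k) sum((labels - k)^2)) # total squared error of predicting k everywhere
names(constant_loss) <- 0:9
constant_loss                                                 # smallest for 4 and 5
mean(labels)                                                  # 4.5, the squared-error-optimal constant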

2 Answers:

Answer 0 (score: 10):

Some preliminary advice: you can load the data more efficiently:

# Read in data.
trainDigits <- replicate(numberOfImages,c(matrix(readBin(to.read, integer(), n=(rowPixels*columnPixels), size=1, endian="big"),rowPixels,columnPixels)[,columnPixels:1]))
trainDigits <- data.frame(t(trainDigits),row.names=NULL)
trainDigits_Label<-replicate(numberOfImages,readBin(to.read_Label, integer(), n=1, size=1, endian="big"))

Your first problem is that you did not specify a multiclass prediction for neuralnet. What you are doing is predicting a single real number from 0 to 9. That is why there is only one output rather than 10 predictions.

If you look at ?neuralnet, there is an example of multiclass prediction; you have to put each class into a separate variable and place those on the left-hand side of the formula. Other packages, such as nnet, automatically detect a factor and do this for you. You can use the class.ind function (from the nnet package) to split a factor into multiple variables:

# appending the one-hot label columns to the training data
library(nnet)  # class.ind() comes from the nnet package
output <- class.ind(trainDigits_Label)
colnames(output)<-paste0('out.',colnames(output))
output.names<-colnames(output)
input.names<-colnames(trainDigits)
trainDigits<-cbind(output,trainDigits)

Now you can paste the formula together:

# Considering only 500 rows
trainsize=500
# neuralnet:::varify.variables (sic) does not pass "data" when calling "terms".
# If it did, you wouldn't have to construct the formula like this.
library(neuralnet)
myNnet <- neuralnet(formula = paste(paste(output.names,collapse='+'),'~',
                              paste(input.names,collapse='+')),
                    data = trainDigits[1:trainsize,],
                    hidden = 10, 
                    algorithm='rprop+', 
                    learningrate=0.01,
                    rep=1)

This correction still does not make the neural network perform well. To see how bad it is, look at its performance on the training data. It should be pretty good, since it has already seen all of this data:

# Accuracy on training data
res<-compute(myNnet,trainDigits[1:trainsize,input.names])
picks<-(0:9)[apply(res$net.result,1,which.max)]
prop.table(table(trainDigits_Label[1:trainsize] == picks))
# FALSE  TRUE 
# 0.376 0.624 

62% accuracy on the training data is terrible. As you would expect, it performs barely better than random on the rest of the data:

# Accuracy on test data
res<-compute(myNnet,trainDigits[(trainsize+1):60000,input.names])
picks<-(0:9)[apply(res$net.result,1,which.max)]
prop.table(table(trainDigits_Label[(trainsize+1):60000] == picks))
# FALSE         TRUE 
# 0.8612268908 0.1387731092 
# 14% accuracy

The random forest does really well with exactly the same data. There is a good reason it has become so popular lately.

trainsize=500
library(randomForest)
myRF <- randomForest(trainDigits_Label~.,
                     data=data.frame(trainDigits_Label=as.factor(trainDigits_Label),
                                     trainDigits[input.names])[1:trainsize,],
                     ntree=100)

# Train
p <- as.numeric(as.character(predict(myRF)))
prop.table(table(trainDigits_Label[1:trainsize]==p))
# Accuracy: 79%    

# Test
p <- as.numeric(as.character(predict(myRF,trainDigits[(trainsize+1):60000,])))
prop.table(table(trainDigits_Label[(trainsize+1):60000]==p))
# Accuracy: 76%

So, for your second question, my counter-question is: why would you expect a neural network to perform like a random forest? They may have some vague structural similarities, but the fitting processes are very different. I suppose you could dig into the nodes of the neural network and compare them with the most important variables in the random forest model. But at this point it is more of a statistics question than a programming one.
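
If you do want to attempt that comparison, here is a rough sketch (my own addition, not from the original answer; it assumes the myNnet and myRF objects fitted above, with the same X1..X784 inputs): sum the absolute input-to-hidden weights per pixel as a crude "importance" for the neural network and correlate it with the random forest's Gini importance.

# Rough sketch: which pixels does each model lean on? (assumes myNnet and myRF from above)
rf_imp <- importance(myRF)[, 1]                    # MeanDecreaseGini, one value per input pixel
nn_w   <- myNnet$weights[[1]][[1]]                 # (1 + n_inputs) x n_hidden weight matrix
nn_imp <- rowSums(abs(nn_w[-1, , drop = FALSE]))   # drop the bias row, sum |weights| per pixel
cor(rf_imp, nn_imp, method = "spearman")           # rank agreement of the two "importance" rankings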

Answer 1 (score: 2):

I want to thank all of the previous authors in this discussion, because it is the most informative source on the use of the neuralnet package available on the web! This discussion was very helpful to me while learning the neuralnet R package.

Regarding Question 2: it is possible to predict the digit labels more accurately with neuralnet by using these tips:

  • Use more neurons. 10 neurons in the hidden layer are not enough; at least 30 should be used.
  • Normalize and center the inputs before training. Read Max Kuhn, "Applied Predictive Modeling", Chapter 3 (a sketch follows this list).
  • The learningrate parameter is only used by the "backprop" algorithm. For the other algorithms (rprop+, sag, slr, ...), use the learningrate.limit and learningrate.factor parameters instead.
  • Use more training data.
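
As a minimal sketch of the normalize-and-center tip (my own illustration; the caret package and the trainX/testX names are assumptions and not part of the full code further below, which does the same thing by hand), caret::preProcess, from the package accompanying Kuhn's book, can center and scale the pixels, with "zv" first dropping the all-zero border pixels so that scaling does not divide by zero:

# Hypothetical sketch: center/scale pixel columns with caret::preProcess
library(caret)
trainX <- trainDigits[1:1000, ]                      # assumed: raw pixel data frames as read above
testX  <- testDigits
pp     <- preProcess(trainX, method = c("zv", "center", "scale"))
trainX <- predict(pp, trainX)                        # transform the training pixels
testX  <- predict(pp, testX)                         # apply the exact same transformation to the test set

Note that "zv" removes constant columns, so the model formula would then be built from colnames(trainX).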

With 30 neurons, the NN gives:

[1] "NN to predict Labels."  
[1] "Confusion matrix for training set:"
        Expected   
Predicted   0   1   2   3   4   5   6   7   8   9
        0  96   0   0   0   0   0   0   0   0   0
        1   1 116   0   0   0   0   0   0   0   0
        2   0   0  99   0   0   0   0   0   0   0
        3   0   0   0  93   0   0   0   0   0   0
        4   0   0   0   0 104   1   0   0   0   0
        5   0   0   0   0   1  91   0   0   0   0
        6   0   0   0   0   0   0  94   0   0   0
        7   0   0   0   0   0   0   0 117   0   0
        8   0   0   0   0   0   0   0   0  87   0
        9   0   0   0   0   0   0   0   0   0 100
[1] "Model accuracy on training set is 99.7%"

[1] "Confusion matrix for test set:"
         Expected 
Predicted   0   1   2   3   4   5   6   7   8   9
        0 337 380 257 160  87  85  67  25  45  30
        1 134 169  97  77  60  64  70  32  41  16
        2 121 179 112 109  59  79  69  31  55  27
        3 119 136 138 114  99 102  96  67  66  55
        4  87 102  91 135 106 102 104  86  87  54
        5  84  75  95 114 114  91 142 104  82  66
        6  48  41  80  98 106 116 144 138 104  92
        7  22  28  55  82 103  78 100 146 104 124
        8  16   9  42  56  80  60  65 123  93 125
        9  12  16  65  65 168 115 101 276 297 420 
[1] "Model accuracy on test set is 17.32%"
     

The test results are far from good, but the confusion matrix has a diagonal shape, which means the model is working in the right direction. The accuracy of this model can be improved by playing with the training-set size and the threshold; I got it to around 30%. But this model is limited, and the best results are obtained with a model that predicts the Label class instead of the Label itself. With such a model I was able to get about 80% accuracy using the neuralnet package.

With 30 neurons and a training size of 1000, this NN gives:

[1] "NN to predict Label Classes." 
[1] "Confusion matrix for training set:"
        Expected 
Predicted   0   1   2   3   4   5   6   7   8   9
        0  95   0   0   0   0   0   0   0   0   1
        1   0 113   0   0   0   0   0   1   0   0
        2   0   0  98   0   0   2   0   1   0   0
        3   1   2   0  93   1   0   0   1   0   0
        4   0   0   0   0 104   0   0   0   0   1
        5   1   1   0   0   0  90   0   1   0   0
        6   0   0   0   0   0   0  93   0   0   0
        7   0   0   0   0   0   0   0 112   0   0
        8   0   0   0   0   0   0   0   0  86   0
        9   0   0   1   0   0   0   1   1   1  98 
[1] "Model accuracy on training set is 98.2%" 
[1] "Confusion matrix for test set:"
        Expected 
Predicted    0    1    2    3    4    5    6    7    8    9
        0  791    0   32   28   11   62   12   25   20   22
        1    1 1050   13    4   10   10    2   13   31   35
        2   24    2  580   59    8   13   39   73   26   24
        3   42   14  105  607   79  112   74   68  106  124
        4   10   12   40   28  495   62   59   20   83   83
        5   39   31   25  126   35  444   71    6   54   22
        6   13    3   45    7   22   15  554    3   18   13
        7    4    4   31   11   37   10    7  732   11   66
        8   21    7   92   79   51   96   50   19  518   21
        9   35   12   69   61  234   68   90   69  107  599 
[1] "Model accuracy on test set is 63.7%"
#################### Importing the data into R ##########
#path <- "path_to_data_folder/MNIST_database_of_handwritten_digits/"  # Data can be downloaded from: http://yann.lecun.com/exdb/mnist/
path <- "../MNIST_DATA/UNZIP/"
to.read = file(paste0(path, "train-images.idx3-ubyte"), "rb")
to.read_Label = file(paste0(path, "train-labels.idx1-ubyte"), "rb")
magicNumber <- readBin(to.read, integer(), n=1, endian="big")
magicNumber_Label <- readBin(to.read_Label, integer(), n=1, endian="big")
numberOfImages <- readBin(to.read, integer(), n=1, endian="big")
numberOfImages_Label <- readBin(to.read_Label, integer(), n=1, endian="big")
rowPixels <- readBin(to.read, integer(), n=1, endian="big")
columnPixels <- readBin(to.read, integer(), n=1, endian="big")

trainDigits <- NULL

#Trick #1: read unsigned data
trainDigits <- replicate(numberOfImages,c(matrix(readBin(to.read, integer(), n=(rowPixels*columnPixels), 
                                                         size=1, endian="big", signed=F),
                                                 rowPixels,columnPixels)[,columnPixels:1]))
trainDigits <- data.frame(t(trainDigits),row.names=NULL)
trainDigits_Label<-replicate(numberOfImages,readBin(to.read_Label, integer(), n=1, size=1, endian="big", signed=F))
close(to.read)
close(to.read_Label)

#################### Test Data ####################

to.read_test = file(paste0(path, "t10k-images.idx3-ubyte"), "rb")
to.read_Label_test = file(paste0(path, "t10k-labels.idx1-ubyte"), "rb")
magicNumber <- readBin(to.read_test, integer(), n=1, endian="big")
magicNumber_Label <- readBin(to.read_Label_test, integer(), n=1, endian="big")
numberOfImages_test <- readBin(to.read_test, integer(), n=1, endian="big")
numberOfImages_Label_test <- readBin(to.read_Label_test, integer(), n=1, endian="big")
rowPixels <- readBin(to.read_test, integer(), n=1, endian="big")
columnPixels <- readBin(to.read_test, integer(), n=1, endian="big")

#read unsigned data 
testDigits <- replicate(numberOfImages_test,c(matrix(readBin(to.read_test, integer(), n=(rowPixels*columnPixels), 
                                                             size=1, endian="big", signed=F),
                                                     rowPixels,columnPixels)[,columnPixels:1]))
testDigits <- data.frame(t(testDigits),row.names=NULL)
testDigits_Label<-replicate(numberOfImages_test,readBin(to.read_Label_test, integer(), n=1, size=1, endian="big", signed=F))
close(to.read_test)
close(to.read_Label_test)

#################### Modelling ####################

library(neuralnet)

#add Label data to training data.frame
trainData <- cbind(trainDigits_Label, trainDigits)
names(trainData)[1] <- "Label"

#Reduce training data for speedup
trainSample <- 1000 #use more then 500 rows to get better model accuracy (slow!)
trainData <- trainData[1:trainSample,]
myThreshold <- trainSample/5000 #use smaller threshold to get better model accuracy (slow!)

#Trick #2: normalize and center pixel data before trainig and testing
normFactor <- max(trainData) #=255
trainData[,-1] <- trainData[,-1]/normFactor #normalize inputs
centerFactor <- mean(as.matrix(trainData[,-1])) # ~0.5, the mean over all pixel columns
trainData[,-1] <- trainData[,-1]- centerFactor #center inputs
testDigits <- testDigits/normFactor - centerFactor

#Trick #3: use more neurons in the hidden layer to rise the model accuracy
nHidden=30

#train model which predicts Labels
myFormula <- as.formula(paste0("Label ~ ", paste0("X",1:(ncol(trainDigits)), collapse="+")))
myNnet <- neuralnet(formula = myFormula, data = trainData, hidden = c(nHidden), 
                    algorithm='rprop+', #learningrate=0.01,
                    learningrate.limit=list(min=c(1e-10), max=c(0.01)), #default values min/max = 1e-10/0.1
                    learningrate.factor=list(minus=c(0.5), plus=c(1.2)), #default values minus/plus = 0.5/1.2
                    err.fct="sse", #Using "sum square errors" function for Error
                    act.fct="tanh",#Using tangent hyperbolicus activation smoothing function 
                    threshold=myThreshold,
                    lifesign="full", lifesign.step=500,
                    stepmax=3e05)

#Trick #4: get rid of negative predictions. consider them to be equal to zero. 
#The same with too big predictions (>9)
myNnet$net.result[[1]][myNnet$net.result[[1]]<0]<-0
myNnet$net.result[[1]][myNnet$net.result[[1]]>9]<-9

#################### 'neuralnet' Predictions ####################

predictOut <- compute(myNnet, testDigits)
predictOut$net.result[predictOut$net.result<0] <- 0
predictOut$net.result[predictOut$net.result>9] <- 9

#################### Result analysis ####################

#Model accuracy on training data
confTrain <- table(Predicted=round(myNnet$net.result[[1]]), Expected=(trainData[,"Label"]))
print("NN to predict Labels.")
print("Confusion matrix for training set:")
print (confTrain)
print(paste0("Model accuracy on training set is ", round(sum(diag(confTrain))/sum(confTrain)*100,4), "%"))

#Model accuracy on test data
confTest <- table(Predicted=round(predictOut$net.result), Expected=testDigits_Label)
print("Confusion matrix for test set:")
print (confTest)
print(paste0("Model accuracy on test set is ", round(sum(diag(confTest))/sum(confTest)*100,4), "%"))



#########################################################################################
#Trick #5: Predict digit Class instead of predicting digit Label
#Replace each Label with a vector of 10 bits "Label classes"
library (nnet)

# appending the Label classes to the training data
output <- class.ind(trainData[,"Label"])
colnames(output)<-paste0('out.',colnames(output))
output.names<-colnames(output)
input.names<-colnames(trainData[,-1])
trainData <-cbind(output,trainData)

#train model which predicts Label classes
myFormula <- as.formula(paste0(paste0(output.names,collapse='+')," ~ ", 
                               paste0(input.names, collapse="+")))
myNnetClass <- neuralnet(formula = myFormula, data = trainData, hidden = c(nHidden), 
                  algorithm='sag', #learningrate=0.01,
                  learningrate.limit=list(min=c(1e-10), max=c(0.01)), #default values min/max = 1e-10/0.1
                  learningrate.factor=list(minus=c(0.5), plus=c(1.2)), #default values minus/plus = 0.5/1.2
                  err.fct="sse", #Using "sum square errors" function for Error
                  act.fct="tanh",#Using tangent hyperbolicus activation smoothing function 
                  threshold=myThreshold, 
                  lifesign="full", lifesign.step=500,
                  stepmax=3e05)


# Convert  binary output to categorical output (labels)
nnres=myNnetClass$net.result[[1]]
myNnetClass$net.result[[1]] <- (0:9)[apply(myNnetClass$net.result[[1]],1,which.max)]


#################### 'neuralnet' Predictions ####################

predictOutClass <- compute(myNnetClass, testDigits)
colnames(predictOutClass$net.result) <- paste0("Cl", 0:9)
predictedLabel <- (0:9)[apply(predictOutClass$net.result, 1, which.max)]

#################### Result analysis ####################

#Model accuracy on training data
confTrain <- table(Predicted=myNnetClass$net.result[[1]], Expected=trainData[,"Label"])
print("NN to predict Label Classes.")
print("Confusion matrix for training set:")
print (confTrain)
print(paste0("Model accuracy on training set is ", round(sum(diag(confTrain))/sum(confTrain)*100,4), "%"))

#Model accuracy on test data
confTest <- table(Predicted=predictedLabel, Expected=testDigits_Label)
print("Confusion matrix for test set:")
print (confTest)
print(paste0("Model accuracy on test set is ", round(sum(diag(confTest))/sum(confTest)*100,4), "%"))