
时间:2017-05-04 16:16:14

标签: r random-forest prediction




> trainDF
   X5fwd  Var1   Var2    Var3    Var4   Var5  Var6   Var7
3    0   139.50  9880.0  123.65  4180  206.0  1525  245.7477
4    1   137.90  9844.0  123.80  4190  206.0  1523  244.0442
7    1   139.80  9844.0  125.44  4190  204.0  1506  242.8183
9    1   142.50  9914.0  125.34  4200  204.0  1496  240.4294
10   1   143.50  9914.0  126.22  4210  204.0  1482  242.1602
11   1   143.10  9914.0  124.97  4210  204.0  1479  240.9533
14   1   143.70  9887.0  125.71  4200  205.0  1449  247.8008
15   1   143.70  9887.0  124.12  4190  205.0  1416  245.2176

> testDF[2:8]
    Var1  Var2   Var3   Var4  Var5  Var6  Var7
5   138.2  9844  121.98  4190  205  1516  240.96
17  143.8  9887  123.14  4220  205  1384  243.28
19  144.1  9827  125.65  4220  206  1354  246.72
21  144.5  9827  124.16  4240  202  1361  243.49
22  144.6  9827  122.39  4240  202  1386  243.11
24  147.1  9803  124.86  4240  202  1456  242.69
29  149.1  9785  120.18  4260  204  1561  242.74
30  148.9  9785  121.71  4280  204  1576  245.70
105 128.5  10045 106.20  3690  184  1200  229.83

> new.data
    Var1  Var2   Var3   Var4  Var5  Var6  Var7        
110  116  10050  109.55  3650  172  1214  230.31



# Walk-forward evaluation, timed with system.time().
# For each cutoff row x the model is fitted on DataFrame[1:x, ] and used to
# predict the row five steps ahead (x + 5); results are averaged over 100
# random train/test splits.
#
# Relies on objects created outside this snippet: DataFrame, d, and the
# result holders Accuracy, Output, MeanAccuracy and MeanOutput.
# NOTE(review): sample.split() presumably comes from caTools and
# randomForest() from the randomForest package -- confirm both are attached.
# NOTE(review): the formula X5fwd ~ . is used with class/prob predictions,
# which assumes X5fwd is a factor -- confirm.
system.time(for (x in 100:(d - 5)) {
  VarDataFrame <- DataFrame[1:x, ]
  # Predictor columns (2:8) of the row five steps past the training window.
  new.data <- DataFrame[x + 5, 2:8]

  for (i in 1:100) {
    # Create the train and test data sets (70 % / 30 % split on the response).
    ind <- sample.split(Y = VarDataFrame$X5fwd, SplitRatio = 0.7)
    trainDF <- VarDataFrame[ind, ]
    testDF <- VarDataFrame[!ind, ]

    # Fit the model.
    modelRandom <- randomForest(
      X5fwd ~ .,
      data = trainDF,
      mtry = 2,
      ntree = 200,
      importance = TRUE
    )

    # Predict the held-out test set and build the confusion table.
    # (Named `confusion` rather than `t` to avoid shadowing base::t.)
    PredictionsWithClass <- predict(modelRandom, testDF[2:8], type = 'class')
    confusion <- table(
      predictions = PredictionsWithClass,
      actual = testDF$X5fwd
    )

    # Accuracy on the test split, and class probabilities for the new row.
    Accuracy[i, ] <- sum(diag(confusion)) / sum(confusion)
    Output[i, ] <- predict(modelRandom, new.data, type = 'prob')
  }

  # Store the averages over the 100 repetitions at the predicted row's index.
  MeanAccuracy[x + 5, ] <- mean(Accuracy)
  MeanOutput[x + 5, ] <- c(mean(Output[, 1]), mean(Output[, 2]))

  # No manual increment of x is needed: `for` advances the loop variable
  # itself and resets any in-body modification on the next iteration.
})



enter image description here enter image description here

enter image description here

编辑2:以下是基于近乎完整数据集的 trainDF 的分布(并附上 new.data 以作对比),以及拟合模型的变量重要性统计。

> new.data
        Var1    Var2    Var3    Var4   Var5  Var6   Var7  
1295    68.72   13289   50.46   3320   186   1668   140.0435834

> importance(modelRandom)
               0        1 MeanDecreaseAccuracy MeanDecreaseGini
Var1    24.43001 25.23817             41.53475         80.14703
Var2    24.31617 27.30719             39.10255         63.99267
Var3    19.63059 15.39908             33.67201         67.89350
Var4    19.56362 18.09402             33.88818         53.32912
Var5    18.34156 19.21491             32.38921         56.62850
Var6    21.89851 21.29452             31.52016         65.17518
Var7    15.21747 16.06692             27.01031         62.28704

enter image description here

0 个答案:
