我想确认一下我对已训练好的M5P模型结果的理解。我训练了一个M5P模型,它给出了一棵决策树,后面跟着4个线性模型。
M5 unpruned model tree:
(using smoothed linear models)
Value12 <= 2.266 :
| Value2 <= 1111.5 : LM1 (2/0.01%)
| Value2 > 1111.5 : LM2 (4/2.268%)
Value12 > 2.266 :
| Value3 <= 1544650 : LM3 (2/1.652%)
| Value3 > 1544650 : LM4 (2/92.017%)
LM num: 1
Value15 =
-0.0001 * Value2
+ 1.8377
LM num: 2
Value15 =
-0.0001 * Value2
+ 1.8181
LM num: 3
Value15 =
-0 * Value3
+ 1.7212
LM num: 4
Value15 =
-0 * Value3
+ 1.7093
Number of Rules : 4
为了确保我理解其工作原理,我尝试根据决策树及其所引用的LM模型手动复现预测结果,但结果与预期不符。
我先用树来确定应使用哪个LM模型,再按照该LM模型给出的公式进行计算,但得到的值与模型的预测值不一致。这是正常的吗?
我使用的数据集:
# Training data: 10 observations, 13 columns. Value15 is the column used as
# the response in the M5P formula; Value5 is present here but is not part of
# that formula.
#
# The columns are written one per entry in a named list and then tagged as a
# data.frame, keeping the compact row-name encoding c(NA, 10L).
Data_train <- structure(
  list(
    Value2 = c(
      610L, 1245L, 978L, 610L, 978L, 610L, 1727L, 1810L, 1805L, 1805L
    ),
    Value3 = c(
      1544673L, 2206981L, 2512821L, 1544627L, 2512792L,
      1524144L, 3415598L, 9205162L, 9182166L, 9182089L
    ),
    Value4 = c(
      12.1260004043579, 17.3250007629395, 19.7259998321533,
      12.125, 19.7250003814697, 11.9650001525879,
      26.8120002746582, 72.2610015869141, 72.0800018310547,
      72.0790023803711
    ),
    Value5 = c(
      0.0817999988794327, 0.0856000036001205, 0.0828000009059906,
      0.0817999988794327, 0.0828000009059906, 0.09009999781847,
      0.145199999213219, 0.200299993157387, 0.200299993157387,
      0.200200006365776
    ),
    Value6 = c(2L, 1L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L),
    Value7 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
    Value8 = c(4L, 4L, 4L, 4L, 4L, 4L, 22L, 36L, 36L, 36L),
    Value9 = c(1L, 1L, 2L, 1L, 2L, 1L, 8L, 6L, 6L, 6L),
    Value10 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
    Value11 = c(
      0.958189010620117, 1, 0.925986051559448,
      0.958268105983734, 0.926032960414886, 0.971082329750061,
      0.471057742834091, 0.476771682500839, 0.47670641541481,
      0.47671303153038
    ),
    Value12 = c(
      3.27869, 0.80321, 2.04499, 3.27869, 2.04499,
      3.27869, 2.31616, 2.20994, 2.21607, 2.21607
    ),
    Value13 = c(1L, 0L, 1L, 1L, 1L, 1L, 2L, 3L, 3L, 3L),
    Value15 = c(
      1.33398258686066, 1.90592515468597, 2.17005920410156,
      1.33387243747711, 2.1699492931366, 1.31627094745636,
      0.353617042303085, 1.93668437004089, 1.93183350563049,
      1.93180668354034
    )
  ),
  row.names = c(NA, 10L),
  class = "data.frame"
)
以下是我用来训练模型的公式:
library(RWeka)
# Fit an M5P model tree for Value15 on eleven predictors (Value5 is not in
# the formula). N = TRUE is passed through Weka_control(); the printed header
# for this fit reads "M5 unpruned model tree".
Data_modelUnPruned <- M5P(
  Value15 ~ Value6 + Value3 + Value4 + Value2 + Value7 + Value8 + Value9 +
    Value10 + Value11 + Value12 + Value13,
  data = Data_train,
  control = Weka_control(N = TRUE)
)
以下是添加预测列后的结果数据集:
# Training data with the model's output appended: the same 13 columns as the
# training set plus Model_Prediction (14 columns, 10 rows).
#
# The columns are written one per entry in a named list and then tagged as a
# data.frame, keeping the compact row-name encoding c(NA, 10L).
Data_train_Results <- structure(
  list(
    Value2 = c(
      610L, 1245L, 978L, 610L, 978L, 610L, 1727L, 1810L, 1805L, 1805L
    ),
    Value3 = c(
      1544673L, 2206981L, 2512821L, 1544627L, 2512792L,
      1524144L, 3415598L, 9205162L, 9182166L, 9182089L
    ),
    Value4 = c(
      12.1260004043579, 17.3250007629395, 19.7259998321533,
      12.125, 19.7250003814697, 11.9650001525879,
      26.8120002746582, 72.2610015869141, 72.0800018310547,
      72.0790023803711
    ),
    Value5 = c(
      0.0817999988794327, 0.0856000036001205, 0.0828000009059906,
      0.0817999988794327, 0.0828000009059906, 0.09009999781847,
      0.145199999213219, 0.200299993157387, 0.200299993157387,
      0.200200006365776
    ),
    Value6 = c(2L, 1L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L),
    Value7 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
    Value8 = c(4L, 4L, 4L, 4L, 4L, 4L, 22L, 36L, 36L, 36L),
    Value9 = c(1L, 1L, 2L, 1L, 2L, 1L, 8L, 6L, 6L, 6L),
    Value10 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
    Value11 = c(
      0.958189010620117, 1, 0.925986051559448,
      0.958268105983734, 0.926032960414886, 0.971082329750061,
      0.471057742834091, 0.476771682500839, 0.47670641541481,
      0.47671303153038
    ),
    Value12 = c(
      3.27869, 0.80321, 2.04499, 3.27869, 2.04499,
      3.27869, 2.31616, 2.20994, 2.21607, 2.21607
    ),
    Value13 = c(1L, 0L, 1L, 1L, 1L, 1L, 2L, 3L, 3L, 3L),
    Value15 = c(
      1.33398258686066, 1.90592515468597, 2.17005920410156,
      1.33387243747711, 2.1699492931366, 1.31627094745636,
      0.353617042303085, 1.93668437004089, 1.93183350563049,
      1.93180668354034
    ),
    Model_Prediction = c(
      1.56039428073199, 1.74959163286097, 1.77758972532522,
      1.57231876013397, 1.77758972532522, 1.57429264935954,
      1.38009848913172, 1.71850280973615, 1.71877793206469,
      1.71877793206469
    )
  ),
  row.names = c(NA, 10L),
  class = "data.frame"
)
以下是我用来尝试复现模型结果的代码,它基本上是用Visual Basic硬编码的M5P模型。
' Hand-coded evaluation of the trained M5P model tree.
'
' The tree selects one of four leaf linear models (LM1..LM4) from Value12,
' Value2 and Value3; the selected model is evaluated and returned.
'
' Parameters:
'   Value2, Value3, Value12 - used for the tree splits and the leaf models.
'   Value4, Value6..Value11, Value13 - not used by the printed tree or leaf
'       models; kept so the signature matches the model's predictor list.
' Returns:
'   The predicted Value15 as a Double (0 if no leaf was selected, which the
'   If/Else tree below never allows).
'
' Why the constants differ from the Weka printout:
'   The printout appears to round every coefficient to four decimals, so the
'   slopes show up as "-0.0001" or "-0". Value3 is in the millions, so a
'   slope of about -9.6E-08 that prints as "-0" still shifts the prediction
'   by 0.15 or more. With the rounded numbers LM3/LM4 collapse to constants
'   and the Model_Prediction column cannot be reproduced.
'
'   The coefficients below were recovered from the Model_Prediction column of
'   Data_train_Results (slope = difference in prediction / difference in the
'   predictor, for rows falling in the same leaf). They reproduce all ten
'   training predictions to about 1E-09 and round back to the printed values.
'   NOTE(review): LM1's two training rows share a single Value2 (978), so its
'   slope was derived assuming the printed intercept 1.8377 is exact; treat
'   LM1 as approximate away from Value2 = 978. Prefer reading full-precision
'   coefficients from the fitted model if they can be obtained.
Public Function GetLM(Value2 As Long, Value3 As Long, Value4 As Double, _
        Value6 As Long, Value7 As Long, Value8 As Long, Value9 As Long, _
        Value10 As Long, Value11 As Double, Value12 As Double, _
        Value13 As Long) As Double
    Dim lm As String

    ' Walk the tree to pick the leaf model.
    If Value12 <= 2.266 Then
        If Value2 <= 1111.5 Then
            lm = "LM1" '(2/0.019%)
        Else
            lm = "LM2" '(4/2.269%)
        End If
    Else
        If Value3 <= 1544650 Then
            lm = "LM3" '(2/1.652%)
        Else
            lm = "LM4" '(2/92.021%)
        End If
    End If

    ' Evaluate the selected leaf model with un-rounded coefficients.
    Select Case lm
        Case "LM1"
            ' printed as: -0.0001 * Value2 + 1.8377
            GetLM = -0.0000614624485 * Value2 + 1.8377
        Case "LM2"
            ' printed as: -0.0001 * Value2 + 1.8181
            GetLM = -0.0000550244657 * Value2 + 1.8180970927
        Case "LM3"
            ' printed as: -0 * Value3 + 1.7212
            GetLM = -9.6367194E-08 * Value3 + 1.7211701299
        Case "LM4"
            ' printed as: -0 * Value3 + 1.7093
            GetLM = -9.6367194E-08 * Value3 + 1.7092500834
        Case Else
            GetLM = 0
    End Select
End Function
有人可以向我解释这应该如何运作吗?
非常感谢。