Question

我之前问了一个关于同样问题的问题，但由于我的方法已经改变，我现在有不同的问题。

我目前的代码：

from sklearn import preprocessing
from openpyxl import load_workbook
import numpy as np
from numpy import exp, array, random, dot
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
# Set sizes: number of spreadsheet rows to load and number of feature columns.
rowSize = 200
numColumns = 4

# Read the table from the Excel file. Columns 1..numColumns hold the features
# (date, day, rain, temp) and the column right after them holds the output.
wb = load_workbook('python_excel_read.xlsx')
sheet_1 = wb["Sheet1"]

# Load the whole block in one pass; dtype=float makes an empty or non-numeric
# cell fail loudly here instead of silently producing an object array.
data = np.array(
    [
        [sheet_1.cell(row=r, column=c).value for c in range(1, numColumns + 2)]
        for r in range(1, rowSize + 1)
    ],
    dtype=float,
)

X = data[:, :numColumns]
# Output: kept 1-D so that it lines up element-wise with nn.predict(), which
# returns a 1-D array.
y = data[:, numColumns]

X_train, X_test, y_train, y_test = train_test_split(X, y)

# MLPs are sensitive to feature scale. Fit the scaler on the training split
# only, so that no information from the test split leaks into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

####Neural Net
nn = MLPRegressor(
    hidden_layer_sizes=(3,),  activation='relu', solver='adam', alpha=0.001, batch_size='auto',
    learning_rate='constant', learning_rate_init=0.01, power_t=0.5, max_iter=10000, shuffle=True,
    random_state=9, tol=0.0001, verbose=False, warm_start=False, momentum=0.9, nesterovs_momentum=True,
    early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
nn.fit(X_train_scaled, y_train)


y_pred = nn.predict(X_test_scaled)

###Linear Regression
# lm = LinearRegression()
# lm.fit(X_train,y_train)
# y_pred = lm.predict(X_test)

# Plot against the unscaled first feature so the x axis stays in the
# spreadsheet's original units.
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.scatter(X_test[:, 0], y_pred, s=1, c='b', marker="s", label='NN Prediction')
ax1.scatter(X_test[:, 0], y_test, s=10, c='r', marker="o", label='real')
ax1.legend()
plt.show()

#Calc MSE
# Both operands are 1-D with the same length. A (n, 1) minus (n,) subtraction
# would broadcast to an (n, n) matrix and give a meaningless mean.
mse = np.square(y_test - y_pred).mean()

print(mse)

此结果显示测试数据的预测非常糟糕。因为我是新手，我不确定这是我的数据，模型还是我的编码。基于该图，我认为模型对于数据是错误的（模型似乎预测接近线性或平方的东西，而实际数据似乎更加分散）

以下是一些数据点，各列依次为：一年中的一天（2月1日），工作日（1）/周末（0），下雨（1）/无雨（0），华氏温度（°F），出勤人数（这是输出）

2   0   0   51  1366
4   0   0   62  538
5   1   0   71  317
6   1   0   76  174
7   1   0   78  176
8   1   0   68  220
12  1   1   64  256
13  1   1   60  379
14  1   0   64  316
18  0   0   72  758
19  1   0   72  1038
20  1   0   72  405
21  1   0   71  326
24  0   0   74  867
26  1   1   68  521
27  1   0   71  381
28  1   0   72  343
29  1   1   68  266
30  0   1   57  479
31  0   1   57  717
33  1   0   70  542
34  1   0   73  220
35  1   0   74  360
36  1   0   79  444
42  1   0   78  534
45  0   0   80  1572
52  0   0   76  1236
55  1   1   64  689
56  1   0   69  726
59  0   0   67  1188
60  0   0   74  1140
61  1   1   63  979
62  1   1   62  657
63  1   0   67  687
64  1   0   72  615
67  0   0   80  1074
68  1   0   81  1261
71  1   0   83  1332
73  0   0   85  1259
74  0   0   86  1142
76  1   0   88  1207
77  1   1   78  1438
82  1   0   85  1251
83  1   0   83  1019
85  1   0   86  1178
86  0   0   92  1306
87  0   0   92  1273
89  1   0   93  1101
90  1   0   92  1274
93  0   0   83  1548
94  0   0   86  1318
96  1   0   83  1395
97  1   0   81  1338
98  1   0   75  1240
100 0   0   84  1335
102 0   0   83  931
103 1   0   87  746
104 1   0   91  746
105 1   0   81  600
106 1   0   72  852
108 0   1   87  1204
109 0   0   89  1191
110 1   0   90  769
111 1   0   88  642
112 1   0   86  743
114 0   1   75  1085
115 0   1   78  1109
117 1   0   84  871
120 1   0   96  599
123 0   0   93  651
129 0   0   74  1325
133 1   0   88  637
134 1   0   84  470
135 0   1   73  980
136 0   0   72  1096
137 0   0   83  792
138 1   0   87  565
139 1   0   84  501
141 1   0   88  615
142 0   0   79  722
143 0   0   80  1363
144 0   0   82  1506
146 1   0   93  626
147 1   0   94  415
148 1   0   95  596
149 0   0   100 532
150 0   0   102 784
154 1   0   99  514
155 1   0   94  495
156 0   1   87  689
157 0   1   94  931
158 0   0   97  618
161 1   0   92  451
162 1   0   97  574
164 0   0   102 898
165 0   0   104 746
166 1   0   109 587
167 1   0   109 465
174 1   0   108 514
175 1   0   109 572
179 0   0   107 811
181 1   0   104 423
182 1   0   103 526
184 0   1   97  849
185 0   0   103 852
189 1   0   106 728
191 0   0   101 577
194 1   0   105 511
198 0   1   101 616
199 0   1   97  1056
200 0   0   94  740
202 1   0   103 498
205 0   0   101 610
206 0   0   106 944
207 0   0   105 769
208 1   0   103 551
209 1   0   103 624
210 1   0   97  513
212 0   1   107 561
213 0   0   100 905
214 0   0   105 767
215 1   0   107 510
216 1   0   108 406
217 1   0   109 439
218 1   0   103 427
219 0   1   104 460
224 1   0   105 213
227 0   0   112 834
228 0   0   109 615
229 1   0   105 216
230 1   0   104 213
231 1   0   104 256
232 1   0   104 282
235 0   0   104 569
238 1   0   103 165
239 1   1   105 176
241 0   1   108 727
242 0   1   105 652
243 1   1   103 231
244 1   0   96  117
245 1   1   98  168
246 1   1   97  113
247 0   0   95  227
248 0   0   92  1050
249 0   0   101 1274
250 1   1   95  1148
254 0   0   99  180
255 0   0   104 557
258 1   0   94  228
260 1   0   95  133
263 0   0   100 511
264 1   1   89  249
265 1   1   90  245
267 1   0   101 390
272 1   0   100 223
273 1   0   103 194
274 1   0   103 150
275 0   0   95  224
276 0   0   92  705
277 0   1   92  504
279 1   1   77  331
281 1   0   89  268
284 0   0   95  566
285 1   0   94  579
286 1   0   95  420
288 1   0   93  392
289 0   1   94  525
290 0   1   86  670
291 0   1   89  488
294 1   1   74  295
296 0   0   81  314
299 1   0   88  211
301 1   0   84  246
303 0   1   76  433
304 0   0   80  216
307 1   1   80  275
308 1   1   66  319
312 0   0   80  413
313 1   0   78  278
316 1   0   74  305
320 1   1   57  323
324 0   0   76  220
326 0   0   77  461
327 1   0   78  510
331 0   0   60  1701
334 1   0   58  237
335 1   0   62  355
336 1   0   68  266
338 0   0   70  246
342 1   0   72  109
343 1   0   70  103
347 0   0   58  486
349 1   0   52  144
350 1   0   53  209
351 1   0   55  289
354 0   0   62  707
355 1   0   59  903
359 0   0   58  481
360 0   0   53  1342
364 1   0   57  1624

我总共拥有超过一千个数据点，但我并没有全部用于训练/测试。一个想法是我需要更多数据，另一个是我需要更多因素，因为温度/降雨/星期几不会影响出勤率。

这是图：

我可以做些什么来使我的模型更准确并提供更好的预测？

谢谢

编辑：我添加了更多数据点和另一个因素。我似乎无法上传excel文件，所以我把数据放在这里，更好地解释了它的格式化

编辑：这是最新的代码：

from sklearn import preprocessing
from openpyxl import load_workbook
import numpy as np
from numpy import exp, array, random, dot
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import LeaveOneOut
# Set sizes: number of spreadsheet rows to load and number of feature columns.
rowSize = 500
numColumns = 254

# Read the table from the Excel file. Columns 1..numColumns hold the features
# and the column right after them holds the output.
wb = load_workbook('python_excel_read.xlsx')
sheet_1 = wb["Sheet1"]

# Load the whole block in one pass (avoids shadowing the builtin `input`);
# dtype=float makes an empty or non-numeric cell fail loudly here.
data = np.array(
    [
        [sheet_1.cell(row=r, column=c).value for c in range(1, numColumns + 2)]
        for r in range(1, rowSize + 1)
    ],
    dtype=float,
)

X = data[:, :numColumns]
# Output: an independent (rowSize, 1) column vector, so the in-place binning
# below does not write back into `data`.
y = data[:, numColumns:].copy()

print(X)
print(y)

# Bin the raw output into four classes, in place. The order is safe because
# the class labels (0..3) are all below 500, so an already-relabelled entry
# can never match one of the later masks.
y[y < 500] = 0
y[np.logical_and(y >= 500, y <= 1000)] = 1
y[np.logical_and(y > 1000, y <= 1200)] = 2
y[y > 1200] = 3

# Use cross-validation
#kf = KFold(n_splits = 10, random_state=0)
loo = LeaveOneOut()
# Try different models
clf = svm.SVC()
scaler = StandardScaler()
# Scaling lives inside the pipeline so it is re-fitted on each training fold.
pipe = Pipeline([('scaler', scaler), ('svc', clf)])

accuracy = cross_val_score(pipe, X, y.ravel(), cv = loo, scoring = "accuracy")
print(accuracy.mean())

#y_pred = cross_val_predict(clf, X, y.ravel(), cv = kf)
#cm = confusion_matrix(y, y_pred)

这里是最新数据，其中包含尽可能多的特征。请注意，这是来自完整数据的随机样本：

Link to sample data

当前输出： 0.6230954290296712

我的最终目标是达到90％或更高的准确度......我不认为我还能找到更多特征，但如果有帮助，我会继续收集尽可能多的特征

Answer 1

你的问题非常普遍，不过我有一些建议。您可以使用SVR并尝试不同的模型。就我个人而言，我会尝试RandomForests，MLPR，最后选择我会使用：

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import LeaveOneOut
import pandas as pd
from sklearn.decomposition import PCA

# read the data
df = pd.read_excel('python_excel_read.xlsx', header = None)
rows, cols = df.shape
X = df.iloc[: , 0:(cols - 1)]
y = df.iloc[: , cols - 1 ]
print(X.shape)
print(y.shape)
y[y < 500] = 0
y[np.logical_and(y >= 500, y <= 1000)] = 1
y[np.logical_and(y > 1000, y <= 1200)] = 2
y[y > 1200] = 3
print(np.unique(y))

# We can apply PCA to reduce the dimensions of the data
# pca = PCA(n_components=2)
# pca.fit(X)
# X = pca.fit_transform(X)

# Use cross-validation
#kf = KFold(n_splits = 10, random_state=0)
loo = LeaveOneOut()
# Try different models
clf = svm.SVC(kernel = 'linear')
scaler = StandardScaler()
pipe = Pipeline([('scaler', scaler), ('svc', clf)])

accuracy = cross_val_score(pipe, X, y.ravel(), cv = loo, scoring = "accuracy")
print(accuracy.mean())

#y_pred = cross_val_predict(clf, X, y.ravel(), cv = kf)
#cm = confusion_matrix(y, y_pred)

我修改了一下代码以显示一个简单的例子：

{{1}}

Sklearn NN回归出勤率预测

1 个答案: