I'm working on deep learning. I built an autoencoder network using the NSL-KDD dataset, and I want to train it properly, but during training my loss function reports strange numbers that I can't make sense of. Here is my source code:
import os
import numpy as np
from keras.layers import Dense, Input
from keras.models import Model
def load_nsl_kdd_cup_dataset(filename):
    # Read the comma-separated records and keep the first 41 columns,
    # i.e. the feature fields (the trailing label fields are dropped).
    list2 = []
    with open(filename, 'r') as f:
        for line in f:
            labels = line.split(sep=',')
            list2.append(labels[:41])
    return np.array(list2)
dirs = './NSL_KDD-master'
train_X = load_nsl_kdd_cup_dataset(os.path.join(dirs,'KDDTrain+.txt'))
test_X = load_nsl_kdd_cup_dataset(os.path.join(dirs, 'KDDTest+.txt'))
print(train_X.shape)
print(test_X.shape)
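For what it's worth, I believe the same loading step could also be written with numpy's own text reader; a minimal equivalent sketch (the name load_nsl_kdd_cup_dataset_np is just illustrative, not part of my script):

def load_nsl_kdd_cup_dataset_np(filename):
    # Read every field as a string and keep only the 41 feature columns.
    return np.loadtxt(filename, delimiter=',', dtype=str, usecols=range(41))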
# Preprocessing stage 1: convert the nominal values to numerical values
def convert_nominal_to_numerical(sets):
    # Integer codes for the three nominal columns of NSL-KDD:
    # column 1 = protocol_type, column 2 = service, column 3 = flag.
    # Values that are not in a map are left unchanged.
    protocol_map = {'tcp': 1, 'udp': 2, 'icmp': 3}
    service_map = {
        'http': 1, 'private': 2, 'domain_u': 3, 'smtp': 4, 'ftp_data': 5,
        'eco_i': 6, 'other': 7, 'ecr_i': 8, 'telnet': 9, 'finger': 10,
        'ftp': 11, 'auth': 12, 'Z39_50': 13, 'uucp': 14, 'courier': 15,
        'bgp': 16, 'whois': 17, 'uucp_path': 18, 'iso_tsap': 19, 'time': 20,
        'imap4': 21, 'nnsp': 22, 'vmnet': 23, 'urp_i': 24, 'domain': 25,
        'ctf': 26, 'csnet_ns': 27, 'supdup': 28, 'discard': 29, 'http_443': 30,
        'daytime': 31, 'gopher': 32, 'efs': 33, 'systat': 34, 'link': 35,
        'exec': 36, 'hostnames': 37, 'name': 38, 'mtp': 39, 'echo': 40,
        'klogin': 41, 'login': 42, 'ldap': 43, 'netbios_dgm': 44, 'sunrpc': 45,
        'netbios_ssn': 46, 'netstat': 47, 'netbios_ns': 48, 'ssh': 49, 'kshell': 50,
        'nntp': 51, 'pop_3': 52, 'sql_net': 53, 'IRC': 54, 'ntp_u': 55,
        'rje': 56, 'remote_job': 57, 'pop_2': 58, 'X11': 59, 'printer': 60,
        'shell': 61, 'urh_i': 62, 'tim_i': 63, 'red_i': 64, 'pm_dump': 65,
        'tftp_u': 66, 'http_8001': 67, 'aol': 68, 'harvest': 69, 'http_2784': 70,
    }
    flag_map = {
        'SF': 1, 'S0': 2, 'REJ': 3, 'RSTR': 4, 'RSTO': 5, 'S1': 6,
        'SH': 7, 'S2': 8, 'RSTOS0': 9, 'S3': 10, 'OTH': 11,
    }
    list1 = []
    col1, col2, col3 = [], [], []
    for i in sets:
        if i[1] in protocol_map:
            i[1] = protocol_map[i[1]]
        if i[2] in service_map:
            i[2] = service_map[i[2]]
        if i[3] in flag_map:
            i[3] = flag_map[i[3]]
        list1.append(i)
        col1.append(i[1])
        col2.append(i[2])
        col3.append(i[3])
    return list1, col1, col2, col3
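As an aside, I know that integer codes impose an artificial ordering on nominal features, and one-hot encoding is the usual alternative. A minimal sketch of that idea (the variable names here are hypothetical, not from my script):

from keras.utils import to_categorical

# Hypothetical example: a protocol column coded as 0/1/2 for tcp/udp/icmp.
protocol_codes = np.array([0, 1, 2, 0])
# Expands each integer code into a 3-element binary vector.
protocol_onehot = to_categorical(protocol_codes, num_classes=3)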
# Preprocessing stage 2: find the minimum and maximum value of each column
def min_max_finder(col):
    minimum_col = 10
    maximum_col = 1
    for i in col:
        if float(i) < minimum_col:
            minimum_col = float(i)
        if float(i) > maximum_col:
            maximum_col = float(i)
    return minimum_col, maximum_col
# Preprocessing stage 3: perform the normalization
def data_normalizer(col):
    mini, maxi = min_max_finder(col)
    normalized_col = [(float(x) - mini) / maxi for x in col]
    return normalized_col
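I should note that this divides by the maximum, while the textbook min-max scaling divides by the range (max - min) so that the values land exactly in [0, 1]. A minimal sketch of that variant (data_normalizer_minmax is just an illustrative name):

def data_normalizer_minmax(col):
    mini, maxi = min_max_finder(col)
    # Guard against constant columns, where max - min would be zero.
    span = (maxi - mini) or 1.0
    return [(float(x) - mini) / span for x in col]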
# Preprocessing stage 4: write the normalized columns back over the original data
def replace_normalized_with_original(normalized_col, which_col, data):
    for i in range(len(data)):
        data[i][which_col] = normalized_col[i]
    return data
def convert_list_to_array(list1):
    arrays = np.array(list1)
    return arrays
list1, col1, col2, col3 = convert_nominal_to_numerical(train_X)
min_col1, max_col1 = min_max_finder(col=col1)
min_col2, max_col2 = min_max_finder(col=col2)
min_col3, max_col3 = min_max_finder(col=col3)
normalized_col1 = data_normalizer(col=col1)
normalized_col2 = data_normalizer(col=col2)
normalized_col3 = data_normalizer(col=col3)
list1 = replace_normalized_with_original(normalized_col=normalized_col1, which_col=1, data=train_X)
list1 = replace_normalized_with_original(normalized_col=normalized_col2, which_col=2, data=train_X)
list1 = replace_normalized_with_original(normalized_col=normalized_col3, which_col=3, data=train_X)
X_train_normalized = convert_list_to_array(list1)
list_test, col1_test, col2_test, col3_test = convert_nominal_to_numerical(test_X)
min_col1_test, max_col1_test = min_max_finder(col=col1_test)
min_col2_test, max_col2_test = min_max_finder(col=col2_test)
min_col3_test, max_col3_test = min_max_finder(col=col3_test)
normalized_col1_test = data_normalizer(col=col1_test)
normalized_col2_test = data_normalizer(col=col2_test)
normalized_col3_test = data_normalizer(col=col3_test)
list_test = replace_normalized_with_original(normalized_col=normalized_col1_test, which_col=1, data=test_X)
list_test = replace_normalized_with_original(normalized_col=normalized_col2_test, which_col=2, data=test_X)
list_test = replace_normalized_with_original(normalized_col=normalized_col3_test, which_col=3, data=test_X)
X_test_normalized = convert_list_to_array(list_test)
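One thing I'm unsure about: I normalize the test set with its own minima and maxima. I've read that the usual practice is to reuse the training set's statistics so both sets are on the same scale; a sketch of that variant for a single column, using my script's divide-by-max convention:

# Reuse the training min/max for the test column (illustrative, column 1 only).
normalized_col1_test = [(float(x) - min_col1) / max_col1 for x in col1_test]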
X_train_normalized = X_train_normalized.astype('float32')
X_test_normalized = X_test_normalized.astype('float32')
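Since binary cross-entropy assumes the reconstruction targets are probabilities in [0, 1], I can at least sanity-check the value ranges before training:

# Quick range check: both arrays should lie in [0, 1] for binary_crossentropy.
print(X_train_normalized.min(), X_train_normalized.max())
print(X_test_normalized.min(), X_test_normalized.max())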
training_X = load_nsl_kdd_cup_dataset(os.path.join(dirs, 'KDDTrain+.txt'))
testing_X = load_nsl_kdd_cup_dataset(os.path.join(dirs, 'KDDTest+.txt'))
# Build the autoencoder network with dense layers
input_layer = Input(shape=(41,))
encoded = Dense(41, activation='relu')(input_layer)
encoded = Dense(20, activation='relu')(encoded)
encoded = Dense(10, activation='relu')(encoded)
decoded = Dense(10, activation='relu')(encoded)
decoded = Dense(20, activation='relu')(decoded)
decoded = Dense(41, activation='sigmoid')(decoded)
IDS_autoencoder = Model(inputs=input_layer, outputs=decoded)
IDS_autoencoder.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
IDS_autoencoder.summary()
IDS_autoencoder.fit(X_train_normalized, X_train_normalized,
                    epochs=5,
                    batch_size=128,
                    validation_data=(X_test_normalized, X_test_normalized))
Here is my output:
Train on 125973 samples, validate on 22544 samples
Epoch 1/5
125973/125973 [==============================] - 3s 26us/step - loss: -25513.2260 - acc: 0.5736 - val_loss: -5095.4520 - val_acc: 0.6534
Epoch 2/5
125973/125973 [==============================] - 3s 23us/step - loss: -25675.6222 - acc: 0.6492 - val_loss: -5095.5203 - val_acc: 0.6512
Epoch 3/5
125973/125973 [==============================] - 3s 23us/step - loss: -25675.5876 - acc: 0.6483 - val_loss: -5095.5543 - val_acc: 0.6520
Epoch 4/5
125973/125973 [==============================] - 3s 23us/step - loss: -25675.6049 - acc: 0.6493 - val_loss: -5095.5665 - val_acc: 0.6526
Epoch 5/5
125973/125973 [==============================] - 3s 23us/step - loss: -25675.6039 - acc: 0.6477 - val_loss: -5095.5619 - val_acc: 0.6536