我正在执行数据归一化以将数据加载到神经网络中。为此,我需要从包含单词和数字的表中创建仅包含数字的表。
这是原始数据集视图的一部分(也可以图像形式查看):
我已经能够删除列名、将空单元格替换为 0,并将值为“YES”或“Active”的单元格替换为 1、值为“NO”或“Inactive”的单元格替换为 0(结果见此处,或见图像):
import pandas as pd
from pandas import DataFrame as df
import numpy as np
# Load the raw semicolon-separated table. No header row is declared, so the
# line holding the column names is read as ordinary data (row label 0).
dataset = pd.read_csv('ex.csv', sep=';', header=None)
print("Original view of dataset: \n", dataset.loc[:, [3, 4, 6, 9, 10]])

# Discard the row that holds the column names, then zero-fill empty cells.
dataset = dataset.drop(index=0).fillna(0)

# Write the cleaned table out and read it back, so the frame is rebuilt
# from the file contents (new default row labels starting at 0).
dataset.to_csv('ex_mod.csv', header=None, index=False)
dataset = pd.read_csv('ex_mod.csv', sep=',', header=None)
# Changing cells with 'YES', 'Active', 'NO', 'Inactive'
# Replacement function
def ifer(unit):
    """Map a status-like cell value to 1 or 0; pass everything else through.

    Args:
        unit: A single cell value (string, number, ...).

    Returns:
        1 if ``unit`` equals 'YES' or 'Active', 0 if it equals 'NO' or
        'Inactive'; otherwise ``unit`` itself, unchanged.
    """
    if unit in ('YES', 'Active'):
        return 1
    if unit in ('NO', 'Inactive'):
        return 0
    # Unrecognized values (numbers, other words) are returned as-is; the
    # comparison is exact, so padded or differently-cased text is not mapped.
    return unit
# Replacement cycle
# Run ifer over every cell, one column at a time. Series.map applies the
# function elementwise, so every row is converted whatever the table's shape
# (looping over the frame itself yields column labels, not row labels).
# Rebinding the result also avoids writing through chained indexing
# (dataset[col][row] = ...), which may not update the original frame.
dataset = dataset.apply(lambda column: column.map(ifer))
# Save the converted table and read it back from the file before printing.
dataset.to_csv('ex_mod.csv', header=None, index=False)
dataset = pd.read_csv('ex_mod.csv', sep=',', header=None)

# Columns selected for the printed preview.
shown_columns = [0, 3, 4, 5, 6, 9, 10, 11]
print("View dataset with changes cells with 'YES', 'Active', 'NO', 'Inactive': \n", dataset.loc[:, shown_columns])