Python新手在这里,我正在寻找一种编写程序的简洁方法。我想读取多个csv文件并清理它们以排除异常值,然后再对这些列进行归一化,然后从归一化的列中创建一个组合数据集。输入的csv文件有很多列,我想对所有列进行规范化。在此代码中,我编写了一个两列的示例。
我编写的代码可以正常工作,但是冗长而繁琐。我目前为3个数据集编写了代码,实际上以后可能还会有更多数据集。关于如何用循环来处理并使代码更简洁,有什么建议吗?谢谢
import numpy as np
import pandas as pd
from scipy import stats
gr_P10 = 40
gr_P50 = 65
gr_P90 = 90
rt_P10 = 10
rt_P50 = 25
rt_P90 = 50
def get_quantiles(input_log):
p10_log = np.percentile(input_log, 10)
p50_log = np.percentile(input_log, 50)
p90_log = np.percentile(input_log, 90)
return p10_log, p50_log, p90_log
def normalize(input_log, x_90, x_50, x_10, p90_log, p50_log, p10_log):
mmin = (x_50-x_10)/(p50_log-p10_log)
mmax = (x_90-x_50)/(p90_log-p50_log)
if (input_log < p50_log ):
output_log = x_50 +(mmin*(input_log-p50_log))
else:
output_log = x_50 +(mmax*(input_log-p50_log))
return output_log
# Read data and removing outliers
#Data1
a = pd.read_csv('Data1.csv')
zscore = np.abs(stats.zscore(a))
a = a[(zscore < 3).all(axis=1)]
#Data2
b = pd.read_csv('Data2.csv')
zscore = np.abs(stats.zscore(b))
b = b[(zscore < 3).all(axis=1)]
#Data3
c = pd.read_csv('Data3.csv')
zscore = np.abs(stats.zscore(c))
c = c[(zscore < 3).all(axis=1)]
# Normalizing Data
# Normalizing Data1
p10_log, p50_log, p90_log = get_quantiles(a['GR'])
a['GR_NORM'] = a.apply(lambda x: normalize(x['GR'],gr_P90, gr_P50, gr_P10, p90_log, p50_log, p10_log ), axis =1)
p10_log, p50_log, p90_log = get_quantiles(a['RT'])
a['RT_NORM'] = a.apply(lambda x: normalize(x['RT'],rt_P90, rt_P50, rt_P10, p90_log, p50_log, p10_log ), axis =1)
# Normalizing Data2
p10_log, p50_log, p90_log = get_quantiles(b['GR'])
b['GR_NORM'] = b.apply(lambda x: normalize(x['GR'],gr_P90, gr_P50, gr_P10, p90_log, p50_log, p10_log ), axis =1)
p10_log, p50_log, p90_log = get_quantiles(b['RT'])
b['RT_NORM'] = b.apply(lambda x: normalize(x['RT'],rt_P90, rt_P50, rt_P10, p90_log, p50_log, p10_log ), axis =1)
# Normalizing Data3
p10_log, p50_log, p90_log = get_quantiles(c['GR'])
c['GR_NORM'] = c.apply(lambda x: normalize(x['GR'],gr_P90, gr_P50, gr_P10, p90_log, p50_log, p10_log ), axis =1)
p10_log, p50_log, p90_log = get_quantiles(c['RT'])
c['RT_NORM'] = c.apply(lambda x: normalize(x['RT'],rt_P90, rt_P50, rt_P10, p90_log, p50_log, p10_log ), axis =1)
# Forming new combined dataset with normalized values
new_a = a['GR_NORM','RT_NORM'].copy()
new_b = b['GR_NORM','RT_NORM'].copy()
new_c = c['GR_NORM','RT_NORM'].copy()
new_dataset = pd.concat([new_a,new_b, new_c], ignore_index= True)
答案 0 :(得分:0)
除非我忽略了某些内容,否则您可以为此编写一个函数。
答案 1 :(得分:0)
N_files = 3
for i in range(1, N_files):
a = pd.read_csv(f"Data{i}.csv") #this will loop through open all your files
答案 2 :(得分:0)
您只需要使用更多函数即可消除重复的代码。尝试用以下内容替换下半部分:
# Read data and removing outliers
#Data1
def read_data(data):
a = pd.read_csv(data)
zscore = np.abs(stats.zscore(a))
a = a[(zscore < 3).all(axis=1)]
#Normalizing Data
p10_log, p50_log, p90_log = get_quantiles(a['GR'])
a['GR_NORM'] = a.apply(lambda x: normalize(x['GR'],gr_P90, gr_P50, gr_P10, p90_log, p50_log, p10_log ), axis =1)
p10_log, p50_log, p90_log = get_quantiles(a['RT'])
a['RT_NORM'] = a.apply(lambda x: normalize(x['RT'],rt_P90, rt_P50, rt_P10, p90_log, p50_log, p10_log ), axis =1)
return a['GR_NORM','RT_NORM'].copy()
data = ['Data1.csv','Data2.csv','Data3.csv']
new_dataset = pd.DataFrame()
for x in data:
new_dataset = new_dataset.append(read_data(x))