I have to preprocess some .csv files. These .csv files are audio feature matrices from the TIMIT dataset; basically, each one is a matrix of #samples × 123 features. I want to run a sliding window over the samples.
I wrote this class:
import glob
import pandas as pd
import numpy as np
from math import floor
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import time
import datetime
import progressbar
class MyDataGenerator:

    def __init__(self, path):
        # path is a (train, test, validation) triple of glob patterns.
        self.__path = path
        self.__path_train = path[0]
        self.__path_test = path[1]
        self.__path_validation = path[2]

    def generate_overlapping_chunks(self, timesteps, compact=True):
        print("reading train:")
        data_train = self.generate_data_frame(self.__path_train)
        print("reading test:")
        data_test = self.generate_data_frame(self.__path_test)
        print("reading validation:")
        data_validation = self.generate_data_frame(self.__path_validation)
        if compact:
            data_train = self.compact_class(data_train)
            data_test = self.compact_class(data_test)
            data_validation = self.compact_class(data_validation)
        train_n, test_n, validation_n = self.min_max_scale_skl(data_train, data_test, data_validation)
        print("train:")
        train_data, train_label = self.generate_chunks(data_train, train_n, timesteps)
        print("test:")
        test_data, test_label = self.generate_chunks(data_test, test_n, timesteps)
        print("validation:")
        validation_data, validation_label = self.generate_chunks(data_validation, validation_n, timesteps)
        train_label, test_label, validation_label = self.encode_label(train_label, test_label, validation_label)
        return train_data, train_label, test_data, test_label, validation_data, validation_label

    def compact_class(self, data_file):
        # Fold the rarer TIMIT phoneme labels into their closest common
        # classes (closures and pauses all become silence).
        data_file.loc[data_file['phoneme'] == 'ux', 'phoneme'] = 'uw'
        data_file.loc[data_file['phoneme'] == 'axr', 'phoneme'] = 'er'
        data_file.loc[data_file['phoneme'] == 'em', 'phoneme'] = 'm'
        data_file.loc[data_file['phoneme'] == 'nx', 'phoneme'] = 'n'
        data_file.loc[data_file['phoneme'] == 'eng', 'phoneme'] = 'ng'
        data_file.loc[data_file['phoneme'] == 'hv', 'phoneme'] = 'hh'
        data_file.loc[data_file['phoneme'] == 'h#', 'phoneme'] = 'sil'
        data_file.loc[data_file['phoneme'] == 'pau', 'phoneme'] = 'sil'
        data_file.loc[data_file['phoneme'] == 'pcl', 'phoneme'] = 'sil'
        data_file.loc[data_file['phoneme'] == 'tcl', 'phoneme'] = 'sil'
        data_file.loc[data_file['phoneme'] == 'kcl', 'phoneme'] = 'sil'
        data_file.loc[data_file['phoneme'] == 'bcl', 'phoneme'] = 'sil'
        data_file.loc[data_file['phoneme'] == 'dcl', 'phoneme'] = 'sil'
        data_file.loc[data_file['phoneme'] == 'gcl', 'phoneme'] = 'sil'
        data_file.loc[data_file['phoneme'] == 'epi', 'phoneme'] = 'sil'
        data_file.loc[data_file['phoneme'] == 'zh', 'phoneme'] = 'sh'
        data_file.loc[data_file['phoneme'] == 'en', 'phoneme'] = 'n'
        data_file.loc[data_file['phoneme'] == 'el', 'phoneme'] = 'l'
        data_file.loc[data_file['phoneme'] == 'ix', 'phoneme'] = 'ih'
        data_file.loc[data_file['phoneme'] == 'ax', 'phoneme'] = 'ah'
        data_file.loc[data_file['phoneme'] == 'ax-h', 'phoneme'] = 'ah'
        data_file.loc[data_file['phoneme'] == 'ao', 'phoneme'] = 'aa'
        return data_file

    def generate_data_frame(self, path):
        # Read every .csv matching the glob pattern into one DataFrame.
        data = pd.DataFrame()
        tot = len(glob.glob(path))
        bar = progressbar.ProgressBar(maxval=tot, widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
        i = 0
        bar.start()
        for file_name in glob.iglob(path):
            data_file = pd.read_csv(file_name)
            data = pd.concat((data, data_file))
            i = i + 1
            bar.update(i)
        bar.finish()
        data = data.rename(columns={'Unnamed: 0': 'frame'})
        return data

    def min_max_scale_skl(self, train, test, validation):
        scaler = MinMaxScaler(feature_range=(-1, 1))
        scaler = scaler.fit(np.concatenate((train.iloc[:, 1:124], test.iloc[:, 1:124], validation.iloc[:, 1:124])))
        return scaler.transform(train.iloc[:, 1:124]), scaler.transform(test.iloc[:, 1:124]), scaler.transform(validation.iloc[:, 1:124])

    def generate_chunks(self, data, data_norm, timesteps):
        # Slide a window of `timesteps` frames over the data; each window is
        # labelled with the phoneme of its central frame.
        label = np.empty(0)
        data_np = np.empty((1, timesteps, 123))
        b = range(timesteps, data.shape[0] + 1)
        bar = progressbar.ProgressBar(maxval=data.shape[0] - timesteps, widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
        bar.start()
        for i in range(0, data.shape[0] - timesteps + 1):
            c = data_norm[i:b[i]].reshape(1, timesteps, 124 - 1)
            data_np = np.concatenate((data_np, c))
            label = np.concatenate((label, [data.iloc[i + floor(timesteps / 2)]['phoneme']]))
            bar.update(i)
        bar.finish()
        return data_np[1:], label

    def encode_label(self, train, test, val):
        encoder = LabelEncoder()
        encoder.fit(np.concatenate((train, np.concatenate((test, val)))))
        train_encoded_labels = encoder.transform(train)
        test_encoded_labels = encoder.transform(test)
        val_encoded_labels = encoder.transform(val)
        return to_categorical(train_encoded_labels), to_categorical(test_encoded_labels), to_categorical(val_encoded_labels)
I noticed that
generate_chunks(self, data, data_norm, timesteps)
is very slow: the last run took more than 40 hours on an Intel Xeon E5-1620 v3. I am using Python 3.6.8 as installed by Anaconda. Any ideas on how to improve this terrible code?
Answer 0 (score: 1)
Try splitting the records into smaller chunks and processing them in parallel. There is a great discussion with a simple example here: How to use threading in Python?
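Since the windowing loop is CPU-bound, a process pool sidesteps the GIL (plain threads would mostly serialize here). Below is a minimal sketch of the idea; the chunking scheme and the names build_windows/parallel_windows are hypothetical, not from the question's code, and it assumes the number of windows is at least n_workers:

from concurrent.futures import ProcessPoolExecutor

import numpy as np

def build_windows(args):
    # Hypothetical worker: build every window whose start index lies in [lo, hi).
    data_norm, lo, hi, timesteps = args
    return np.stack([data_norm[i:i + timesteps] for i in range(lo, hi)])

def parallel_windows(data_norm, timesteps, n_workers=4):
    n = data_norm.shape[0] - timesteps + 1  # number of windows
    bounds = np.linspace(0, n, n_workers + 1, dtype=int)
    jobs = [(data_norm, lo, hi, timesteps) for lo, hi in zip(bounds[:-1], bounds[1:])]
    # Each job pickles data_norm to its worker process, trading memory for CPU time.
    # On Windows/macOS (spawn), call this under `if __name__ == "__main__":`.
    with ProcessPoolExecutor(max_workers=n_workers) as pool:
        parts = list(pool.map(build_windows, jobs))
    return np.concatenate(parts)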
There is also the option of using Cython (Python + C, a big shortcut), which helps a lot with big loops like this one.
Answer 1 (score: 1)
data_np = np.concatenate((data_np, c))
label = np.concatenate((label, [data.iloc[i+floor(timesteps/2)]['phoneme']]))
Both of these are expensive operations, and you perform them on every iteration: each np.concatenate allocates a fresh array and copies everything accumulated so far, so the total work grows quadratically with the number of windows.
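A quick micro-benchmark makes the difference visible (a sketch; the array sizes are arbitrary stand-ins for the real windows):

import timeit

setup = "import numpy as np; chunks = [np.ones((1, 20, 123)) for _ in range(500)]"

# Growing an array with np.concatenate re-copies everything on each iteration.
grow = '''
out = np.empty((0, 20, 123))
for c in chunks:
    out = np.concatenate((out, c))
'''

# Appending to a list and concatenating once copies each window exactly once.
once = "out = np.concatenate(chunks)"

print("concatenate in a loop:", timeit.timeit(grow, setup, number=3))
print("concatenate once:     ", timeit.timeit(once, setup, number=3))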
def generate_chunks(self, data, data_norm, timesteps):
    label = []
    data_np = []
    b = range(timesteps, data.shape[0] + 1)
    for i in range(0, data.shape[0] - timesteps + 1):
        data_np.append(data_norm[i:b[i]].reshape(1, timesteps, 124 - 1))
        label.append(data.iloc[i + floor(timesteps / 2)]['phoneme'])
    data_np = np.concatenate(data_np)
    labels = np.array(label)  # the labels are scalars, so build the array directly
    return data_np, labels
Something like this will be at least an order of magnitude faster without changing the memory usage. Other improvements will help too (and if you are interested in pushing it further, you should profile the code), but this is the big one.
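For the profiling suggestion, the standard library's cProfile is enough. A minimal sketch (the glob patterns and the timesteps value are hypothetical):

import cProfile
import pstats

gen = MyDataGenerator(["train/*.csv", "test/*.csv", "val/*.csv"])  # hypothetical paths
cProfile.run("gen.generate_overlapping_chunks(20)", "gen.prof")
pstats.Stats("gen.prof").sort_stats("cumtime").print_stats(15)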