I am writing a toy example for loading csv data in TensorFlow. I implemented three kinds of data loaders in TensorFlow and PyTorch to compare their speed. Here is the code.

First, using the TensorFlow API tf.data.experimental.CsvDataset:
import time

import tensorflow as tf


def parse_data(x, n_classes):
    # x arrives as a tuple of 430 scalar tensors; convert it into one vector
    x = tf.convert_to_tensor(x)
    return x[:-1], tf.one_hot(indices=tf.cast(x[-1], tf.int32), depth=n_classes)


if __name__ == '__main__':
    dataset_train = tf.data.experimental.CsvDataset('/home/david/Dataset/timit/test.csv', [tf.float32] * 430,
                                                    header=False,
                                                    field_delim=' ')
    dataset_train = dataset_train.map(lambda *x_: parse_data(x_, 1928))
    dataset_train = dataset_train.batch(128)
    dataset_train = dataset_train.prefetch(1)
    iterator = dataset_train.make_initializable_iterator()
    x_in, y = iterator.get_next()

    x = tf.layers.Dense(units=1024, activation=tf.nn.relu)(x_in)
    x = tf.layers.Dense(units=1024, activation=tf.nn.relu)(x)
    x = tf.layers.Dense(units=1024, activation=tf.nn.relu)(x)
    x = tf.layers.Dense(units=1024, activation=tf.nn.relu)(x)
    logits = tf.layers.Dense(units=1928, activation=None)(x)
    loss = tf.losses.softmax_cross_entropy(y, logits)
    optimizer = tf.train.AdamOptimizer()
    optimizer.minimize(loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(iterator.initializer)

    running_loss = 0.0
    time_last = time.time()
    epoch = 0
    i = 0
    while True:
        try:
            running_loss += sess.run(loss)  # , feed_dict={x: data, y: labels})
            if (i + 1) % 5 == 0:
                print('\r[epoch: %2d, batch: %5d, time: %5f] loss: %.3f' % (
                    epoch + 1, i + 1, time.time() - time_last, running_loss / i), end=' ')
                time_last = time.time()
            i += 1
        except tf.errors.OutOfRangeError:
            break  # dataset exhausted; 'pass' here would loop forever
Second, using pandas and tf.placeholder:
import time

import numpy as np
import pandas as pd
import tensorflow as tf

if __name__ == '__main__':
    x_in = tf.placeholder(shape=[128, 429], dtype=tf.float32)
    y_in = tf.placeholder(shape=[128], dtype=tf.int32)
    y = tf.one_hot(y_in, depth=1928)

    x = tf.layers.Dense(units=1024, activation=tf.nn.relu)(x_in)
    x = tf.layers.Dense(units=1024, activation=tf.nn.relu)(x)
    x = tf.layers.Dense(units=1024, activation=tf.nn.relu)(x)
    x = tf.layers.Dense(units=1024, activation=tf.nn.relu)(x)
    logits = tf.layers.Dense(units=1928, activation=None)(x)
    loss = tf.losses.softmax_cross_entropy(y, logits)
    optimizer = tf.train.AdamOptimizer()
    optimizer.minimize(loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # read the whole csv into memory once with pandas
    w = pd.read_csv('/home/david/Dataset/timit/test.csv', header=None, delim_whitespace=True).values
    for epoch in range(23):
        running_loss = 0.0
        time_last = time.time()
        i = 0
        indexes = np.random.permutation(w.shape[0])
        w_ = w[indexes, :]
        while True:
            if i * 128 + 128 > w.shape[0]:
                break
            running_loss += sess.run(loss,
                                     feed_dict={x_in: w_[i * 128:i * 128 + 128, :-1],
                                                y_in: w_[i * 128:i * 128 + 128, -1]})
            if (i + 1) % 5 == 0:
                print('\r[epoch: %2d, batch: %5d, time: %5f] loss: %.3f' % (
                    epoch + 1, i + 1, time.time() - time_last, running_loss / i), end=' ')
                time_last = time.time()
            i += 1
Third, with PyTorch and pandas:
import time

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(429, 1024)
        self.fc2 = nn.Linear(1024, 1024)
        self.fc3 = nn.Linear(1024, 1024)
        self.fc4 = nn.Linear(1024, 1024)
        self.fc5 = nn.Linear(1024, 1928)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = self.fc5(x)
        return x


class CsvDataset(data.Dataset):
    """Dataset that loads a whole whitespace-delimited csv file into memory.

    Args:
        csv_file (string): Path to the csv file; the last column is the label.
    """

    def __init__(self, csv_file):
        self.landmarks_frame = pd.read_csv(csv_file, header=None, delim_whitespace=True)

    def __len__(self):
        return len(self.landmarks_frame)

    def __getitem__(self, idx):
        landmarks = self.landmarks_frame.values[idx, :]
        return landmarks[:-1], landmarks[-1]


if __name__ == '__main__':
    net = Net()
    device = torch.device('cuda:0')
    print(device)
    net.to(device)
    optimizer = optim.Adam(net.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    dataset = CsvDataset('/home/david/Dataset/timit/train.csv')
    trainloader = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=True)

    for epoch in range(23):
        running_loss = 0.0
        time_last = time.time()
        for i, batch in enumerate(trainloader):  # renamed from 'data' to avoid shadowing the module
            inputs, labels = batch
            inputs = inputs.float().to(device)
            labels = labels.long().to(device)
            optimizer.zero_grad()
            output = net(inputs)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            if (i + 1) % 5 == 0:
                print('\r[epoch: %2d, batch: %5d, time: %5f] loss: %.3f' % (
                    epoch + 1, i + 1, time.time() - time_last, running_loss / i), end=' ')
                time_last = time.time()
        print('')
    print('Finished Training')
I recorded the time cost of every five training batches:
My guess is that this is partly because tf.data.experimental.CsvDataset performs I/O on the csv file before every batch in order to extract the data (is that the reason, or is it something else?). Either way, it is far too slow compared with the other two. Is there any way to improve it? How can I set up the tf.data.experimental.CsvDataset API so that it loads all the csv data at the very beginning?

Or is it fair to say that tf.data.experimental.CsvDataset is only meant for csv datasets that are too large to fit in memory? The time cost seems unbearable otherwise.
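For context, here is a minimal sketch (not from the original post) of what "loading everything at the beginning" could look like with tf.data: read the file once with pandas and serve it from memory via tf.data.Dataset.from_tensor_slices. The path, the 430-column layout, and the 1928 classes are taken from the code above; the helper function itself is an assumption.

import pandas as pd
import tensorflow as tf


def make_in_memory_dataset(csv_path, n_classes=1928, batch_size=128):
    # Read the whole whitespace-delimited file once with pandas and build the
    # tf.data pipeline from the in-memory arrays instead of re-parsing the csv.
    w = pd.read_csv(csv_path, header=None, delim_whitespace=True).values
    features = w[:, :-1].astype('float32')
    labels = w[:, -1].astype('int32')
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    dataset = dataset.map(lambda f, l: (f, tf.one_hot(l, depth=n_classes)))
    return dataset.shuffle(10000).batch(batch_size).prefetch(1)


# Alternative: keep CsvDataset but cache the parsed rows after the first pass:
# dataset_train = dataset_train.map(lambda *x_: parse_data(x_, 1928)).cache()
# dataset_train = dataset_train.batch(128).prefetch(1)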
Answer 0 (score: 0)
In your first example you could play with the batch size: if every batch is read from the file separately, you can prove this by doubling the batch size, in which case you could expect roughly a 2x speedup. I haven't played with the (experimental) CsvDataset class in TF.
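A sketch of how that experiment could be run: time a fixed number of batches at two batch sizes. The file path and column count come from the question; the timing helper and the chosen batch sizes are illustrative assumptions, not from the original post.

import time

import tensorflow as tf


def time_batches(batch_size, n_batches=50,
                 csv_path='/home/david/Dataset/timit/test.csv'):
    # Rebuild the CsvDataset pipeline with the given batch size and measure
    # how long it takes to pull n_batches batches from it.
    dataset = tf.data.experimental.CsvDataset(csv_path, [tf.float32] * 430,
                                              header=False, field_delim=' ')
    dataset = dataset.batch(batch_size).prefetch(1)
    iterator = dataset.make_initializable_iterator()
    next_batch = iterator.get_next()
    with tf.Session() as sess:
        sess.run(iterator.initializer)
        start = time.time()
        for _ in range(n_batches):
            sess.run(next_batch)
        return time.time() - start


# Compare how the time per batch scales with the batch size to see whether
# per-batch reading/parsing dominates the training time.
for bs in (128, 256):
    print('batch_size=%d: %.3fs' % (bs, time_batches(bs)))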
I am sure pandas reads your file considerably faster, which is part of the reason you see these timings.
As a next step, you should probably reconsider the loss function nn.CrossEntropyLoss(). Judging by the floating-point labels in the last column, this is most likely a regression problem rather than a classification one. So try torch.nn.MSELoss as the loss function.
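If you follow that suggestion, note that nn.MSELoss expects floating-point targets with the same shape as the network output, so the last layer would shrink to a single unit. A minimal, self-contained sketch, assuming the regression interpretation is correct (the tiny Sequential model and random tensors below are stand-ins, not the code from the question):

import torch
import torch.nn as nn

# Stand-in regression head: one output unit instead of 1928 class logits.
net = nn.Sequential(nn.Linear(429, 1024), nn.ReLU(), nn.Linear(1024, 1))
criterion = nn.MSELoss()

inputs = torch.randn(128, 429)    # dummy batch with the same feature width
targets = torch.randn(128, 1)     # float targets, same shape as the output
loss = criterion(net(inputs), targets)
loss.backward()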