How can I improve the speed of tf.data.experimental.CsvDataset in TensorFlow 1.13.1?

Asked: 2019-06-22 09:45:34

Tags: tensorflow pytorch tensorflow-datasets

I am writing a toy example for loading CSV data in TensorFlow. I implemented three kinds of data loaders, in TensorFlow and in PyTorch, to compare their speed. Here is the code:

First, using the TensorFlow API tf.data.experimental.CsvDataset:

import time

import tensorflow as tf


def parse_data(x, n_classes):
    # Stack the per-column scalars into one feature vector; the last column is the integer label.
    x = tf.convert_to_tensor(x)
    return x[:-1], tf.one_hot(indices=tf.cast(x[-1], tf.int32), depth=n_classes)

if __name__=='__main__':
    dataset_train = tf.data.experimental.CsvDataset('/home/david/Dataset/timit/test.csv', [tf.float32] * 430,
                                                    header=False,
                                                    field_delim=' ')
    dataset_train = dataset_train.map(lambda *x_: parse_data(x_, 1928))
    dataset_train = dataset_train.batch(128)
    dataset_train = dataset_train.prefetch(1)
    iterator = dataset_train.make_initializable_iterator()

    x_in, y = iterator.get_next()

    x = tf.layers.Dense(units=1024, activation=tf.nn.relu)(x_in)
    x = tf.layers.Dense(units=1024, activation=tf.nn.relu)(x)
    x = tf.layers.Dense(units=1024, activation=tf.nn.relu)(x)
    x = tf.layers.Dense(units=1024, activation=tf.nn.relu)(x)
    logits = tf.layers.Dense(units=1928, activation=None)(x)

    loss = tf.losses.softmax_cross_entropy(y, logits)
    optimizer = tf.train.AdamOptimizer()
    optimizer.minimize(loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(iterator.initializer)
    running_loss = 0.0
    time_last = time.time()
    epoch = 0
    i = 0
    while True:
        try:
            running_loss += sess.run(loss)  # , feed_dict={x: data, y: labels})
            if (i + 1) % 5 == 0:
                print('\r[epoch: %2d, batch: %5d, time: %5f] loss: %.3f' % (
                    epoch + 1, i + 1, time.time() - time_last, running_loss / i), end=' ')
                time_last = time.time()
            i += 1
        except tf.errors.OutOfRangeError:
            # Dataset exhausted; break instead of swallowing the error and looping forever.
            break

Second, using pandas and tf.placeholder:

import time

import numpy as np
import pandas as pd
import tensorflow as tf

if __name__ == '__main__':
    x_in = tf.placeholder(shape=[128, 429], dtype=tf.float32)
    y_in = tf.placeholder(shape=[128], dtype=tf.int32)
    y = tf.one_hot(y_in, depth=1928)

    x = tf.layers.Dense(units=1024, activation=tf.nn.relu)(x_in)
    x = tf.layers.Dense(units=1024, activation=tf.nn.relu)(x)
    x = tf.layers.Dense(units=1024, activation=tf.nn.relu)(x)
    x = tf.layers.Dense(units=1024, activation=tf.nn.relu)(x)
    logits = tf.layers.Dense(units=1928, activation=None)(x)

    loss = tf.losses.softmax_cross_entropy(y, logits)
    optimizer = tf.train.AdamOptimizer()
    optimizer.minimize(loss)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    w = pd.read_csv('/home/david/Dataset/timit/test.csv', header=None, delim_whitespace=True).values

    for epoch in range(23):
        running_loss = 0.0
        time_last = time.time()
        i = 0
        indexes = np.random.permutation(w.shape[0])
        w_ = w[indexes, :]
        while True:
            if i * 128 + 128 > w.shape[0]:
                break
            running_loss += sess.run(loss,
                                     feed_dict={x_in: w_[i * 128:i * 128 + 128, :-1],
                                                y_in: w_[i * 128:i * 128 + 128, -1]})
            if (i + 1) % 5 == 0:
                print('\r[epoch: %2d, batch: %5d, time: %5f] loss: %.3f' % (
                    epoch + 1, i + 1, time.time() - time_last, running_loss / i), end=' ')
                time_last = time.time()
            i += 1

Third, using PyTorch and pandas:

import time

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils import data


class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()

        self.fc1 = nn.Linear(429, 1024)
        self.fc2 = nn.Linear(1024, 1024)
        self.fc3 = nn.Linear(1024, 1024)
        self.fc4 = nn.Linear(1024, 1024)
        self.fc5 = nn.Linear(1024, 1928)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = self.fc5(x)
        return x


class CsvDataset(data.Dataset):
    """Face Landmarks dataset."""

    def __init__(self, csv_file):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.landmarks_frame = pd.read_csv(csv_file, header=None, delim_whitespace=True)

    def __len__(self):
        return len(self.landmarks_frame)

    def __getitem__(self, idx):
        landmarks = self.landmarks_frame.values[idx, :]
        return landmarks[:-1], landmarks[-1]


if __name__ == '__main__':
    net = Net()
    device = torch.device('cuda:0')
    print(device)
    net.to(device)

    optimizer = optim.Adam(net.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    dataset = CsvDataset('/home/david/Dataset/timit/train.csv')
    trainloader = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=True)

    for epoch in range(23):

        running_loss = 0.0
        time_last = time.time()
        for i, data in enumerate(trainloader):
            inputs, labels = data
            inputs = inputs.float().to(device)
            labels = labels.long().to(device)

            optimizer.zero_grad()

            output = net(inputs)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if (i + 1) % 5 == 0:
                print('\r[epoch: %2d, batch: %5d, time: %5f] loss: %.3f' % (
                    epoch + 1, i + 1, time.time() - time_last, running_loss / i), end=' ')
                time_last = time.time()
        print('')
    print('Finished Training')

I recorded the time cost of five training batches for each loader:

  • First, CsvDataset: 1.382647 s
  • Second, placeholder: 0.013263 s
  • Third, PyTorch: 0.042086 s

I guess this is partly because tf.data.experimental.CsvDataset performs I/O on the csv file before every batch in order to extract the data (is that the reason, or is it something else?).

Either way, it is far too slow compared with the other two. Is there any room for improvement? How can I configure the tf.data.experimental.CsvDataset API so that it loads all of the csv data right at the start?

Or should tf.data.experimental.CsvDataset be considered an implementation only for csv datasets that are too large to fit in memory? Because this time cost seems unbearable.
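Something along these lines is what I was imagining, although I am not sure whether cache() is the right tool here (just a rough sketch, same file and column layout as above, untested):

# Hypothetical variant of the first pipeline: parse rows in parallel and keep the
# parsed tensors in memory with cache(), so the csv file is only read once.
dataset_train = tf.data.experimental.CsvDataset('/home/david/Dataset/timit/test.csv',
                                                [tf.float32] * 430,
                                                header=False,
                                                field_delim=' ')
dataset_train = dataset_train.map(lambda *x_: parse_data(x_, 1928),
                                  num_parallel_calls=4)
dataset_train = dataset_train.cache()  # keeps parsed rows in memory after the first pass
dataset_train = dataset_train.batch(128)
dataset_train = dataset_train.prefetch(1)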

1 Answer:

Answer 0 (score: 0):

In the first example it is probably the batch size you should play with: if a batch is read from the file every time, you can verify this by doubling the batch size, in which case you might expect the speed to improve by roughly 2x. I have not played with the (experimental) CsvDataset class in TF.
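For instance, testing that hypothesis only needs one changed line in the question's first pipeline (a sketch; everything else stays the same):

# Hypothetical experiment: double the batch size and compare the printed per-5-batch times.
# If per-row reading/parsing dominates, the time per batch should roughly double as well;
# if a fixed per-batch overhead dominates, throughput should roughly double instead.
dataset_train = dataset_train.batch(256)  # was 128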

I am quite sure that pandas reads your file faster, and that is part of the reason you are seeing these timings.
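If the whole file fits in RAM, one way to combine the two approaches is to read it once with pandas and then hand the arrays to tf.data; a minimal sketch, assuming the same 430-column, space-delimited layout as in the question:

import pandas as pd
import tensorflow as tf

# Hypothetical hybrid loader: pandas does the one-off csv parsing, tf.data only
# shuffles, batches and prefetches the arrays that are already in memory.
w = pd.read_csv('/home/david/Dataset/timit/test.csv',
                header=None, delim_whitespace=True).values
features = w[:, :-1].astype('float32')
labels = w[:, -1].astype('int32')

dataset = tf.data.Dataset.from_tensor_slices((features, labels))
dataset = dataset.map(lambda f, l: (f, tf.one_hot(l, depth=1928)))
dataset = dataset.shuffle(buffer_size=10000).batch(128).prefetch(1)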

Probably as a next step you should drop the loss function nn.CrossEntropyLoss(). Judging by the float labels at the end, this is most likely a regression problem rather than a classification problem.

So try using torch.nn.MSELoss as the loss function instead.
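If the targets really are continuous values rather than class indices, the swap would look roughly like this (a sketch; note that the last layer then outputs a single value instead of 1928 logits):

# Hypothetical regression variant, assuming the last csv column is a continuous target.
criterion = nn.MSELoss()
net.fc5 = nn.Linear(1024, 1).to(device)  # one continuous output instead of 1928 logits

# ... and inside the training loop:
labels = labels.float().to(device)   # keep the targets as floats
output = net(inputs).squeeze(1)      # shape [batch] to match the targets
loss = criterion(output, labels)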