Question

我正在尝试在各自独立的 GPU 上训练多个 CNN，即每个 GPU 训练一个网络。因为我希望所有网络同时进行训练，所以我使用 multiprocessing 来同时启动训练。

我已经可以用下面给出的代码把它运行起来，但程序的行为是不确定的。有时它会抛出“无法序列化（pickle）SwigPyObject 对象”或“无法序列化（pickle）_thread.RLock 对象”这样的错误；有时它又会卡在 queue.get() 或 self.map_async(func, iterable, chunksize).get() 上。我查了一下，发现有人建议使用 pathos。我试过了，但即便如此，程序的行为仍然不稳定，问题也无法稳定复现。

我使用 TensorFlow 的 Dataset 和 Estimator API。另外，我使用 multiprocessing 的 Manager 提供的 Queue 和 dict（也试过 multiprocess 库）来跟踪可用的 GPU，这两个对象在进程之间共享。

这是我的CNN代码

import numpy as np
import os
import tensorflow as tf
import sys

class DNN():
    """Small CNN image classifier driven through the TF 1.x Estimator/Dataset APIs.

    ``train`` is written to run inside a worker process: it borrows one GPU id
    from a shared queue, restricts the process to that GPU, runs
    ``tf.estimator.train_and_evaluate`` and then hands the GPU id back.

    Attributes:
        number_of_classes: Size of the output layer and of the one-hot labels.
            When left as ``None`` it is inferred from the labels in ``train``.
        NUM_GPUS: Number of GPUs (and therefore parallel workers) callers
            should use.
    """

    # One seed shared by the graph-level seed and the dataset shuffles.
    _SEED = 123

    def __init__(self, epochs = 5, batch_size = 64, learning_rate = 0.001, data_dir = None, verbose = False,
                 number_of_classes = None, num_gpus = 1):
        """Store the hyper-parameters and configure TensorFlow logging.

        Args:
            epochs: Number of passes over the training data.
            batch_size: Batch size; also used as the shuffle buffer size.
            learning_rate: Learning rate handed to the Adam optimizer.
            data_dir: Stored on the instance; not read by this class.
            verbose: Log at INFO level when true, otherwise at ERROR level.
            number_of_classes: Number of target classes, or ``None`` to infer
                it from the integer labels passed to ``train``.
            num_gpus: Number of GPUs available for parallel training.
        """
        self._epochs = epochs
        self._batch_size = batch_size
        self._learning_rate = learning_rate
        self._data_dir = data_dir
        self.number_of_classes = number_of_classes
        self.NUM_GPUS = num_gpus

        if verbose:
            tf.logging.set_verbosity(tf.logging.INFO)
        else:
            tf.logging.set_verbosity(tf.logging.ERROR)

        self._image_shape = [300,300,3]

    def _resolve_number_of_classes(self, *label_sets):
        """Return the configured class count, or infer it from integer labels.

        Args:
            *label_sets: Sequences of integer class ids (for example the
                training and the validation labels).

        Returns:
            ``self.number_of_classes`` when it is set, otherwise the largest
            label found plus one.

        Raises:
            ValueError: If no class count is configured and every label
                sequence is empty.
        """
        configured = getattr(self, 'number_of_classes', None)
        if configured is not None:
            return configured
        maxima = [int(np.max(labels)) for labels in label_sets if np.size(labels)]
        if not maxima:
            raise ValueError('number_of_classes is not set and cannot be inferred from empty labels')
        return max(maxima) + 1

    def _build_net(self,inp):
        """Build the conv/pool stack and dense head; return the class logits."""
        with tf.name_scope('net'):
            x = tf.layers.conv2d(inp, 3, 8, padding = 'same', name = 'conv1')
            x = tf.layers.max_pooling2d(x, 3, strides = 2, name = 'pool1')
            x = tf.layers.conv2d(x, 5, 8, padding = 'same', name = 'conv2')
            x = tf.layers.max_pooling2d(x, 3, strides = 2, name = 'pool2')
            x = tf.layers.conv2d(x, 5, 16, padding = 'same', name = 'conv3')
            x = tf.layers.max_pooling2d(x, 3, strides = 2, name = 'pool3')
            x = tf.layers.flatten(x)
            x = tf.layers.dense(x, 50, name = 'dense1')
            x = tf.layers.dense(x, self.number_of_classes, name='output')
        return x

    def _loss_fn(self,logits,labels,mode):
        """Return the softmax cross-entropy between one-hot ``labels`` and ``logits``."""
        # Every mode other than TRAIN shares the eval scope, so the scope name
        # is always bound.
        if mode == tf.estimator.ModeKeys.TRAIN:
            loss_name = 'train_loss'
        else:
            loss_name = 'eval_loss'
        with tf.name_scope(loss_name):
            loss = tf.reduce_mean(tf.losses.softmax_cross_entropy(labels,logits))
        return loss

    def _opt_fn(self,loss):
        """Return the Adam training op that minimises ``loss`` and bumps the global step."""
        with tf.name_scope('optimizer'):
            opt = tf.train.AdamOptimizer(learning_rate=self._learning_rate).minimize(loss,global_step=tf.train.get_global_step())
        return opt

    def _acc(self,predictions,labels):
        """Return the mean accuracy of class-id ``predictions`` against one-hot ``labels``."""
        with tf.name_scope('acc'):
            acc = tf.reduce_mean(tf.cast(tf.equal(predictions, tf.argmax(labels,-1)), tf.float32))
        return acc

    def _inp_fn(self,dataset,mode):
        """Build the input pipeline and return the next ``(images, one_hot_labels)`` batch.

        Args:
            dataset: ``(image_paths, labels)`` pair accepted by
                ``tf.data.Dataset.from_tensor_slices``.
            mode: A ``tf.estimator.ModeKeys`` value. TRAIN repeats the data for
                the configured number of epochs; every other mode makes a
                single pass.
        """

        def input_parser(img_path, label):
            # convert the label to one-hot encoding
            one_hot = tf.one_hot(label, self.number_of_classes)
            one_hot = tf.reshape(one_hot,[self.number_of_classes])

            # read the image from file and bring it to the fixed input shape
            img_file = tf.read_file(img_path)
            img_decoded = tf.image.decode_jpeg(img_file, channels=self._image_shape[-1])
            img_decoded = tf.image.resize_images(img_decoded,self._image_shape[:2])
            img_decoded = tf.cast(img_decoded, tf.float32)
            img_decoded = tf.reshape(img_decoded,self._image_shape)
            return img_decoded, one_hot

        dataset = tf.data.Dataset.from_tensor_slices(dataset)

        if mode == tf.estimator.ModeKeys.TRAIN:
            repeat_count = self._epochs
        else:
            repeat_count = 1

        # The seed must be an integer; tf.set_random_seed() returns None.
        dataset = dataset.apply(tf.contrib.data.shuffle_and_repeat(self._batch_size, count = repeat_count, seed = self._SEED))
        dataset = dataset.apply(tf.contrib.data.map_and_batch(map_func = input_parser, batch_size = self._batch_size, num_parallel_batches = 4))
        dataset = dataset.prefetch(1)

        iterator = dataset.make_one_shot_iterator()
        return iterator.get_next()

    def _model_fn(self,features,labels,mode,params):
        """Estimator ``model_fn``: return the ``EstimatorSpec`` for the requested mode."""
        logits = self._build_net(features)
        predictions = {"classes": tf.argmax(input = tf.nn.softmax(logits),axis = -1),
                       "probabilities": tf.nn.softmax(logits,name = "softmax")
                        }

        # No labels are supplied when predicting, so return before anything
        # that needs them is built.
        if mode == tf.estimator.ModeKeys.PREDICT:
            return tf.estimator.EstimatorSpec(mode = mode, predictions = predictions)

        loss = self._loss_fn(logits,labels,mode)
        acc_op = self._acc(predictions = predictions["classes"], labels = labels)

        if mode == tf.estimator.ModeKeys.TRAIN:
            opt = self._opt_fn(loss)
            tf.summary.scalar('train_accuracy', acc_op)
            return tf.estimator.EstimatorSpec(mode = mode, loss = loss, train_op = opt)

        eval_acc = tf.metrics.accuracy(labels=tf.argmax(labels,-1),predictions=predictions["classes"])
        eval_metric_ops = {"val_accuracy": eval_acc}
        tf.summary.scalar('val_accuracy',acc_op)
        return tf.estimator.EstimatorSpec(mode = mode, loss=loss, eval_metric_ops=eval_metric_ops)

    def train(self, train_data, val_data, queue, dict):
        """Train and evaluate the network on one GPU taken from ``queue``.

        Args:
            train_data: ``(image_paths, labels)`` used for training.
            val_data: ``(image_paths, labels)`` used for evaluation.
            queue: Shared queue of free GPU ids. One id is taken for the
                duration of the call and always put back afterwards.
            dict: Shared state passed in by the caller; not used here.

        Returns:
            The ``val_accuracy`` metric of the final evaluation.
        """
        x_train, y_train = train_data
        x_eval, y_eval = val_data
        self.number_of_classes = self._resolve_number_of_classes(y_train, y_eval)

        gpu_id = queue.get()
        try:
            # Restrict this process to the borrowed GPU before TensorFlow
            # opens a session; environment values must be strings.
            os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)

            # NOTE(review): every worker shares this model_dir, so concurrent
            # runs read and write the same checkpoints - confirm that is
            # intended, or give each network its own directory.
            run_config = tf.estimator.RunConfig(model_dir = os.path.join('logs','model_basic_cnn'), tf_random_seed = self._SEED, keep_checkpoint_max = 1, save_checkpoints_steps = 1000, save_checkpoints_secs = None)
            self.clf = tf.estimator.Estimator(model_fn = self._model_fn, config=run_config)

            # exist_ok avoids a check-then-create race between workers.
            os.makedirs(self.clf.eval_dir(), exist_ok = True)

            # With every other GPU masked out, the borrowed GPU is the only
            # one visible to this process and is enumerated as device 0.
            with tf.device('/gpu:0'):
                out = tf.estimator.train_and_evaluate(
                    self.clf,
                    train_spec = tf.estimator.TrainSpec(input_fn = lambda: self._inp_fn((x_train,y_train),mode=tf.estimator.ModeKeys.TRAIN)),
                    eval_spec = tf.estimator.EvalSpec(input_fn = lambda: self._inp_fn((x_eval,y_eval),mode=tf.estimator.ModeKeys.EVAL))
                    )
        finally:
            # Release the GPU even when training fails, so other workers
            # waiting on the queue are not blocked forever.
            queue.put(gpu_id)

        # The metric is registered under 'val_accuracy' in _model_fn.
        accuracy = out[0]['val_accuracy']
        print(f"Validation accuracy: {accuracy:.4f}")
        return accuracy

调用 multiprocessing 的代码如下：

#from multiprocessing import Pool, Manager
from pathos.multiprocessing import ProcessPool as Pool
from multiprocess import Manager
import dill

def train_pp(dnn, dict, man, pool):
    """Run ``dnn.train`` once per GPU in parallel and collect the results.

    Args:
        dnn: Model object exposing ``NUM_GPUS`` and ``train``.
        dict: Shared (manager-backed) dict forwarded to every ``train`` call.
        man: Manager used to create the queue of free GPU ids.
        pool: pathos ``ProcessPool`` that executes the training calls.

    Returns:
        ``(all_acc, dict)``: the list of values returned by the ``train``
        calls, and the shared dict that was passed in.
    """
    num_workers = dnn.NUM_GPUS

    # Seed the shared queue with one entry per GPU; each worker borrows one.
    queue = man.Queue(num_workers)
    for gpu_id in range(num_workers):
        queue.put(gpu_id)

    # pathos' map takes one iterable per positional parameter of the target,
    # so every argument of dnn.train is repeated once per worker.
    all_acc = pool.map(
        dnn.train,
        [(X_train, y_train)] * num_workers,
        [(X_eval, y_eval)] * num_workers,
        [queue] * num_workers,
        [dict] * num_workers,
    )

    pool.close()
    pool.join()
    pool.clear()
    pool.restart()
    return all_acc, dict

# Script entry point: build the model, the worker pool and the manager, then train.
if __name__ == "__main__":
    dnn = DNN()   
    # The pool is sized from the model's NUM_GPUS attribute.
    pool = Pool(processes=dnn.NUM_GPUS)
    # The manager supplies the shared dict created below; train_pp also uses it to create a queue.
    man = Manager()
    # NOTE(review): this name shadows the builtin `dict` for the rest of the module.
    dict = man.dict()
    # train_pp returns the collected results together with the dict it was given.
    net_acc, dict = train_pp(dnn,dict,man,pool)

同时训练多个网络的最佳方法是什么？multiprocessing 表现出随机行为的原因是什么？该如何解决？

PS：还有一个与主题关系不大的附加问题：当所有网络都用相同的数据训练时，应该如何实现数据共享？目前我认为 multiprocessing 会为 DNN 代码创建多个副本，因此每个 GPU 都有自己的迭代器。这是最好的做法吗？如何把数据放在 CPU 上，并在所有进程之间共享而不发生互相阻塞（互锁）？

使用 TensorFlow 和 multiprocessing 在多 GPU 系统上训练多个网络

0 个答案: