tf.contrib.distribute.MirroredStrategy() does not work (or I don't understand it)

Date: 2018-07-13 17:10:51

Tags: python tensorflow

There is a problem with my code. I am just trying to use the MirroredStrategy feature, but it does not work for me; all I get is an AssertionError.

0) Have I written custom code:

import math
import os
import time
import random
import numpy as np
import pickle
import errno
import tensorflow as tf


TOTAL_UTTERANCIES_COUNT = 902113
PREFETCH_SIZE = 20
N_for_V_and_S = 64
DATASETS_REPEAT_COUNT = 9999999999
BATCH_SIZE = 2000

GPU_NUM = 2

all_pdf_ids_set_filepath = "./pdf_ids_set_TOTAL.pckl"


def create_or_check_path(path):
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise


def loadPickleData(file):
    try:
        with open(file, "rb") as f:
            data = pickle.load(f)
        return data
    except Exception as e:
        print("loadPickleData():")
        print("EXCEPTION:", e)
        print("file: ", file)
        exit()


def saveDataAsPickle(file, data):
    create_or_check_path(os.path.dirname(file))
    with open(file, "wb") as f:
        pickle.dump(data, f)


def avg_mfcc_path_with_idx(idx):
    avg_mfcc_pickle_PATH = "./raw_mfcc_train.%d.ark.avg_mfcc.pckl" % idx
    return avg_mfcc_pickle_PATH


def renew_data_to_idx(idx):
    data_from_pickle = loadPickleData(avg_mfcc_path_with_idx(idx))
    sorted_data_utt_names = sorted(list(data_from_pickle.keys()))
    return data_from_pickle, sorted_data_utt_names

full_data_loaded_part, full_data_loaded_part_sorted_data_utt_names = renew_data_to_idx(1)

pdf_ids_set = loadPickleData(all_pdf_ids_set_filepath)
sorted_pdf_ids_list = sorted(list(pdf_ids_set))
TOTAL_PDF_IDS_COUNT = len(pdf_ids_set)


dict_pdf_ids_to_V_num = {}
dict_V_num_to_pdf_ids = {}

for i in range(len(sorted_pdf_ids_list)):
    dict_pdf_ids_to_V_num.update({sorted_pdf_ids_list[i]: i})
    dict_V_num_to_pdf_ids.update({i: sorted_pdf_ids_list[i]})


random_loaded_utt_name = random.choice(full_data_loaded_part_sorted_data_utt_names)
# infer the MFCC vector size from one arbitrary (utterance, pdf_id) entry
random_utt_data = full_data_loaded_part[random_loaded_utt_name]
MFCC_SIZE = len(random_utt_data[list(random_utt_data.keys())[0]])


all_data_from_pickle = {}
all_sorted_data_utt_names = []
# for data_num in range(1, 49):
for data_num in range(1, 2):
    data_from_pickle, sorted_data_utt_names = renew_data_to_idx(data_num)
    all_data_from_pickle.update(data_from_pickle)
    all_sorted_data_utt_names.extend(sorted_data_utt_names)
    print("Data #%d loaded" % data_num, flush=True)
print("Data loading complete", flush=True)


def full_data_vectors_generator():
    for idx_in_S in range(len(all_sorted_data_utt_names)):
        output_U_data = np.zeros([TOTAL_PDF_IDS_COUNT, MFCC_SIZE], dtype=np.float32)
        output_M_data = np.zeros([TOTAL_PDF_IDS_COUNT, MFCC_SIZE], dtype=np.float32)
        utt_data = all_data_from_pickle[all_sorted_data_utt_names[idx_in_S]]
        for pdf_id in list(utt_data.keys()):
            """ 
            pdf_id here is 'str'. for example: '1694'
            """
            idx_in_output_data = dict_pdf_ids_to_V_num[pdf_id]
            output_U_data[idx_in_output_data] += utt_data[pdf_id]
            output_M_data[idx_in_output_data] += 1  # mark vector: 1 for rows that received data, 0 otherwise (used as a mask in the loss)
        """
        yield S[i], U_data_vector, M_data_vector 
        """
        # print("DEBUG_counter:", tf.train.get_global_step(), flush=True)
        yield ((np.reshape(output_U_data, [TOTAL_PDF_IDS_COUNT*MFCC_SIZE]), np.reshape(output_M_data, [TOTAL_PDF_IDS_COUNT*MFCC_SIZE])), idx_in_S)


def make_full_data_dataset():

    full_data_dataset = tf.data.Dataset.from_generator(generator=full_data_vectors_generator,
                                                       output_types=((tf.float32, tf.float32), tf.int32),
                                                       output_shapes=((tf.TensorShape([None]), tf.TensorShape([None])),
                                                                      tf.TensorShape(None)))

    full_data_dataset = full_data_dataset.repeat(count=DATASETS_REPEAT_COUNT)
    # full_data_dataset = full_data_dataset.shuffle(buffer_size=1000) #
    full_data_dataset = full_data_dataset.batch(batch_size=BATCH_SIZE)

    #TEST PREFETCH TO DEVICE
    # transformation_function = tf.contrib.data.prefetch_to_device(device="/device:GPU:0", buffer_size=10)
    # full_data_dataset = full_data_dataset.apply(transformation_func=transformation_function)

    full_data_dataset = full_data_dataset.prefetch(buffer_size=PREFETCH_SIZE)

    return full_data_dataset

def model_fn(features, labels, mode, params):
    """
    Args:

    features: This is the x-arg from the input_fn.
    labels:   This is the y-arg from the input_fn,
              see e.g. train_input_fn for these two.
    mode:     Either TRAIN, EVAL, or PREDICT
    params:   User-defined hyper-parameters, e.g. learning-rate.
    """
    U_data = features[0]
    M_data = features[1]
    S_idxs = labels

    V = tf.Variable(tf.random_normal([TOTAL_PDF_IDS_COUNT, MFCC_SIZE, N_for_V_and_S], stddev=100.000001, mean=0.3), name="V")
    S = tf.Variable(tf.random_normal([TOTAL_UTTERANCIES_COUNT, N_for_V_and_S], stddev=100.000001, mean=0.3), name="S")
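    # NOTE: under MirroredStrategy, model_fn is called once per tower, so a second
    # copy of these variables ("tower_1/V", "tower_1/S") also appears in
    # tf.trainable_variables(); see the TRAINABLE_VARIABLES printout in the log below.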

    if mode == tf.estimator.ModeKeys.PREDICT:
        spec = tf.estimator.EstimatorSpec(mode=mode,
                                          predictions=1)

    else:
        reshaped_V = tf.reshape(V, [TOTAL_PDF_IDS_COUNT*MFCC_SIZE, N_for_V_and_S])
        loss_op = tf.reduce_sum(tf.square(tf.matmul(tf.gather(S, S_idxs), reshaped_V, transpose_b=True)-U_data)*M_data)

        optimizer = tf.train.GradientDescentOptimizer(0.00000001)
        print("TRAINABLE_VARIABLES:", tf.trainable_variables())
        print("TRAINABLE_VARIABLES[:1]:", tf.trainable_variables()[:1])
        print("TRAINABLE_VARIABLES[1:]:", tf.trainable_variables()[1:])
        train_op = optimizer.minimize(loss=loss_op, global_step=tf.train.get_global_step(), var_list=tf.trainable_variables())
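        # (var_list=tf.trainable_variables() is what minimize() uses by default anyway)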


        logging_hook = tf.train.LoggingTensorHook({"loss_custom_log_hook": loss_op}, every_n_iter=1)

        spec = tf.estimator.EstimatorSpec(
            mode=mode,
            loss=loss_op,
            train_op=train_op,
            training_hooks=[logging_hook])

    return spec


def make_var_dict_for_estimator(estmtr):
    estim_model_var_names = estmtr.get_variable_names()
    var_values_dict = {}
    for var in estim_model_var_names:
        var_values_dict[var] = estmtr.get_variable_value(var)
    return var_values_dict


tf.logging.set_verbosity(tf.logging.INFO)


if GPU_NUM > 1:
    distribution = tf.contrib.distribute.MirroredStrategy(num_gpus=GPU_NUM)
elif GPU_NUM == 1:
    distribution = None
else:
    print("ERROR: WRONG GPU_NUM = %d" % GPU_NUM)
    exit()

config = tf.estimator.RunConfig(train_distribute=distribution)
model = tf.estimator.Estimator(model_fn=model_fn,
                               params=[],
                               model_dir="./checkpoints_train/",
                               config=config)


model.train(input_fn=make_full_data_dataset, steps=300000)


"""
model.evaluate() will show some outputs, like 'loss' at final step and 'global_step':
for example: {'loss': 4.1929689e+12, 'global_step': 15} 
"""
model_evaluate_result = model.evaluate(input_fn=make_full_data_dataset, steps=3)
print("Evaluate result", model_evaluate_result)

1) OS platform and distribution: Linux UbuntuPC 4.4.0-130-generic #156-Ubuntu SMP Thu Jun 14 08:53:28 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux

2) TensorFlow installed from: sudo pip3.5 install ***

3) TensorFlow version:
Checked two versions of tf:
     a) 1.10.nightly
     tf_nightly_gpu-1.10.0.dev20180620-cp35-cp35m-manylinux1_x86_64.whl
     b) 1.9.rc2
     tensorflow_gpu-1.9.0rc2-cp35-cp35m-manylinux1_x86_64.whl

4) Bazel version:

Extracting Bazel installation...
  Build label: 0.13.1
  Build target: bazel-out/k8-opt/bin/src/main/java/com/google/devtools/build/lib/bazel/BazelServer_deploy.jar
  Build time: Wed May 23 11:17:23 2018 (1527074243)
  Build timestamp: 1527074243
  Build timestamp as int: 1527074243

5) CUDA/cuDNN version:
CUDA: 9.0.176.2
cuDNN: 7.1.2

6) GPU model and memory:
Nvidia GeForce GTX 1080, 8 GB

7) Exact command to reproduce:

$ python3.5 my_tf_script.py

It works when I use only 1 GPU (GPU_NUM = 1 on line 17).
But when I use the "magic words"
distribution = tf.contrib.distribute.MirroredStrategy(num_gpus=GPU_NUM)
the next step fails at model.train(). So here is my error:
++++++++++++++++ ERROR: ++++++++++++++++

    /usr/bin/python3.5 /home/user/SOME_PATH/bin/my_tf_script.py
Data #1 loaded
Data loading complete
INFO:tensorflow:Using config: {'_master': '', '_task_type': 'worker', '_service': None, '_save_checkpoints_steps': None, '_tf_random_seed': None, '_session_config': None, '_save_summary_steps': 100, '_is_chief': True, '_global_id_in_cluster': 0, '_save_checkpoints_secs': 600, '_keep_checkpoint_every_n_hours': 10000, '_keep_checkpoint_max': 5, '_task_id': 0, '_num_worker_replicas': 1, '_model_dir': './checkpoints_train/', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f51e086b4a8>, '_device_fn': None, '_evaluation_master': '', '_num_ps_replicas': 0, '_log_step_count_steps': 100, '_train_distribute': <tensorflow.contrib.distribute.python.mirrored_strategy.MirroredStrategy object at 0x7f51e086b5f8>}
2018-07-14 00:04:57.193208: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1392] Found device 0 with properties: 
name: GeForce GTX 1080 major: 6 minor: 1 memoryClockRate(GHz): 1.7335
pciBusID: 0000:83:00.0
totalMemory: 7.92GiB freeMemory: 7.80GiB
2018-07-14 00:04:57.489755: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1392] Found device 1 with properties: 
name: GeForce GTX 1080 major: 6 minor: 1 memoryClockRate(GHz): 1.7335
pciBusID: 0000:84:00.0
totalMemory: 7.92GiB freeMemory: 7.80GiB
2018-07-14 00:04:57.491371: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1471] Adding visible gpu devices: 0, 1
2018-07-14 00:04:58.205491: I tensorflow/core/common_runtime/gpu/gpu_device.cc:952] Device interconnect StreamExecutor with strength 1 edge matrix:
2018-07-14 00:04:58.205548: I tensorflow/core/common_runtime/gpu/gpu_device.cc:958]      0 1 
2018-07-14 00:04:58.205562: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] 0:   N Y 
2018-07-14 00:04:58.205575: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] 1:   Y N 
2018-07-14 00:04:58.206129: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1084] Created TensorFlow device (/device:GPU:0 with 7534 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1080, pci bus id: 0000:83:00.0, compute capability: 6.1)
2018-07-14 00:04:58.282580: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1084] Created TensorFlow device (/device:GPU:1 with 7534 MB memory) -> physical GPU (device: 1, name: GeForce GTX 1080, pci bus id: 0000:84:00.0, compute capability: 6.1)
INFO:tensorflow:Device is available but not used by distribute strategy: /device:CPU:0
INFO:tensorflow:Configured nccl all-reduce.
2018-07-14 00:04:58.438084: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1471] Adding visible gpu devices: 0, 1
2018-07-14 00:04:58.438251: I tensorflow/core/common_runtime/gpu/gpu_device.cc:952] Device interconnect StreamExecutor with strength 1 edge matrix:
2018-07-14 00:04:58.438271: I tensorflow/core/common_runtime/gpu/gpu_device.cc:958]      0 1 
2018-07-14 00:04:58.438298: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] 0:   N Y 
2018-07-14 00:04:58.438315: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] 1:   Y N 
2018-07-14 00:04:58.438712: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1084] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 7534 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1080, pci bus id: 0000:83:00.0, compute capability: 6.1)
2018-07-14 00:04:58.438878: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1084] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:1 with 7534 MB memory) -> physical GPU (device: 1, name: GeForce GTX 1080, pci bus id: 0000:84:00.0, compute capability: 6.1)
INFO:tensorflow:Calling model_fn.
TRAINABLE_VARIABLES: [<tf.Variable 'V:0' shape=(3449, 13, 64) dtype=float32_ref>, <tf.Variable 'S:0' shape=(902113, 64) dtype=float32_ref>]
TRAINABLE_VARIABLES[:1]: [<tf.Variable 'V:0' shape=(3449, 13, 64) dtype=float32_ref>]
TRAINABLE_VARIABLES[1:]: [<tf.Variable 'S:0' shape=(902113, 64) dtype=float32_ref>]
INFO:tensorflow:Calling model_fn.
TRAINABLE_VARIABLES: [<tf.Variable 'V:0' shape=(3449, 13, 64) dtype=float32_ref>, <tf.Variable 'S:0' shape=(902113, 64) dtype=float32_ref>, <tf.Variable 'tower_1/V:0' shape=(3449, 13, 64) dtype=float32_ref>, <tf.Variable 'tower_1/S:0' shape=(902113, 64) dtype=float32_ref>]
TRAINABLE_VARIABLES[:1]: [<tf.Variable 'V:0' shape=(3449, 13, 64) dtype=float32_ref>]
TRAINABLE_VARIABLES[1:]: [<tf.Variable 'S:0' shape=(902113, 64) dtype=float32_ref>, <tf.Variable 'tower_1/V:0' shape=(3449, 13, 64) dtype=float32_ref>, <tf.Variable 'tower_1/S:0' shape=(902113, 64) dtype=float32_ref>]
INFO:tensorflow:batch_all_reduce invoked for batches size = 1 with algorithm = nccl, num_packs = 1, agg_small_grads_max_bytes = 0 and agg_small_grads_max_group = 10
WARNING:tensorflow:Efficient allreduce is not supported for IndexedSlices.
INFO:tensorflow:Error reported to Coordinator: 
Traceback (most recent call last):
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/coordinator.py", line 297, in stop_on_exception
    yield
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/distribute/python/mirrored_strategy.py", line 273, in _call_for_each_tower
    self, *merge_args, **merge_kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/optimizer.py", line 685, in _distributed_apply
    for grad, var in grads_and_vars
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/optimizer.py", line 686, in <listcomp>
    for op in distribution.unwrap(distribution.update(var, update, grad))
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/distribute.py", line 894, in update
    return self._update(var, fn, *args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/distribute/python/mirrored_strategy.py", line 325, in _update
    assert isinstance(var, values.MirroredVariable)
AssertionError
Traceback (most recent call last):
  File "tst.py", line 198, in <module>
    model.train(input_fn=make_full_data_dataset, steps=300000)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py", line 375, in train
    loss = self._train_model(input_fn, hooks, saving_listeners)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py", line 1131, in _train_model
    return self._train_model_distributed(input_fn, hooks, saving_listeners)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py", line 1171, in _train_model_distributed
    self.config)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/distribute.py", line 811, in call_for_each_tower
    return self._call_for_each_tower(fn, *args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/distribute/python/mirrored_strategy.py", line 279, in _call_for_each_tower
    coord.join(threads)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/coordinator.py", line 389, in join
    six.reraise(*self._exc_info_to_raise)
  File "/usr/local/lib/python3.5/dist-packages/six.py", line 693, in reraise
    raise value
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/coordinator.py", line 297, in stop_on_exception
    yield
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/distribute/python/mirrored_strategy.py", line 273, in _call_for_each_tower
    self, *merge_args, **merge_kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/optimizer.py", line 685, in _distributed_apply
    for grad, var in grads_and_vars
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/optimizer.py", line 686, in <listcomp>
    for op in distribution.unwrap(distribution.update(var, update, grad))
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/distribute.py", line 894, in update
    return self._update(var, fn, *args, **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/distribute/python/mirrored_strategy.py", line 325, in _update
    assert isinstance(var, values.MirroredVariable)
AssertionError
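
UPDATE: I suspect the "Efficient allreduce is not supported for IndexedSlices" warning above is related: tf.gather(S, S_idxs) makes the gradient for S an IndexedSlices. Below is a minimal sketch of the same pattern (a sparse gradient on a tf.Variable, trained through Estimator with MirroredStrategy). This is only my guess at the failing path, not a verified reduction:

import numpy as np
import tensorflow as tf


def input_fn():
    # batches of row indices into the embedding-like variable S
    idxs = np.arange(100, dtype=np.int32)
    return tf.data.Dataset.from_tensor_slices(idxs).repeat().batch(10)


def model_fn(features, labels, mode, params):
    S = tf.Variable(tf.random_normal([100, 8]), name="S")
    rows = tf.gather(S, features)  # gradient w.r.t. S is an IndexedSlices
    loss = tf.reduce_sum(tf.square(rows))
    train_op = tf.train.GradientDescentOptimizer(0.01).minimize(
        loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)


distribution = tf.contrib.distribute.MirroredStrategy(num_gpus=2)
config = tf.estimator.RunConfig(train_distribute=distribution)
estimator = tf.estimator.Estimator(model_fn=model_fn, config=config)
estimator.train(input_fn=input_fn, steps=5)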

0 Answers:

There are no answers yet.