I have a problem with my code. I was just trying out the MirroredStrategy feature; it does not work for me, and all I get is an AssertionError.
0) Have I written custom code: yes, the full script is below:
import math
import os
import time
import random
import numpy as np
import pickle
import errno
import tensorflow as tf
TOTAL_UTTERANCIES_COUNT = 902113
PREFETCH_SIZE = 20
N_for_V_and_S = 64
DATASETS_REPEAT_COUNT = 9999999999
BATCH_SIZE = 2000
GPU_NUM = 2
all_pdf_ids_set_filepath = "./pdf_ids_set_TOTAL.pckl"
def create_or_check_path(path):
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
def loadPickleData(file):
    try:
        with open(file, "rb") as f:
            data = pickle.load(f)
        return data
    except Exception as e:
        print("loadPickleData():")
        print("EXCEPTION:", e)
        print("file: ", file)
        exit()
def saveDataAsPickle(file, data):
    create_or_check_path(os.path.dirname(file))
    with open(file, "wb") as f:
        pickle.dump(data, f)
def avg_mfcc_path_with_idx(idx):
    avg_mfcc_pickle_PATH = "./raw_mfcc_train.%d.ark.avg_mfcc.pckl" % idx
    return avg_mfcc_pickle_PATH
def renew_data_to_idx(idx):
    data_from_pickle = loadPickleData(avg_mfcc_path_with_idx(idx))
    sorted_data_utt_names = sorted(list(data_from_pickle.keys()))
    return data_from_pickle, sorted_data_utt_names
full_data_loaded_part, full_data_loaded_part_sorted_data_utt_names = renew_data_to_idx(1)
pdf_ids_set = loadPickleData(all_pdf_ids_set_filepath)
sorted_pdf_ids_list = sorted(list(pdf_ids_set))
TOTAL_PDF_IDS_COUNT = len(pdf_ids_set)
dict_pdf_ids_to_V_num = {}
dict_V_num_to_pdf_ids = {}
for i in range(len(sorted_pdf_ids_list)):
    dict_pdf_ids_to_V_num.update({sorted_pdf_ids_list[i]: i})
    dict_V_num_to_pdf_ids.update({i: sorted_pdf_ids_list[i]})
random_loaded_utt_name = random.choice(full_data_loaded_part_sorted_data_utt_names)
MFCC_SIZE = len(full_data_loaded_part[random_loaded_utt_name][list(full_data_loaded_part[random_loaded_utt_name].keys())[0]])
all_data_from_pickle = {}
all_sorted_data_utt_names = []
# for data_num in range(1, 49):
for data_num in range(1, 2):
    data_from_pickle, sorted_data_utt_names = renew_data_to_idx(data_num)
    all_data_from_pickle.update(data_from_pickle)
    all_sorted_data_utt_names.extend(sorted_data_utt_names)
    print("Data #%d loaded" % data_num, flush=True)
print("Data loading complete", flush=True)
def full_data_vectors_generator():
    for idx_in_S in range(len(all_sorted_data_utt_names)):
        output_U_data = np.zeros([TOTAL_PDF_IDS_COUNT, MFCC_SIZE], dtype=np.float32)
        output_M_data = np.zeros([TOTAL_PDF_IDS_COUNT, MFCC_SIZE], dtype=np.float32)
        utt_data = all_data_from_pickle[all_sorted_data_utt_names[idx_in_S]]
        for pdf_id in list(utt_data.keys()):
            # pdf_id here is a 'str', for example: '1694'
            idx_in_output_data = dict_pdf_ids_to_V_num[pdf_id]
            output_U_data[idx_in_output_data] += utt_data[pdf_id]
            output_M_data[idx_in_output_data] += 1  # mark vector
        # yields ((U_data_vector, M_data_vector), index into S)
        # print("DEBUG_counter:", tf.train.get_global_step(), flush=True)
        yield ((np.reshape(output_U_data, [TOTAL_PDF_IDS_COUNT * MFCC_SIZE]),
                np.reshape(output_M_data, [TOTAL_PDF_IDS_COUNT * MFCC_SIZE])),
               idx_in_S)
def make_full_data_dataset():
    full_data_dataset = tf.data.Dataset.from_generator(
        generator=full_data_vectors_generator,
        output_types=((tf.float32, tf.float32), tf.int32),
        output_shapes=((tf.TensorShape([None]), tf.TensorShape([None])),
                       tf.TensorShape(None)))
    full_data_dataset = full_data_dataset.repeat(count=DATASETS_REPEAT_COUNT)
    # full_data_dataset = full_data_dataset.shuffle(buffer_size=1000)
    full_data_dataset = full_data_dataset.batch(batch_size=BATCH_SIZE)
    # TEST PREFETCH TO DEVICE
    # transformation_function = tf.contrib.data.prefetch_to_device(device="/device:GPU:0", buffer_size=10)
    # full_data_dataset = full_data_dataset.apply(transformation_func=transformation_function)
    full_data_dataset = full_data_dataset.prefetch(buffer_size=PREFETCH_SIZE)
    return full_data_dataset
def model_fn(features, labels, mode, params):
    """
    Args:
        features: This is the x-arg from the input_fn.
        labels: This is the y-arg from the input_fn,
            see e.g. train_input_fn for these two.
        mode: Either TRAIN, EVAL, or PREDICT
        params: User-defined hyper-parameters, e.g. learning-rate.
    """
    U_data = features[0]
    M_data = features[1]
    S_idxs = labels
    V = tf.Variable(tf.random_normal([TOTAL_PDF_IDS_COUNT, MFCC_SIZE, N_for_V_and_S], stddev=100.000001, mean=0.3), name="V")
    S = tf.Variable(tf.random_normal([TOTAL_UTTERANCIES_COUNT, N_for_V_and_S], stddev=100.000001, mean=0.3), name="S")
    if mode == tf.estimator.ModeKeys.PREDICT:
        spec = tf.estimator.EstimatorSpec(mode=mode, predictions=1)
    else:
        reshaped_V = tf.reshape(V, [TOTAL_PDF_IDS_COUNT * MFCC_SIZE, N_for_V_and_S])
        loss_op = tf.reduce_sum(tf.square(tf.matmul(tf.gather(S, S_idxs), reshaped_V, transpose_b=True) - U_data) * M_data)
        optimizer = tf.train.GradientDescentOptimizer(0.00000001)
        print("TRAINABLE_VARIABLES:", tf.trainable_variables())
        print("TRAINABLE_VARIABLES[:1]:", tf.trainable_variables()[:1])
        print("TRAINABLE_VARIABLES[1:]:", tf.trainable_variables()[1:])
        train_op = optimizer.minimize(loss=loss_op, global_step=tf.train.get_global_step(), var_list=tf.trainable_variables())
        logging_hook = tf.train.LoggingTensorHook({"loss_custom_log_hook": loss_op}, every_n_iter=1)
        spec = tf.estimator.EstimatorSpec(
            mode=mode,
            loss=loss_op,
            train_op=train_op,
            training_hooks=[logging_hook])
    return spec
def make_var_dict_for_estimator(estmtr):
    estim_model_var_names = estmtr.get_variable_names()
    var_values_dict = {}
    for var in estim_model_var_names:
        var_values_dict[var] = estmtr.get_variable_value(var)
    return var_values_dict
tf.logging.set_verbosity(tf.logging.INFO)
if GPU_NUM > 1:
    distribution = tf.contrib.distribute.MirroredStrategy(num_gpus=GPU_NUM)
elif GPU_NUM == 1:
    distribution = None
else:
    print("ERROR: WRONG GPU_NUM = %d" % GPU_NUM)
    exit()
config = tf.estimator.RunConfig(train_distribute=distribution)
model = tf.estimator.Estimator(model_fn=model_fn,
                               params=[],
                               model_dir="./checkpoints_train/",
                               config=config)
model.train(input_fn=make_full_data_dataset, steps=300000)
"""
model.evaluate() will show some outputs, like 'loss' at final step and 'global_step':
for example: {'loss': 4.1929689e+12, 'global_step': 15}
"""
model_evaluate_result = model.evaluate(input_fn=make_full_data_dataset, steps=3)
print("Evaluate result", model_evaluate_result)
1) OS Platform and Distribution: Linux UbuntuPC 4.4.0-130-generic #156-Ubuntu SMP Thu Jun 14 08:53:28 UTC 2018 x86_64 x86_64 x86_64 GNU/Linux
2) TensorFlow installed from: sudo pip3.5 install ***
3) TensorFlow version:
Checked with two versions of TF:
a) 1.10.nightly
tf_nightly_gpu-1.10.0.dev20180620-cp35-cp35m-manylinux1_x86_64.whl
b) 1.9.rc2
tensorflow_gpu-1.9.0rc2-cp35-cp35m-manylinux1_x86_64.whl
4) Bazel version:
Extracting Bazel installation...
Build label: 0.13.1
Build target: bazel-out/k8-opt/bin/src/main/java/com/google/devtools/build/lib/bazel/BazelServer_deploy.jar
Build time: Wed May 23 11:17:23 2018 (1527074243)
Build timestamp: 1527074243
Build timestamp as int: 1527074243
5) CUDA/cuDNN version:
CUDA - 9.0.176.2
cuDNN - 7.1.2
6) GPU model and memory:
Nvidia GeForce GTX 1080, 8 GB
7) Exact command to reproduce:
$ python3.5 my_tf_script.py
It works when I use only 1 GPU (GPU_NUM = 1 in the script above).
But as soon as I use the "magic words"
distribution = tf.contrib.distribute.MirroredStrategy(num_gpus=GPU_NUM)
the very next step fails at model.train().
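Possibly relevant (this is just my guess, not a confirmed diagnosis): the log below shows extra trainable variables tower_1/V:0 and tower_1/S:0, i.e. the second tower gets its own copies of V and S rather than MirroredVariables, and the assertion that fails is exactly assert isinstance(var, values.MirroredVariable). A minimal sketch of the variant I would expect to be mirrored, creating the variables via tf.get_variable() instead of bare tf.Variable(), with the same shapes and initializer parameters as in my model_fn:

# Hypothetical replacement for the two tf.Variable(...) lines in model_fn.
# Assumption: MirroredStrategy intercepts tf.get_variable(), so these would
# be created once as MirroredVariables instead of once per tower.
V = tf.get_variable(
    "V",
    shape=[TOTAL_PDF_IDS_COUNT, MFCC_SIZE, N_for_V_and_S],
    initializer=tf.random_normal_initializer(mean=0.3, stddev=100.000001))
S = tf.get_variable(
    "S",
    shape=[TOTAL_UTTERANCIES_COUNT, N_for_V_and_S],
    initializer=tf.random_normal_initializer(mean=0.3, stddev=100.000001))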
So here is my error:
++++++++++++++++ ERROR: ++++++++++++++++
/usr/bin/python3.5 /home/user/SOME_PATH/bin/my_tf_script.py
Data #1 loaded
Data loading complete
INFO:tensorflow:Using config: {'_master': '', '_task_type': 'worker', '_service': None, '_save_checkpoints_steps': None, '_tf_random_seed': None, '_session_config': None, '_save_summary_steps': 100, '_is_chief': True, '_global_id_in_cluster': 0, '_save_checkpoints_secs': 600, '_keep_checkpoint_every_n_hours': 10000, '_keep_checkpoint_max': 5, '_task_id': 0, '_num_worker_replicas': 1, '_model_dir': './checkpoints_train/', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f51e086b4a8>, '_device_fn': None, '_evaluation_master': '', '_num_ps_replicas': 0, '_log_step_count_steps': 100, '_train_distribute': <tensorflow.contrib.distribute.python.mirrored_strategy.MirroredStrategy object at 0x7f51e086b5f8>}
2018-07-14 00:04:57.193208: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1392] Found device 0 with properties:
name: GeForce GTX 1080 major: 6 minor: 1 memoryClockRate(GHz): 1.7335
pciBusID: 0000:83:00.0
totalMemory: 7.92GiB freeMemory: 7.80GiB
2018-07-14 00:04:57.489755: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1392] Found device 1 with properties:
name: GeForce GTX 1080 major: 6 minor: 1 memoryClockRate(GHz): 1.7335
pciBusID: 0000:84:00.0
totalMemory: 7.92GiB freeMemory: 7.80GiB
2018-07-14 00:04:57.491371: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1471] Adding visible gpu devices: 0, 1
2018-07-14 00:04:58.205491: I tensorflow/core/common_runtime/gpu/gpu_device.cc:952] Device interconnect StreamExecutor with strength 1 edge matrix:
2018-07-14 00:04:58.205548: I tensorflow/core/common_runtime/gpu/gpu_device.cc:958] 0 1
2018-07-14 00:04:58.205562: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] 0: N Y
2018-07-14 00:04:58.205575: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] 1: Y N
2018-07-14 00:04:58.206129: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1084] Created TensorFlow device (/device:GPU:0 with 7534 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1080, pci bus id: 0000:83:00.0, compute capability: 6.1)
2018-07-14 00:04:58.282580: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1084] Created TensorFlow device (/device:GPU:1 with 7534 MB memory) -> physical GPU (device: 1, name: GeForce GTX 1080, pci bus id: 0000:84:00.0, compute capability: 6.1)
INFO:tensorflow:Device is available but not used by distribute strategy: /device:CPU:0
INFO:tensorflow:Configured nccl all-reduce.
2018-07-14 00:04:58.438084: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1471] Adding visible gpu devices: 0, 1
2018-07-14 00:04:58.438251: I tensorflow/core/common_runtime/gpu/gpu_device.cc:952] Device interconnect StreamExecutor with strength 1 edge matrix:
2018-07-14 00:04:58.438271: I tensorflow/core/common_runtime/gpu/gpu_device.cc:958] 0 1
2018-07-14 00:04:58.438298: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] 0: N Y
2018-07-14 00:04:58.438315: I tensorflow/core/common_runtime/gpu/gpu_device.cc:971] 1: Y N
2018-07-14 00:04:58.438712: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1084] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 7534 MB memory) -> physical GPU (device: 0, name: GeForce GTX 1080, pci bus id: 0000:83:00.0, compute capability: 6.1)
2018-07-14 00:04:58.438878: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1084] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:1 with 7534 MB memory) -> physical GPU (device: 1, name: GeForce GTX 1080, pci bus id: 0000:84:00.0, compute capability: 6.1)
INFO:tensorflow:Calling model_fn.
TRAINABLE_VARIABLES: [<tf.Variable 'V:0' shape=(3449, 13, 64) dtype=float32_ref>, <tf.Variable 'S:0' shape=(902113, 64) dtype=float32_ref>]
TRAINABLE_VARIABLES[:1]: [<tf.Variable 'V:0' shape=(3449, 13, 64) dtype=float32_ref>]
TRAINABLE_VARIABLES[1:]: [<tf.Variable 'S:0' shape=(902113, 64) dtype=float32_ref>]
INFO:tensorflow:Calling model_fn.
TRAINABLE_VARIABLES: [<tf.Variable 'V:0' shape=(3449, 13, 64) dtype=float32_ref>, <tf.Variable 'S:0' shape=(902113, 64) dtype=float32_ref>, <tf.Variable 'tower_1/V:0' shape=(3449, 13, 64) dtype=float32_ref>, <tf.Variable 'tower_1/S:0' shape=(902113, 64) dtype=float32_ref>]
TRAINABLE_VARIABLES[:1]: [<tf.Variable 'V:0' shape=(3449, 13, 64) dtype=float32_ref>]
TRAINABLE_VARIABLES[1:]: [<tf.Variable 'S:0' shape=(902113, 64) dtype=float32_ref>, <tf.Variable 'tower_1/V:0' shape=(3449, 13, 64) dtype=float32_ref>, <tf.Variable 'tower_1/S:0' shape=(902113, 64) dtype=float32_ref>]
INFO:tensorflow:batch_all_reduce invoked for batches size = 1 with algorithm = nccl, num_packs = 1, agg_small_grads_max_bytes = 0 and agg_small_grads_max_group = 10
WARNING:tensorflow:Efficient allreduce is not supported for IndexedSlices.
INFO:tensorflow:Error reported to Coordinator:
Traceback (most recent call last):
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/coordinator.py", line 297, in stop_on_exception
yield
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/distribute/python/mirrored_strategy.py", line 273, in _call_for_each_tower
self, *merge_args, **merge_kwargs)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/optimizer.py", line 685, in _distributed_apply
for grad, var in grads_and_vars
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/optimizer.py", line 686, in <listcomp>
for op in distribution.unwrap(distribution.update(var, update, grad))
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/distribute.py", line 894, in update
return self._update(var, fn, *args, **kwargs)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/distribute/python/mirrored_strategy.py", line 325, in _update
assert isinstance(var, values.MirroredVariable)
AssertionError
Traceback (most recent call last):
File "tst.py", line 198, in <module>
model.train(input_fn=make_full_data_dataset, steps=300000)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py", line 375, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py", line 1131, in _train_model
return self._train_model_distributed(input_fn, hooks, saving_listeners)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/estimator/estimator.py", line 1171, in _train_model_distributed
self.config)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/distribute.py", line 811, in call_for_each_tower
return self._call_for_each_tower(fn, *args, **kwargs)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/distribute/python/mirrored_strategy.py", line 279, in _call_for_each_tower
coord.join(threads)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/coordinator.py", line 389, in join
six.reraise(*self._exc_info_to_raise)
File "/usr/local/lib/python3.5/dist-packages/six.py", line 693, in reraise
raise value
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/coordinator.py", line 297, in stop_on_exception
yield
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/distribute/python/mirrored_strategy.py", line 273, in _call_for_each_tower
self, *merge_args, **merge_kwargs)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/optimizer.py", line 685, in _distributed_apply
for grad, var in grads_and_vars
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/optimizer.py", line 686, in <listcomp>
for op in distribution.unwrap(distribution.update(var, update, grad))
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/training/distribute.py", line 894, in update
return self._update(var, fn, *args, **kwargs)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/contrib/distribute/python/mirrored_strategy.py", line 325, in _update
assert isinstance(var, values.MirroredVariable)
AssertionError