环境: 1.框架:(TensorFlow,Keras,PyTorch,MXNet)Keras 2.框架版本:2.2.4 3. Horovod版本: 4. MPI版本: 5. CUDA版本:9.2 6. NCCL版本:2.2.12 7. Python版本:3 8.操作系统和版本:
清单: 1.您是否搜索过已有问题,以确认之前是否有人问过同样的问题? 是 2.如果您的问题与程序挂起有关,您是否读过[本文档](https://github.com/horovod/horovod/blob/master/docs/running.md)? 是 3.如果您的问题与Docker有关,您是否读过本文档(this doc)? 否
您的问题: 抱歉,如果我漏填了某些版本号。我真的很想开始在Databricks上使用Horovod。我的任务是在Databricks上运行一个大型人脸分类算法;集群已配置多个节点,具体是两个AWS EC2 p2.xlarge类型的工作节点(worker)和一个相同配置的驱动节点(driver)。每个节点都有一块GPU,显存为11GB。
我试图根据Databricks网站上发布的教程来配置Horovod,代码粘贴如下(已删除不必要的部分):
# |||||||||||||||||||||||||||||||||||||||||||||||||||||| #
# modified version to utilize horovod for distributed dl #
# |||||||||||||||||||||||||||||||||||||||||||||||||||||| #
#! /usr/bin/env python
import os
import horovod.keras as hvd
import numpy as np
import tensorflow as tf
# from keras.applications.mobilenet import DepthwiseConv2D
from keras import backend as K
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adadelta
from keras.layers import Activation, BatchNormalization, Conv2D, Dense, Dropout, Flatten, GaussianDropout, \
GlobalAveragePooling2D, MaxPooling2D, DepthwiseConv2D
from keras.models import Sequential, load_model
from keras.preprocessing import image
from keras.utils import plot_model
from keras.utils.np_utils import to_categorical
from sklearn.utils import class_weight
# Set TensorFlow's C++ log-level environment variable to '3'.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# Task selectors passed as the `C_or_R` argument of the functions in this file.
CLASSIFY = 0
REGRESS = 1
# OPTIONS #
# Side length (pixels) images are resized to; also the model's input size.
IMAGE_SIZE = 128
# batch_size changed from 32 -> 512
def load_images(C_or_R, paths, labels, batch_size=512, eval=False):
    """Endlessly yield ``(images, targets)`` batches for ``fit_generator``.

    Args:
        C_or_R: ``CLASSIFY`` or any other value (treated as regression).
            Selects the target width: 8 columns for classification,
            2 columns otherwise.
        paths: Sequence of image file paths.
        labels: Sequence of targets, index-aligned with ``paths``.
        batch_size: Number of samples per yielded batch.
        eval: When true, skip the random rotation/shift augmentation.

    Yields:
        Tuple ``(batch_d, batch_l)`` where ``batch_d`` has shape
        ``(batch_size, IMAGE_SIZE, IMAGE_SIZE, 3)`` with pixel values scaled
        by 1/255, and ``batch_l`` has shape ``(batch_size, 8)`` or
        ``(batch_size, 2)``.

    Raises:
        ValueError: If ``paths`` is empty.
    """
    n_samples = len(paths)
    if n_samples == 0:
        raise ValueError('load_images() needs at least one image path')
    label_width = 8 if C_or_R == CLASSIFY else 2

    # A single running cursor with modulo wrap-around replaces the previous
    # (batch number, offset) bookkeeping. The old scheme reset the batch
    # number in the middle of a batch while keeping the offset, which skipped
    # the first samples after every pass and raised IndexError whenever
    # len(paths) < batch_size.
    cursor = 0
    while True:
        batch_d = []
        batch_l = []
        for _ in range(batch_size):
            img = image.load_img(paths[cursor], target_size=(IMAGE_SIZE, IMAGE_SIZE))
            x = image.img_to_array(img) / 255
            if not eval:
                # The array is reshaped to (rows, cols, 3) below, i.e.
                # channels-last, while these helpers default to channels-first
                # axes (row_axis=1, col_axis=2, channel_axis=0); pass the
                # axes explicitly so the transform acts on the image plane.
                x = image.random_rotation(x, 20, row_axis=0, col_axis=1, channel_axis=2)
                x = image.random_shift(x, 0.1, 0.1, row_axis=0, col_axis=1, channel_axis=2)
            # Random horizontal flipping was disabled in the original code
            # and remains disabled.
            batch_d.append(x)
            batch_l.append(labels[cursor])
            cursor = (cursor + 1) % n_samples
        batch_d = np.array(batch_d).reshape((batch_size, IMAGE_SIZE, IMAGE_SIZE, 3))
        batch_l = np.array(batch_l).reshape((batch_size, label_width))
        yield (batch_d, batch_l)
def process_data(C_or_R, paths, labels):
    """Drop invalid samples and build the targets for the selected task.

    Args:
        C_or_R: ``CLASSIFY`` to build one-hot emotion targets; any other
            value builds ``[valence, arousal]`` regression targets.
        paths: Sequence of image paths, index-aligned with ``labels``.
        labels: Iterable of ``(emotion, valence, arousal)`` triples.

    Returns:
        Tuple ``(paths_out, labels_out, weights)``:
        * ``paths_out`` - paths of the samples that were kept.
        * ``labels_out`` - one-hot array with 8 columns (classification) or
          a list of ``[valence, arousal]`` pairs (regression).
        * ``weights`` - dict mapping class id to its 'balanced' class weight
          for classification, ``None`` for regression.
    """
    labels_out = []
    paths_out = []
    count = 0
    for i, (emotion, valence, arousal) in enumerate(labels):
        if C_or_R == CLASSIFY:
            if emotion > 7:
                # ignore invalid emotions
                continue
            labels_out.append(emotion)
        else:
            if arousal == -2 or valence == -2:
                # ignore invalid values
                continue
            labels_out.append([valence, arousal])
        paths_out.append(paths[i])
        count += 1
        print('Processed:', count, end='\r')

    if C_or_R == CLASSIFY:
        classes = np.unique(labels_out)
        balanced = class_weight.compute_class_weight(
            class_weight='balanced', classes=classes, y=labels_out)
        # Key each weight by its real class id. Enumerating the weights would
        # mis-assign them as soon as one emotion id is missing from the data,
        # because np.unique() only returns the ids that actually occur.
        weights = {int(cls): weight for cls, weight in zip(classes, balanced)}
        labels_out = to_categorical(labels_out, num_classes=8)
    else:
        weights = None
    print('Processed:', count)
    return paths_out, labels_out, weights
# added additional param – learning_rate #
def vgg_style_model(C_or_R, learning_rate, dropout=(0.2, 0.5)):
    """Build and compile the VGG-style network wrapped for Horovod.

    Args:
        C_or_R: ``CLASSIFY`` for an 8-way softmax head trained with
            categorical cross-entropy; any other value gives a 2-unit linear
            head trained with mean squared error.
        learning_rate: Base Adadelta learning rate; it is multiplied by
            ``hvd.size()`` before the optimizer is created.
        dropout: Pair ``(conv_rate, dense_rate)``: the GaussianDropout rate
            used after each conv block and the Dropout rate used after each
            dense block.

    Returns:
        The compiled ``Sequential`` model.
    """
    net = Sequential()

    # Five conv blocks: 2 x (Conv -> BatchNorm -> ReLU), then pool + dropout.
    for block_idx, filters in enumerate((16, 32, 64, 128, 128)):
        for conv_idx in range(2):
            if block_idx == 0 and conv_idx == 0:
                # Only the very first layer declares the input shape.
                conv = Conv2D(filters, (3, 3), input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3),
                              padding='same', use_bias=False)
            else:
                conv = Conv2D(filters, (3, 3), padding='same', use_bias=False)
            net.add(conv)
            net.add(BatchNormalization())
            net.add(Activation('relu'))
        net.add(MaxPooling2D(pool_size=(2, 2), padding='same'))
        net.add(GaussianDropout(dropout[0]))

    net.add(Flatten())

    # Two identical fully connected blocks.
    for _ in range(2):
        net.add(Dense(1024, use_bias=False))
        net.add(BatchNormalization())
        net.add(Activation('relu'))
        net.add(Dropout(dropout[1]))

    # Horovod: scale the learning rate by the number of workers, then wrap
    # the optimizer in Horovod's DistributedOptimizer.
    base_optimizer = Adadelta(learning_rate * hvd.size())
    optimizer = hvd.DistributedOptimizer(base_optimizer)

    # Output head.
    if C_or_R == CLASSIFY:
        net.add(Dense(8, activation='softmax'))
        net.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
        return net

    net.add(Dense(2, activation='linear'))
    net.compile(loss='mean_squared_error', optimizer=optimizer)
    return net
def regressor_from_classifier(model, drop=False):
    """Turn a trained classifier into a 2-output regressor.

    The last (classifier) layer is removed and replaced by a linear
    ``Dense(2)`` layer named ``'regressor_output'``; the model is then
    recompiled with mean squared error and the ``'adam'`` optimizer.

    Args:
        model: A ``Sequential`` model whose last layer is the classifier
            head. It is modified in place.
        drop: When true, insert ``Dropout(0.3)`` before the new output layer.

    Returns:
        The same model object, recompiled for regression.
    """
    # REMOVE CLASSIFIER LAYER
    # Use Sequential.pop() rather than mutating `model.layers` directly: on
    # Keras 2.2.x `model.layers` is a copy of the internal list, so
    # `model.layers.pop()` leaves the classifier in place and the regressor
    # would be stacked on top of the softmax output. pop() also rewires the
    # model outputs and clears the outbound nodes of the new last layer.
    model.pop()
    # ADD REGRESSOR OUTPUT
    if drop:
        model.add(Dropout(0.3))
    model.add(Dense(2, activation='linear', name='regressor_output'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model
def load_and_save(model, m):
    """Load trained weights into ``model`` and save it without dropout layers.

    Args:
        model: A freshly built model of the same architecture as the trained
            one.
        m: Path prefix; weights are read from ``m + '.h5'`` and the full
            model is written to ``m + '_out.h5'``.
    """
    model.load_weights(m + '.h5')
    # Iterate over a snapshot: removing from the list being iterated skips
    # the element that follows each removed one (e.g. two dropout layers in
    # a row).
    # NOTE(review): on Keras versions where `model.layers` returns a copy,
    # this removal has no effect on the saved model - confirm.
    for layer in list(model.layers):
        if type(layer) is Dropout or type(layer) is GaussianDropout:
            model.layers.remove(layer)
    model.save(m + '_out.h5')
def visualise(model, name):
    """Write a diagram of ``model`` to ``name + '.png'``.

    Layer shapes are shown; layer names are hidden.
    """
    target_file = name + '.png'
    plot_model(
        model,
        to_file=target_file,
        show_shapes=True,
        show_layer_names=False,
    )
def train(C_or_R, model, output_path, epochs, batch_size):
    """Train ``model`` on the Emosic data with Horovod and export the result.

    Training and validation paths/labels are loaded from the hard-coded
    ``/dbfs/FileStore/tables/Emosic/`` ``.npy`` files, filtered through
    ``process_data`` and streamed with ``load_images``.

    Args:
        C_or_R: ``CLASSIFY`` or ``REGRESS``; forwarded to ``process_data``
            and ``load_images``.
        model: Compiled Keras model to train.
        output_path: Prefix for every output file (``_T.h5`` checkpoint,
            ``_HIST`` history, ``_ARCH.json``, ``_WEIGHTS.h5``, ``_FULL.h5``).
        epochs: Number of training epochs.
        batch_size: Batch size used for both generators and for computing
            the number of steps per epoch.
    """
    print('** LOADING DATA **')
    t_paths = np.load('/dbfs/FileStore/tables/Emosic/training_paths.npy')
    t_labels = np.load('/dbfs/FileStore/tables/Emosic/training_labels.npy')
    t_paths, t_labels, t_weights = process_data(C_or_R, t_paths, t_labels)
    v_paths = np.load('/dbfs/FileStore/tables/Emosic/validation_paths.npy')
    v_labels = np.load('/dbfs/FileStore/tables/Emosic/validation_labels.npy')
    # Validation class weights are not used.
    v_paths, v_labels, _ = process_data(C_or_R, v_paths, v_labels)

    print('** FITTING MODEL **')
    t_steps = len(t_labels) // batch_size
    print('** DONE t_steps **')
    v_steps = len(v_labels) // batch_size
    print('** DONE v_steps **')

    # Horovod: Broadcast initial variable states from rank 0
    # to all other processes. This is necessary to ensure
    # consistent initialization of all workers when training is
    # started with random weights or restored from a checkpoint.
    callbacks = [hvd.callbacks.BroadcastGlobalVariablesCallback(0)]

    # Horovod: Save checkpoints only on worker 0 to prevent
    # other workers from overwriting and corrupting them.
    is_rank_zero = hvd.rank() == 0
    if is_rank_zero:
        callbacks.append(ModelCheckpoint(output_path + '_T.h5', save_weights_only=True))

    if C_or_R == CLASSIFY:
        print('** START FITTING MODEL **')
    # One call serves both tasks: process_data() returns weights=None for
    # regression, and class_weight=None is the same as not passing it.
    history = model.fit_generator(
        load_images(C_or_R, t_paths, t_labels, batch_size),
        steps_per_epoch=t_steps,
        class_weight=t_weights,
        epochs=epochs,
        validation_data=load_images(C_or_R, v_paths, v_labels, batch_size, eval=True),
        validation_steps=v_steps,
        callbacks=callbacks)

    # For the same reason checkpoints are rank-0 only, the final export must
    # not be written by every worker to the same files.
    if not is_rank_zero:
        return

    print('** EXPORTING MODEL **')
    np.save(output_path + '_HIST', history.history)
    # Iterate over a snapshot so that removing a layer does not make the
    # loop skip the element that follows it.
    # NOTE(review): on Keras versions where `model.layers` returns a copy,
    # this removal has no effect on the exported model - confirm.
    for layer in list(model.layers):
        if type(layer) is Dropout or type(layer) is GaussianDropout:
            model.layers.remove(layer)
    model_json = model.to_json()
    with open(output_path + '_ARCH.json', 'w') as json_file:
        json_file.write(model_json)
    model.save_weights(output_path + '_WEIGHTS.h5')
    model.save(output_path + '_FULL.h5')
# if __name__ == '__main__':
#print('Haven\'t got around to a CLI for this')
#print('Choose a model from mobilenet_style_model, vgg_style_model, alexnet_style_model')
#print('C_or_R means classifier or regressor')
#print('Call train() on the model with appropriate C_or_R, output and train options')
#print('A pretrained classifier can be loaded and passed to regressor_from_classifier() to get a regression model which can be trained as normal with C_or_R=REGRESS')
#print('load_and_save() is used to trim unneeded data from a model to get the true output model, give it a fresh initialization of the right mdoel type and path to the trained model')
def train_hvd(learning_rate=1.0):
    """Horovod entry point: initialise Horovod, pin the GPU, train the VGG model.

    Args:
        learning_rate: Base learning rate forwarded to ``vgg_style_model``.
    """
    print('** INITIALIZING HOROVOD FOR DISTRIBUTED DL **')
    hvd.init()

    # Horovod: pin GPU to be used to process local rank (one GPU per process)
    session_config = tf.ConfigProto()
    # session_config.gpu_options.allow_growth = True
    session_config.gpu_options.visible_device_list = str(hvd.local_rank())
    K.set_session(tf.Session(config=session_config))

    # Classification run (CLASSIFY == 0): 1 epoch, test batch size of 256.
    classifier = vgg_style_model(CLASSIFY, learning_rate, [0.2, 0.5])
    train(CLASSIFY, classifier, "/dbfs/FileStore/tables/Emosic/model/M_VGG/", 1, 256)
----------------------------代码结束-------------------------- 但是一旦进入模型拟合阶段,各工作进程就会在fit_generator()中挂起。我必须承认,我在这方面没有任何工作经验;作为计算机科学研究生,我正在尝试学习并实际将算法应用到云端的分布式学习中。但程序没有给出任何错误信息,我也不知道如何解决该问题,因此,感谢您的帮助。
我还从两个工作节点上复制了stderr日志:
### stderr log page for app-20190403143014-0000/0 ###
[1,1]<stdout>:Processed: 3776
[1,1]<stdout>:** FITTING MODEL **
[1,1]<stdout>:** DONE t_steps **
[1,1]<stdout>:** DONE v_steps **
[1,1]<stdout>:** START FITTING MODEL **
[1,0]<stdout>:Epoch 1/1
[1,1]<stdout>:Epoch 1/1
[1,0]<stdout>:
[1,0]<stdout>:0402-145914-wins357-10-102-177-188:2132:2137 [0] misc/ibvwrap.cu:61 WARN Failed to open libibverbs.so[.1]
[1,0]<stdout>:0402-145914-wins357-10-102-177-188:2132:2137 [0] INFO Using internal Network Socket
[1,0]<stdout>:0402-145914-wins357-10-102-177-188:2132:2137 [0] INFO Using NCCL Low-latency algorithm for sizes below 16384
[1,0]<stdout>:0402-145914-wins357-10-102-177-188:2132:2137 [0] INFO NET : Using interface eth0:10.102.177.188<0>
[1,0]<stdout>:0402-145914-wins357-10-102-177-188:2132:2137 [0] INFO NET/Socket : 1 interfaces found
[1,0]<stdout>:NCCL version 2.2.12+cuda9.2
[1,1]<stdout>:
[1,1]<stdout>:0402-145914-wins357-10-102-177-169:2165:2173 [0] misc/ibvwrap.cu:61 WARN Failed to open libibverbs.so[.1]
[1,1]<stdout>:0402-145914-wins357-10-102-177-169:2165:2173 [0] INFO Using internal Network Socket
[1,1]<stdout>:0402-145914-wins357-10-102-177-169:2165:2173 [0] INFO Using NCCL Low-latency algorithm for sizes below 16384
[1,1]<stdout>:0402-145914-wins357-10-102-177-169:2165:2173 [0] INFO comm 0x7f340c33f5c0 rank 1 nranks 2
[1,1]<stdout>:0402-145914-wins357-10-102-177-169:2165:2173 [0] INFO NET : Using interface eth0:10.102.177.169<0>
[1,1]<stdout>:0402-145914-wins357-10-102-177-169:2165:2173 [0] INFO NET/Socket : 1 interfaces found
[1,0]<stdout>:0402-145914-wins357-10-102-177-188:2132:2137 [0] INFO comm 0x7f9c744ba650 rank 0 nranks 2
[1,0]<stdout>:0402-145914-wins357-10-102-177-188:2132:2137 [0] INFO Using 128 threads
[1,0]<stdout>:0402-145914-wins357-10-102-177-188:2132:2137 [0] INFO Min Comp Cap 3
[1,0]<stdout>:0402-145914-wins357-10-102-177-188:2132:2137 [0] INFO NCCL_SINGLE_RING_THRESHOLD=131072
[1,0]<stdout>:0402-145914-wins357-10-102-177-188:2132:2137 [0] INFO Ring 00 : 0 1
[1,0]<stdout>:0402-145914-wins357-10-102-177-188:2132:2137 [0] INFO Ring 01 : 0 1
[1,1]<stdout>:0402-145914-wins357-10-102-177-169:2165:2173 [0] INFO 0 -> 1 via NET/Socket/0
[1,0]<stdout>:0402-145914-wins357-10-102-177-188:2132:2137 [0] INFO 1 -> 0 via NET/Socket/0
[1,1]<stdout>:0402-145914-wins357-10-102-177-169:2165:2173 [0] INFO 0 -> 1 via NET/Socket/0
[1,0]<stdout>:0402-145914-wins357-10-102-177-188:2132:2137 [0] INFO 1 -> 0 via NET/Socket/0
[1,0]<stdout>:0402-145914-wins357-10-102-177-188:2132:2137 [0] INFO Launch mode Parallel
### stderr log page for app-20190403143014-0000/1 ###
19/04/03 14:31:55 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 3.7 KB, free 23.1 GB)
19/04/03 14:31:55 INFO TorrentBroadcast: Reading broadcast variable 0 took 136 ms
19/04/03 14:31:55 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 5.4 KB, free 23.1 GB)
Using TensorFlow backend.
2019-04-03 14:31:57.662179: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA
2019-04-03 14:31:57.770768: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:964] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2019-04-03 14:31:57.771162: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1432] Found device 0 with properties:
name: Tesla K80 major: 3 minor: 7 memoryClockRate(GHz): 0.8235
pciBusID: 0000:00:1e.0
totalMemory: 11.17GiB freeMemory: 11.10GiB
2019-04-03 14:31:57.771201: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1511] Adding visible gpu devices: 0
2019-04-03 14:31:58.096089: I tensorflow/core/common_runtime/gpu/gpu_device.cc:982] Device interconnect StreamExecutor with strength 1 edge matrix:
2019-04-03 14:31:58.096157: I tensorflow/core/common_runtime/gpu/gpu_device.cc:988] 0
2019-04-03 14:31:58.096176: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1001] 0: N
2019-04-03 14:31:58.096468: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1115] Created TensorFlow device (/device:GPU:0 with 10757 MB memory) -> physical GPU (device: 0, name: Tesla K80, pci bus id: 0000:00:1e.0, compute capability: 3.7)
19/04/03 14:31:58 INFO PythonRunner: Times: total = 2539, boot = 448, init = 1656, finish = 435
19/04/03 14:31:58 INFO Executor: Finished task 0.0 in stage 0.0 (TID 0). 1601 bytes result sent to driver
19/04/03 14:31:58 INFO CoarseGrainedExecutorBackend: Got assigned task 2
19/04/03 14:31:58 INFO Executor: Running task 1.0 in stage 1.0 (TID 2)
19/04/03 14:31:58 INFO TorrentBroadcast: Started reading broadcast variable 2
19/04/03 14:31:58 INFO MemoryStore: Block broadcast_2_piece0 stored as bytes in memory (estimated size 7.7 KB, free 23.1 GB)
19/04/03 14:31:58 INFO TorrentBroadcast: Reading broadcast variable 2 took 10 ms
19/04/03 14:31:58 INFO MemoryStore: Block broadcast_2 stored as values in memory (estimated size 10.6 KB, free 23.1 GB)
19/04/03 14:31:58 INFO TorrentBroadcast: Started reading broadcast variable 1
19/04/03 14:31:58 INFO MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 5.0 KB, free 23.1 GB)
19/04/03 14:31:58 INFO TorrentBroadcast: Reading broadcast variable 1 took 11 ms
19/04/03 14:31:58 INFO MemoryStore: Block broadcast_1 stored as values in memory (estimated size 488.0 B, free 23.1 GB)
Using TensorFlow backend.
19/04/03 14:32:00 INFO BarrierTaskContext: Task 2 from Stage 1(Attempt 0) has entered the global sync, current barrier epoch is 0.
19/04/03 14:32:02 INFO BarrierTaskContext: Task 2 from Stage 1(Attempt 0) finished global sync successfully, waited for 2 seconds, current barrier epoch is 1.
19/04/03 14:32:02 INFO BarrierTaskContext: Task 2 from Stage 1(Attempt 0) has entered the global sync, current barrier epoch is 1.
19/04/03 14:33:02 INFO BarrierTaskContext: Task 2 from Stage 1(Attempt 0) waiting under the global sync since 1554301922116, has been waiting for 60 seconds, current barrier epoch is 1.
19/04/03 14:34:02 INFO BarrierTaskContext: Task 2 from Stage 1(Attempt 0) waiting under the global sync since 1554301922116, has been waiting for 120 seconds, current barrier epoch is 1.
19/04/03 14:35:02 INFO BarrierTaskContext: Task 2 from Stage 1(Attempt 0) waiting under the global sync since 1554301922116, has been waiting for 180 seconds, current barrier epoch is 1.
19/04/03 14:36:02 INFO BarrierTaskContext: Task 2 from Stage 1(Attempt 0) waiting under the global sync since 1554301922116, has been waiting for 240 seconds, current barrier epoch is 1.
19/04/03 14:37:02 INFO BarrierTaskContext: Task 2 from Stage 1(Attempt 0) waiting under the global sync since 1554301922116, has been waiting for 300 seconds, current barrier epoch is 1.
因此,显然,第二个工作节点正在等待第一个工作节点,但是第一个工作节点只是停在了日志中"Launch mode Parallel"这一步。但为什么?我是否漏掉了某些应该进行的配置?预先感谢!