全新安装 Torque 后,所有节点都显示为 down 状态,我不知道是什么原因。
[root@rbx-1 6.0.1]# pbsnodes -a
rbx-1
state = down
power_state = Running
np = 1
ntype = cluster
mom_service_port = 15002
mom_manager_port = 15003
rbx-2
state = down
power_state = Running
np = 1
ntype = cluster
mom_service_port = 15002
mom_manager_port = 15003
以下是 qmgr 的输出:
[root@rbx-1 6.0.1]# qmgr -c 'p s'
create queue batch
set queue batch queue_type = Execution
set queue batch resources_default.nodes = 1
set queue batch resources_default.walltime = 01:00:00
set queue batch enabled = True
set queue batch started = True
#
# Set server attributes.
#
set server scheduling = True
set server acl_hosts = rbx-1
set server managers = root@rbx-1
set server operators = root@rbx-1
set server default_queue = batch
set server log_events = 2047
set server mail_from = adm
set server node_check_rate = 150
set server tcp_timeout = 300
set server job_stat_rate = 300
set server poll_jobs = True
set server down_on_error = True
set server mom_job_sync = True
set server keep_completed = 300
set server next_job_number = 0
set server moab_array_compatible = True
set server nppcu = 1
set server timeout_for_job_delete = 120
set server timeout_for_job_requeue = 120
请帮帮忙——我不知道是什么导致了这个问题,也不知道下一步该尝试什么。任何相关的教程或其他方面的建议都会有所帮助。
答案 0 :(得分:0)
尝试运行 momctl -d0 -h rbx-1
以查看 MOM 是否正在与服务器通信。请确保 server_name 文件中的主机名与服务器和计算节点上 /etc/hosts 中的条目一致。我猜是节点上的 /etc/hosts 中缺少短主机名。