Torque 作业在执行 mpirun 时卡住

时间:2017-08-25 09:11:19

标签: linux mpi batch-processing pbs torque

我运行了以下PBS脚本,以使用我新配置的Torque运行作业:

#!/bin/sh
#PBS -N asyn
#PBS -q batch
#PBS -l nodes=2:ppn=2
#PBS -l walltime=120:00:00
cd $PBS_O_WORKDIR
cat $PBS_NODEFILE>nodes
mpirun -np 4 gmx_mpi mdrun -deffnm asyn_10ns

作业一直停留在 R(运行)状态,已用时间始终是 00:00:00。我用 tracejob 检查时,得到以下详细信息:

[root@headnode ~]# tracejob 11
/var/spool/torque/mom_logs/20170825: No such file or directory

Job: 11.headnode

08/25/2017 13:49:31.230 S    enqueuing into batch, state 1 hop 1
08/25/2017 13:49:31.360 S    Job Modified at request of root@headnode
08/25/2017 13:49:31.373 L    Job Run
08/25/2017 13:49:31.361 S    Job Run at request of root@headnode
08/25/2017 13:49:31.374 S    Not sending email: User does not want mail of this type.
08/25/2017 13:49:31  A    queue=batch
08/25/2017 13:49:31  A    user=souparno group=souparno jobname=asyn queue=batch ctime=1503649171 qtime=1503649171 etime=1503649171
                          start=1503649171 owner=souparno@headnode exec_host=headnode3/0-1+headnode2/0-1 Resource_List.nodes=2:ppn=2
                          Resource_List.walltime=120:00:00 Resource_List.nodect=2 Resource_List.neednodes=2:ppn=2 

sched_log给了我以下详细信息:

08/25/2017 13:49:31.373;64; pbs_sched.25166;Job;11.headnode;Job Run

server_log提供以下输出:

08/25/2017 13:49:31.230;256;PBS_Server.25216;Job;11.headnode;enqueuing into batch, state 1 hop 1
08/25/2017 13:49:31.230;08;PBS_Server.25216;Job;perform_commit_work;job_id: 11.headnode
08/25/2017 13:49:31.230;02;PBS_Server.25216;node;close_conn;Closing connection 8 and calling its accompanying function on close
08/25/2017 13:49:31.360;08;PBS_Server.25134;Job;11.headnode;Job Modified at request of root@headnode
08/25/2017 13:49:31.361;08;PBS_Server.25134;Job;11.headnode;Job Run at request of root@headnode
08/25/2017 13:49:31.374;13;PBS_Server.25134;Job;11.headnode;Not sending email: User does not want mail of this type.
08/25/2017 13:50:59.137;02;PBS_Server.25119;Svr;PBS_Server;Torque Server Version = 6.1.1.1, loglevel = 0

这个作业可能出了什么问题?另外需要说明的是,脚本中应当生成的文件 nodes 也没有被创建。

pbsnodes -a给出以下输出:

[root@headnode ~]# pbsnodes -a
headnode2
     state = free
     power_state = Running
     np = 22
     ntype = cluster
     jobs = 0-1/18.headnode
     status = opsys=linux,uname=Linux headnode2 2.6.32-358.el6.x86_64 #1 SMP Tue Jan 29 11:47:41 EST 2013 x86_64,sessions=2406 3695 3699 3701 3731 3733 3757,nsessions=7,nusers=2,idletime=2901,totmem=82448372kb,availmem=80025348kb,physmem=49401852kb,ncpus=24,loadave=23.00,gres=,netload=1677000736,state=free,varattr= ,cpuclock=OnDemand:2301MHz,macaddr=34:40:b5:e5:4a:fa,version=6.1.1.1,rectime=1503919171,jobs=18.headnode
     mom_service_port = 15002
     mom_manager_port = 15003

headnode3
     state = free
     power_state = Running
     np = 22
     ntype = cluster
     jobs = 0-1/18.headnode
     status = opsys=linux,uname=Linux headnode3 2.6.32-358.el6.x86_64 #1 SMP Tue Jan 29 11:47:41 EST 2013 x86_64,sessions=3570 3574 3576 3602 3604 3628 3803 32545,nsessions=8,nusers=3,idletime=882,totmem=98996200kb,availmem=97047600kb,physmem=65949680kb,ncpus=24,loadave=16.00,gres=,netload=1740623635,state=free,varattr= ,cpuclock=OnDemand:2301MHz,macaddr=34:40:b5:e5:43:52,version=6.1.1.1,rectime=1503919176,jobs=18.headnode
     mom_service_port = 15002
     mom_manager_port = 15003

headnode4
     state = free
     power_state = Running
     np = 22
     ntype = cluster
     status = opsys=linux,uname=Linux headnode4 2.6.32-358.el6.x86_64 #1 SMP Tue Jan 29 11:47:41 EST 2013 x86_64,sessions=3592 4057 27119,nsessions=3,nusers=1,idletime=73567,totmem=98991080kb,availmem=96941208kb,physmem=65944560kb,ncpus=24,loadave=23.99,gres=,netload=727722516,state=free,varattr= ,cpuclock=OnDemand:2200MHz,macaddr=34:40:b5:e5:49:8a,version=6.1.1.1,rectime=1503919177,jobs=
     mom_service_port = 15002
     mom_manager_port = 15003

headnode5
     state = free
     power_state = Running
     np = 22
     ntype = cluster
     status = opsys=linux,uname=Linux headnode5 2.6.32-358.el6.x86_64 #1 SMP Tue Jan 29 11:47:41 EST 2013 x86_64,sessions=17666,nsessions=1,nusers=1,idletime=2897,totmem=74170352kb,availmem=71840968kb,physmem=49397752kb,ncpus=24,loadave=23.04,gres=,netload=5756452931,state=free,varattr= ,cpuclock=OnDemand:2200MHz,macaddr=34:40:b5:e5:4a:a2,version=6.1.1.1,rectime=1503919174,jobs=
     mom_service_port = 15002
     mom_manager_port = 15003

headnode6
     state = free
     power_state = Running
     np = 22
     ntype = cluster
     status = opsys=linux,uname=Linux headnode6 2.6.32-358.el6.x86_64 #1 SMP Tue Jan 29 11:47:41 EST 2013 x86_64,sessions=3678 24197,nsessions=2,nusers=1,idletime=70315,totmem=98991080kb,availmem=97279540kb,physmem=65944560kb,ncpus=24,loadave=16.00,gres=,netload=711846161,state=free,varattr= ,cpuclock=OnDemand:2200MHz,macaddr=34:40:b5:e5:44:52,version=6.1.1.1,rectime=1503919171,jobs=
     mom_service_port = 15002
     mom_manager_port = 15003

0 个答案:

没有答案