我使用以下PBS脚本,在新配置好的Torque上提交并运行了一个作业:
#!/bin/sh
#PBS -N asyn
#PBS -q batch
#PBS -l nodes=2:ppn=2
#PBS -l walltime=120:00:00
# Torque job script "asyn": launches gmx_mpi mdrun through mpirun with 4
# processes, matching the nodes=2:ppn=2 request above.
# NOTE: keep the #PBS directives ahead of the first executable command.

# Run from the directory the job was submitted from. Abort with a message
# if the variable is unset/empty or the directory is not reachable, rather
# than silently running (and writing output) somewhere else.
cd "${PBS_O_WORKDIR:?PBS_O_WORKDIR is not set}" || {
  echo "cannot cd to $PBS_O_WORKDIR" >&2
  exit 1
}

# Save a copy of the allocated host list for later inspection. This file is
# only diagnostic, so a failure here is reported but does not stop the job.
cat "${PBS_NODEFILE:-}" > nodes ||
  echo "warning: could not copy PBS_NODEFILE to ./nodes" >&2

mpirun -np 4 gmx_mpi mdrun -deffnm asyn_10ns
作业一直停留在R状态,已用时间始终显示为00:00:00。我用tracejob检查该作业时,得到以下详细信息:
[root@headnode ~]# tracejob 11
/var/spool/torque/mom_logs/20170825: No such file or directory
Job: 11.headnode
08/25/2017 13:49:31.230 S enqueuing into batch, state 1 hop 1
08/25/2017 13:49:31.360 S Job Modified at request of root@headnode
08/25/2017 13:49:31.373 L Job Run
08/25/2017 13:49:31.361 S Job Run at request of root@headnode
08/25/2017 13:49:31.374 S Not sending email: User does not want mail of this type.
08/25/2017 13:49:31 A queue=batch
08/25/2017 13:49:31 A user=souparno group=souparno jobname=asyn queue=batch ctime=1503649171 qtime=1503649171 etime=1503649171
start=1503649171 owner=souparno@headnode exec_host=headnode3/0-1+headnode2/0-1 Resource_List.nodes=2:ppn=2
Resource_List.walltime=120:00:00 Resource_List.nodect=2 Resource_List.neednodes=2:ppn=2
sched_log给了我以下详细信息:
08/25/2017 13:49:31.373;64; pbs_sched.25166;Job;11.headnode;Job Run
server_log提供以下输出:
08/25/2017 13:49:31.230;256;PBS_Server.25216;Job;11.headnode;enqueuing into batch, state 1 hop 1
08/25/2017 13:49:31.230;08;PBS_Server.25216;Job;perform_commit_work;job_id: 11.headnode
08/25/2017 13:49:31.230;02;PBS_Server.25216;node;close_conn;Closing connection 8 and calling its accompanying function on close
08/25/2017 13:49:31.360;08;PBS_Server.25134;Job;11.headnode;Job Modified at request of root@headnode
08/25/2017 13:49:31.361;08;PBS_Server.25134;Job;11.headnode;Job Run at request of root@headnode
08/25/2017 13:49:31.374;13;PBS_Server.25134;Job;11.headnode;Not sending email: User does not want mail of this type.
08/25/2017 13:50:59.137;02;PBS_Server.25119;Svr;PBS_Server;Torque Server Version = 6.1.1.1, loglevel = 0
这个作业可能出了什么问题?另外需要说明的是,脚本中的文件"nodes"也没有被创建。
pbsnodes -a给出以下输出:
[root@headnode ~]# pbsnodes -a
headnode2
state = free
power_state = Running
np = 22
ntype = cluster
jobs = 0-1/18.headnode
status = opsys=linux,uname=Linux headnode2 2.6.32-358.el6.x86_64 #1 SMP Tue Jan 29 11:47:41 EST 2013 x86_64,sessions=2406 3695 3699 3701 3731 3733 3757,nsessions=7,nusers=2,idletime=2901,totmem=82448372kb,availmem=80025348kb,physmem=49401852kb,ncpus=24,loadave=23.00,gres=,netload=1677000736,state=free,varattr= ,cpuclock=OnDemand:2301MHz,macaddr=34:40:b5:e5:4a:fa,version=6.1.1.1,rectime=1503919171,jobs=18.headnode
mom_service_port = 15002
mom_manager_port = 15003
headnode3
state = free
power_state = Running
np = 22
ntype = cluster
jobs = 0-1/18.headnode
status = opsys=linux,uname=Linux headnode3 2.6.32-358.el6.x86_64 #1 SMP Tue Jan 29 11:47:41 EST 2013 x86_64,sessions=3570 3574 3576 3602 3604 3628 3803 32545,nsessions=8,nusers=3,idletime=882,totmem=98996200kb,availmem=97047600kb,physmem=65949680kb,ncpus=24,loadave=16.00,gres=,netload=1740623635,state=free,varattr= ,cpuclock=OnDemand:2301MHz,macaddr=34:40:b5:e5:43:52,version=6.1.1.1,rectime=1503919176,jobs=18.headnode
mom_service_port = 15002
mom_manager_port = 15003
headnode4
state = free
power_state = Running
np = 22
ntype = cluster
status = opsys=linux,uname=Linux headnode4 2.6.32-358.el6.x86_64 #1 SMP Tue Jan 29 11:47:41 EST 2013 x86_64,sessions=3592 4057 27119,nsessions=3,nusers=1,idletime=73567,totmem=98991080kb,availmem=96941208kb,physmem=65944560kb,ncpus=24,loadave=23.99,gres=,netload=727722516,state=free,varattr= ,cpuclock=OnDemand:2200MHz,macaddr=34:40:b5:e5:49:8a,version=6.1.1.1,rectime=1503919177,jobs=
mom_service_port = 15002
mom_manager_port = 15003
headnode5
state = free
power_state = Running
np = 22
ntype = cluster
status = opsys=linux,uname=Linux headnode5 2.6.32-358.el6.x86_64 #1 SMP Tue Jan 29 11:47:41 EST 2013 x86_64,sessions=17666,nsessions=1,nusers=1,idletime=2897,totmem=74170352kb,availmem=71840968kb,physmem=49397752kb,ncpus=24,loadave=23.04,gres=,netload=5756452931,state=free,varattr= ,cpuclock=OnDemand:2200MHz,macaddr=34:40:b5:e5:4a:a2,version=6.1.1.1,rectime=1503919174,jobs=
mom_service_port = 15002
mom_manager_port = 15003
headnode6
state = free
power_state = Running
np = 22
ntype = cluster
status = opsys=linux,uname=Linux headnode6 2.6.32-358.el6.x86_64 #1 SMP Tue Jan 29 11:47:41 EST 2013 x86_64,sessions=3678 24197,nsessions=2,nusers=1,idletime=70315,totmem=98991080kb,availmem=97279540kb,physmem=65944560kb,ncpus=24,loadave=16.00,gres=,netload=711846161,state=free,varattr= ,cpuclock=OnDemand:2200MHz,macaddr=34:40:b5:e5:44:52,version=6.1.1.1,rectime=1503919171,jobs=
mom_service_port = 15002
mom_manager_port = 15003