如何从SGE获取(最近)失败的作业列表(失败= 100或exit_status = 137)?来自 qacct 帮助:
[-j [job_id|job_name|pattern]] list all [matching] jobs
我如何使用该模式?我尝试了以下,不起作用。
qacct -j failed=100
答案 0 :(得分:4)
“pattern”指的是一个简单的globbing表达式,以匹配一个作业名称,例如: qacct -j 'myjob*'
qacct
没有您正在寻找的过滤功能 - 可以过滤复杂的作业属性,但不能过滤exit_status
或failed
等基本属性。
您可以通过一点点工作从SGE会计文件中检索该信息(假设您可以访问它)。当SGE完成一项工作时,它会向$SGE_ROOT/$SGE_CELL/common/accounting
写一条简单的记录 - 这是qacct
读取的文件。您需要查看qmaster上的accounting(5)
手册页,了解GridEngine版本的详细信息,但会计文件中的作业记录应该或多或少看起来像这样:
all.q:myexechost:group:user:myjobstep16:1126971:sge:0:1369755166:1369768897:1369769771:0:0:874:796.564903:30.676336:15788.000000:0:0:0:0:17009:2:0:47987400.000000:34033048:0:0:0:9468:27604:NONE:defaultdepartment:NONE:1:0:827.241239:96.445328:39.111400:-q all.q:0.000000:NONE:237133824.000000:0:0
在此特定记录中,failed和exit_status分别是第12和第13个字段。对于快速而肮脏的“最近失败”列表,我们可以将这些与字段6(作业ID)和11(作业结束时间)一起使用,以便揭示最近100个作业中的任何失败:
$ cut -d':' -f6,11,12,13 $SGE_ROOT/$SGE_CELL/common/accounting|sort -t':' -k2|tail -100|grep ':100:137'
答案 1 :(得分:1)
我编写了一个python脚本来解析失败作业的记帐文件。您应该根据自己的需要进行编辑。
#!/usr/local/bin/python2.7
import os
from sys import *
import sys
import getopt
import datetime
#Variables
program = "parse_acct.py"
ifile = "/local/cluster/sge/default/common/accounting"
failed = 0
failedswitch = 0
subtime = 0
subtimeswitch = 0
begtime = 0
begtimeswitch = 0
endtime = 0
endtimeswitch = 0
user = 0
userswitch = 0
node = ""
nodeswitch = 0
### Read command line args
try:
myopts, args = getopt.getopt(sys.argv[1:],"i:f:n:t:u:b:e:h")
except getopt.GetoptError:
print program + " -i <input> -u <username> -n <node_name> -f"
sys.exit(2)
###############################
# o == option
# a == argument passed to the o
###############################
for o, a in myopts:
if o == '-f':
failed = a
failedswitch = 1
elif o == '-i':
ifile = a
elif o == '-u':
user = a
userswitch = 1
elif o == '-t':
subtime = a
subtimeswitch = 1
elif o == '-b':
begtime = a
begtimeswitch = 1
elif o == '-e':
endtime = a
endtimeswitch = 1
elif o == '-n':
node = a
nodeswitch = 1
elif o == '-h':
print program + " -i <input> -u <username> -n <node_name> -f"
sys.exit(0)
else:
print("Usage: %s -i <input> -u <username> -n <node_name> -f" % sys.argv[0])
sys.exit(0)
### --- Read line by line and import in to a list of lists --- ###
loi = []
f = open(ifile, "r")
for var in f:
line = var.rstrip().split(":")
if len(line) >= 10:
loi.append(line)
#print line
f.close()
### --- Parse through the list of lists and put a 0 to the beginning if it fails a test --- ###
for i in range(len(loi)):
if failedswitch == 1 and loi[i][11] >= 1: #!= failed:
loi[i][0] = [0]
elif userswitch == 1 and loi[i][3] != user:
loi[i][0] = [0]
elif nodeswitch == 1 and node != loi[i][1]:
loi[i][0] = [0]
# elif nodeswitch == 1 and node not in loi[i][1]:
# loi[i][0] = [0]
# elif nodeswitch == 1 and node not in loi[i][1]:
# loi[i][0] = [0]
# elif nodeswitch == 1 and node not in loi[i][1]:
# loi[i][0] = [0]
# elif nodeswitch == 1 and node not in loi[i][1]:
# loi[i][0] = [0]
### --- Remove all entries that have the "0" at the beginning --- ###
loidedup = [x for x in loi if x[0] != [0]
### --- Print out the files that passed all tests --- ###
for i in range(len(loidedup)):
print "=============================================================="
print "qname " + loidedup[i][0]
print "hostname " + loidedup[i][1]
print "group " + loidedup[i][2]
print "owner " + loidedup[i][3]
print "job_name " + loidedup[i][4]
print "job_number " + loidedup[i][5]
print "account " + loidedup[i][6]
print "priority " + loidedup[i][7]
print "submission_time " + datetime.datetime.fromtimestamp(int(loidedup[i][8])).strftime('%Y-%m-%d %H:%M:%S')
print "start_time " + datetime.datetime.fromtimestamp(int(loidedup[i][9])).strftime('%Y-%m-%d %H:%M:%S')
print "end_time " + datetime.datetime.fromtimestamp(int(loidedup[i][10])).strftime('%Y-%m-%d %H:%M:%S')
print "failed " + loidedup[i][11]
print "exit_status " + loidedup[i][12]
print "ru_wallclock " + loidedup[i][13]
print " ru_utime " + loidedup[i][14]
print " ru_stime " + loidedup[i][15]
print " ru_maxrss " + loidedup[i][16]
print " ru_ixrss " + loidedup[i][17]
print " ru_ismrss " + loidedup[i][18]
print " ru_idrss " + loidedup[i][19]
print " ru_isrss " + loidedup[i][20]
print " ru_minflt " + loidedup[i][21]
print " ru_majflt " + loidedup[i][22]
print " ru_nswap " + loidedup[i][23]
print " ru_inblock " + loidedup[i][24]
print " ru_oublock " + loidedup[i][25]
print " ru_msgsnd " + loidedup[i][26]
print " ru_msgrcv " + loidedup[i][27]
print " ru_nsignals " + loidedup[i][28]
print " ru_nvcsw " + loidedup[i][29]
print " ru_nivcsw " + loidedup[i][30]
print "project " + loidedup[i][31]
print "department " + loidedup[i][32]
print "granted_pe " + loidedup[i][33]
print "slots " + loidedup[i][34]
print "task_number " + loidedup[i][35]
print "cpu " + loidedup[i][36]
print "mem " + loidedup[i][37]
print "io " + loidedup[i][38]
print "category " + loidedup[i][39]
print "iow " + loidedup[i][40]
print "pe_taskid " + loidedup[i][41]
print "maxvmem " + loidedup[i][42]
print "arid " + loidedup[i][43]
print "ar_submission_time " + loidedup[i][44]
# print loidedup[i]