我正在编写一个程序,它需要附加到其他进程(这些进程可能是由本程序的上一个实例创建的),并监视它们何时终止。
如果我的程序在它所创建的进程的整个生命周期内一直保持运行,一切正常;但如果我启动一个进程,杀死我的程序,然后重新启动它,之前创建的进程就会永远停留在STOPPED状态(似乎ptrace(PTRACE_CONT,...)无法让它恢复运行)。代码片段如下:
/*
 * Shutdown request flag.  It is written from the signal handler below and
 * polled by the main loop, so it has to be a volatile sig_atomic_t: that is
 * the only object type a handler may portably store to, and `volatile`
 * keeps the polling loop from caching the value.
 */
static volatile sig_atomic_t exitFlag = 0;

/*
 * Signal handler: records that termination was requested.
 * The signal number is not needed, hence the unnamed parameter.
 */
static void sighandler (int /* signum */)
{
    exitFlag = 1;
}
int JsfNode::run (void)
{
/* load jobs */
{
vector <JobInfo2> jobs;
loadStruct <vector <JobInfo2> > (
jobFile (), jobs);
for (unsigned i=0 ; i<jobs.size () ; i++) {
JobInfo2& info = jobs [i];
string name = info.parm.name;
if (m_jobs.find (name) == m_jobs.end ()) {
Job2& job = m_jobs [name];
job.info = info;
/* trace it so that we can wait() it */
switch (info.state) {
case js2Active:
case js2Canceling:
case js2Suspending:
if (ptrace (PTRACE_ATTACH, info.pid, 0, 0))
jdebug ("PTRACE_ATTACH failed for: %d (%s)\n", info.pid,
strerror (errno));
default: break;
}
}
}
}
/* run until we are signaled to stop */
signal (SIGINT, sighandler);
while (!exitFlag)
sleep (1);
/* save jobs */
{
vector <JobInfo2> jobs;
for (map <string, Job2>::iterator it=m_jobs.begin () ;
it!=m_jobs.end () ; it++) {
JobInfo2& info = it->second.info;
ptrace (PTRACE_DETACH, info.pid, NULL, NULL);
jobs.push_back (info);
}
saveStruct <vector <JobInfo2> > (
jobFile (), jobs);
}
return 0;
}
/*
 * Launch the job described by job.info.parm as a traced child process.
 *
 * Steps:
 *   - resolve the run-as user (an unknown user falls back to root, 0:0);
 *   - write the job script and the MPI machine file into m_workdir;
 *   - when redirection is requested (parm.redio > 0), create the
 *     stdin/stdout pipes;
 *   - fork; the child enables tracing, drops privileges, wires up the
 *     pipes, sets the job environment and exec()s the script;
 *   - the parent keeps its pipe ends in job.redPipe, records the pid,
 *     starts the nurse thread and marks the job js2Active.
 *
 * If a pipe or the fork cannot be created, the two generated files are
 * removed and the function returns without changing the job state, so the
 * caller can simply try again later.
 */
void JsfNode::startJob (Job2 & job)
{
    JobParm2 parm = job.info.parm;
    jdebug ("starting \"%s\"..\n", parm.name.c_str());

    /* get the uid/gid of the run-as user */
    uid_t uid = 0; /* run as root if the specified user is invalid */
    gid_t gid = 0;
    struct passwd * pwe = getpwnam (parm.user.c_str());
    if (pwe != NULL) {
        uid = pwe->pw_uid;
        gid = pwe->pw_gid; /* the group is pw_gid, not a copy of the uid */
    }

    /* prepare the script file */
    /* NOTE(review): the mode below is 0777, i.e. world-writable -- confirm
     * that this is intended for a file that is about to be executed. */
    string scriptfile = m_workdir+"/"+parm.name+"_scriptfile";
    ofstream ofscriptfile (scriptfile.c_str());
    ofscriptfile << parm.script;
    ofscriptfile.close();
    chown (scriptfile.c_str(), uid, gid);
    chmod (scriptfile.c_str(), S_IRWXU|S_IRWXG|S_IRWXO);

    /* prepare the MPIMACHINEFILE: one "<node>:<tasks>" line per resource */
    string machinefile = m_workdir+"/"+parm.name+"_machinefile";
    ofstream ofmachinefile (machinefile.c_str());
    for (Resource::iterator it=parm.res.begin () ; it!=parm.res.end () ; ++it)
        ofmachinefile << *it << ':' << parm.taskPerNode << '\n';
    ofmachinefile.close ();
    chown (machinefile.c_str(), uid, gid);
    chmod (machinefile.c_str(), S_IRWXU|S_IRWXG|S_IRWXO);

    /* prepare the redirection channels */
    /* One flag decides about redirection everywhere below, so the pipes
     * are never used unless they were actually created. */
    const bool redirect = parm.redio > 0;
    int ipipe [2] = {-1,-1};
    int opipe [2] = {-1,-1};
    bool ready = true;
    if (redirect) {
        if (pipe (ipipe) == -1) {
            ready = false;
        } else if (pipe (opipe) == -1) {
            close (ipipe [0]);
            close (ipipe [1]);
            ready = false;
        }
    }

    /* OK, fork it! -----------------> */
    pid_t pid = -1;
    if (ready && (pid = fork ()) == -1) {
        if (redirect) {
            close (opipe [0]);
            close (opipe [1]);
            close (ipipe [0]);
            close (ipipe [1]);
        }
        ready = false;
    }
    if (!ready) {
        unlink (machinefile.c_str());
        unlink (scriptfile.c_str());
        return; /* do not fail the job, just try later */
    }

    if (pid == 0) {
        /* Child.  Every failure path leaves through _exit(): exit() would
         * run the atexit handlers and flush the stdio buffers that were
         * inherited from the parent. */

        /* enable parent-tracing */
        ptrace (PTRACE_TRACEME, 0, NULL, NULL);

        /* drop the root privilege: group first, because it can no longer
         * be changed once the uid is gone; never continue if this fails */
        /* NOTE(review): supplementary groups are left untouched here. */
        if (setgid (gid) != 0 || setuid (uid) != 0)
            _exit (errno);

        /* redirect stdin/stdout */
        if (redirect) {
            if (dup2 (ipipe [0],0)<0 ||
                dup2 (opipe [1],1)<0)
                _exit (errno);
            close (ipipe [0]);
            close (ipipe [1]);
            close (opipe [0]);
            close (opipe [1]);
        }

        /* prepare the arguments/environments */
        char * arg[] = {
            strdup (scriptfile.c_str()),
            strdup (parm.args.c_str()),
            NULL /* the required null entry */
        };
        setenv ("MPIMACHINEFILE", machinefile.c_str(), 1);
        setenv ("DISPLAY", parm.headNode.c_str(), 1);
        setenv ("JSF_JOBID", parm.name.c_str(), 1);

        /* execute it! ------> */
        execv (scriptfile.c_str(), arg);
        _exit (errno); /* only reached when execv() failed */
    }

    /* Parent: keep our ends of the pipes, close the child's ends */
    if (redirect) {
        close (ipipe [0]);
        close (opipe [1]);
        job.redPipe [0] = opipe [0];
        job.redPipe [1] = ipipe [1];
    }

    /* record the pid before the nurse thread exists, so the thread can
     * never observe a job without its process id */
    job.info.pid = pid;

    /* start the nurse thread */
    NurseData * nd = new NurseData (this, job);
    if (pthread_create (&job.nurseId, NULL, ::_jobnurse, nd) == 0)
        job.nurseActive = true;
    else
        delete nd; /* the thread never started, so nobody else owns nd */

    setJobState (job, js2Active);
    return;
}
/*
 * Poll the process of one job without blocking (WNOHANG) and update the
 * job accordingly:
 *   - waitpid() fails with ECHILD -> exit code 0, state js2Finished;
 *   - process exited               -> exit code recorded, js2Finished;
 *   - process killed by a signal   -> js2Canceled;
 *   - process stopped              -> resumed with PTRACE_CONT.
 * Each of the first three cases returns immediately.
 *
 * NOTE(review): per ptrace(2), ptrace requests for a tracee are expected
 * to come from the thread that is its tracer -- confirm that this function
 * runs on that thread, otherwise PTRACE_CONT fails and the job stays
 * stopped.
 */
void JsfNode::monitorJob (Job2 & job)
{
    int wstatus;
    const pid_t reaped = waitpid (job.info.pid, &wstatus, WNOHANG);

    if (reaped < 0 && errno == ECHILD) {
        /* the job process has disappeared.. */
        job.exitCode = 0;
        setJobState (job, js2Finished);
        return;
    }

    if (reaped >= 0 && reaped == job.info.pid) {
        if (WIFEXITED(wstatus)) {
            job.exitCode = WEXITSTATUS(wstatus);
            setJobState (job, js2Finished);
            return;
        }
        if (WIFSIGNALED(wstatus)) {
            setJobState (job, js2Canceled);
            return;
        }
        /* the data argument is NULL, so no signal is passed on when the
         * process is resumed */
        if (WIFSTOPPED(wstatus)
            && ptrace (PTRACE_CONT, reaped, NULL, NULL) != 0)
            jdebug ("PTRACE_CONT failed for: %d (%s)\n", reaped, strerror(errno));
    }
    /* ... */
}
答案 0 :(得分:1)
是的,问题是由多线程引起的。如果monitorJob()在一个单独的线程中运行,ptrace(PTRACE_CONT)就会失败,因为ptrace请求必须由执行附加操作的那个线程(即跟踪者线程)发出。将monitorJob()移到主线程(即调用了ptrace(PTRACE_ATTACH)的那个线程)之后,一切运行正常。