ptrace(PTRACE_CONT)无法恢复刚刚连接的进程

时间:2013-05-03 13:41:53

标签: resume ptrace

我正在编写一个程序,它需要附加到其他进程(这些进程可能是由该程序的上一个实例创建的),并监视它们何时终止。

如果我在我创建的进程的生命周期内保持程序运行,一切正常;但如果我启动一个进程,杀死我的程序,然后重新启动它,之前创建的进程将永远保持在STOPPED状态(似乎ptrace(PTRACE_CONT,...)无法恢复它)。代码片段附在下面:

/*
 * Set to 1 by sighandler() to request shutdown.
 * volatile sig_atomic_t is the only object type a signal handler may
 * portably write; a plain int gives no guarantee that the store is seen
 * by the code the handler interrupted.
 */
static volatile sig_atomic_t exitFlag = 0;

/* Signal handler: records that a stop was requested; the signal number is unused. */
static void sighandler (int/* signum */)
{ exitFlag = 1; }

int JsfNode::run (void)
{
    /* load jobs */
    {
        vector <JobInfo2> jobs;
        loadStruct <vector <JobInfo2> > (
                jobFile (), jobs);
        for (unsigned i=0 ; i<jobs.size () ; i++) {
            JobInfo2& info = jobs [i];
            string name = info.parm.name;
            if (m_jobs.find (name) == m_jobs.end ()) {
                Job2& job = m_jobs [name];
                job.info = info;
                /* trace it so that we can wait() it */
                switch (info.state) {
                case js2Active:
                case js2Canceling:
                case js2Suspending:
                if (ptrace (PTRACE_ATTACH, info.pid, 0, 0))
                        jdebug ("PTRACE_ATTACH failed for: %d (%s)\n", info.pid,
                                        strerror (errno));
                default: break;
                }
            }
        }
    }

    /* run until we are signaled to stop */
    signal (SIGINT, sighandler);
    while (!exitFlag)
        sleep (1);

    /* save jobs */
    {
        vector <JobInfo2> jobs;
        for (map <string, Job2>::iterator it=m_jobs.begin () ;
                        it!=m_jobs.end () ; it++) {
            JobInfo2& info = it->second.info;
            ptrace (PTRACE_DETACH, info.pid, NULL, NULL);
            jobs.push_back (info);
        }
        saveStruct <vector <JobInfo2> > (
                jobFile (), jobs);
    }

    return 0;
}

/*
 * Launch the job described by job.info.parm as a traced child process.
 *
 * The script and the MPI machine file are written into m_workdir, two
 * pipes are created when I/O redirection is requested (parm.redio > 0),
 * and the script is executed in a forked child that has called
 * PTRACE_TRACEME and switched to the run-as user.
 *
 * On success the parent stores the pipe ends in job.redPipe, starts the
 * nurse thread, records the child pid and moves the job to js2Active.
 * If a pipe or the fork cannot be created, the temporary files are
 * removed and the function returns without changing the job state, so
 * the job can be retried later.
 */
void JsfNode::startJob (Job2 & job)
{
    JobParm2 parm = job.info.parm;
    jdebug ("starting \"%s\"..\n", parm.name.c_str());

    /* resolve the run-as user; run as root if the specified user is invalid */
    uid_t uid = 0;
    gid_t gid = 0;
    struct passwd * pwe = getpwnam (parm.user.c_str());
    if (pwe != NULL) {
        uid = pwe->pw_uid;
        gid = pwe->pw_gid;
    }

    /* prepare the script file */
    /* NOTE(review): mode 0777 leaves the executed script world-writable;
     * kept as-is because other consumers of the file are not visible here. */
    string scriptfile = m_workdir+"/"+parm.name+"_scriptfile";
    ofstream ofscriptfile (scriptfile.c_str());
    ofscriptfile << parm.script;
    ofscriptfile.close();
    chown (scriptfile.c_str(), uid, gid);   /* group is the user's gid, not the uid */
    chmod (scriptfile.c_str(), S_IRWXU|S_IRWXG|S_IRWXO);

    /* prepare the MPIMACHINEFILE: one "<node>:<tasks>" line per resource */
    string machinefile = m_workdir+"/"+parm.name+"_machinefile";
    ofstream ofmachinefile (machinefile.c_str());
    for (Resource::iterator it=parm.res.begin () ; it!=parm.res.end () ; ++it)
        ofmachinefile << *it << ':' << parm.taskPerNode << '\n';
    ofmachinefile.close ();
    chown (machinefile.c_str(), uid, gid);
    chmod (machinefile.c_str(), S_IRWXU|S_IRWXG|S_IRWXO);

    /* prepare the redirection channels; one predicate decides everywhere
     * whether the pipes exist, so the child never dup2()s an unopened fd */
    const bool redirect = (parm.redio > 0);
    int ipipe [2] = {-1,-1};
    int opipe [2] = {-1,-1};
    if (redirect) {
        if (pipe (ipipe) == -1) {
            unlink (machinefile.c_str());
            unlink (scriptfile.c_str());
            return; /* do not fail the job, just try later */
        }
        if (pipe (opipe) == -1) {
            close (ipipe [0]);
            close (ipipe [1]);
            unlink (machinefile.c_str());
            unlink (scriptfile.c_str());
            return; /* do not fail the job, just try later */
        }
    }

    /* OK, fork it! -----------------> */

    const pid_t pid = fork ();
    if (pid == -1) {
        if (redirect) {
            close (opipe [0]);
            close (opipe [1]);
            close (ipipe [0]);
            close (ipipe [1]);
        }
        unlink (machinefile.c_str());
        unlink (scriptfile.c_str());
        return; /* do not fail the job, just try later */
    }

    if (pid == 0) {
        /*
         * Child. Failure paths use _exit(): exit() would run the parent's
         * atexit handlers and flush stdio buffers duplicated by fork().
         */

        /* enable parent-tracing */
        ptrace (PTRACE_TRACEME, 0, NULL, NULL);

        /* drop the root privilege: group first, because the gid can no
         * longer be changed once the uid has been given up */
        const int gidRc = setgid (gid);
        const int uidRc = setuid (uid);
        /* never run a non-root job while still holding root */
        if ((gidRc != 0 || uidRc != 0) && uid != 0 && geteuid () == 0)
            _exit (EPERM);

        /* redirect stdin/stdout */
        if (redirect) {
            if (dup2 (ipipe [0],0)<0 ||
                dup2 (opipe [1],1)<0)
                _exit (errno);
            close (ipipe [0]);
            close (ipipe [1]);
            close (opipe [0]);
            close (opipe [1]);
        }

        /* prepare the arguments/environments */
        char * arg[] = {
                strdup (scriptfile.c_str()),
                strdup (parm.args.c_str()),
                NULL    /* the required null entry */
        };
        setenv ("MPIMACHINEFILE", machinefile.c_str(), 1);
        setenv ("DISPLAY", parm.headNode.c_str(), 1);
        setenv ("JSF_JOBID", parm.name.c_str(), 1);

        /* execute it! ------> */
        execv (scriptfile.c_str(), arg);
        _exit (errno);  /* only reached when execv() failed */
    }

    /* parent: keep our ends of the pipes, close the child's ends */
    if (redirect) {
        close (ipipe [0]);
        close (opipe [1]);
        job.redPipe [0] = opipe [0];    /* read the job's stdout */
        job.redPipe [1] = ipipe [1];    /* write to the job's stdin */
    }

    /* record the pid before the nurse thread starts, in case the thread
     * reads it (NurseData is not visible here) */
    job.info.pid = pid;

    /* start the nurse thread */
    NurseData * nd = new NurseData (this, job);
    if (pthread_create (&job.nurseId, NULL, ::_jobnurse, nd) == 0)
        job.nurseActive = true;
    else delete nd;

    setJobState (job, js2Active);
}

/*
 * Poll the job process once (non-blocking) and update the job state.
 *
 *  - waitpid() fails with ECHILD: the process has disappeared; the job is
 *    marked js2Finished with exit code 0.
 *  - normal exit: js2Finished with the process's exit status.
 *  - killed by a signal: js2Canceled.
 *  - stopped under ptrace: the process is resumed with PTRACE_CONT.
 *
 * ptrace requests are only accepted from the thread that is the tracer
 * (the one that attached to, or forked, the job), so this function must
 * run on that thread; from any other thread PTRACE_CONT fails and the
 * job stays stopped.
 */
void JsfNode::monitorJob (Job2 & job)
{
    int status;
    pid_t pid = waitpid (job.info.pid, &status, WNOHANG);
    if (pid < 0) {
        if (errno == ECHILD) {
            /* the job process has disappeared.. */
            job.exitCode = 0;
            setJobState (job, js2Finished);
            return;
        }
    } else if (pid == job.info.pid) {
        if (WIFEXITED(status)) {
            job.exitCode = WEXITSTATUS(status);
            setJobState (job, js2Finished);
            return;
        } else if (WIFSIGNALED(status)) {
            setJobState (job, js2Canceled);
            return;
        } else if (WIFSTOPPED(status)) {
            /*
             * Resume the job. A zero data argument makes PTRACE_CONT
             * discard the signal that caused the stop, so ordinary
             * signals (SIGTERM, SIGHUP, ...) are passed on to the job.
             * SIGTRAP (exec under PTRACE_TRACEME) and SIGSTOP (sent by
             * PTRACE_ATTACH) come from tracing itself and stay suppressed.
             */
            const int stopSig = WSTOPSIG(status);
            const long forward =
                (stopSig == SIGTRAP || stopSig == SIGSTOP) ? 0 : stopSig;
            if (ptrace (PTRACE_CONT, pid, NULL,
                        reinterpret_cast <void *> (forward)))
                jdebug ("PTRACE_CONT failed for: %d (%s)\n", pid, strerror(errno));
        }
    }

    /* ... */
}

1 个答案:

答案 0 :(得分:1)

是的,问题是由多线程引起的。如果monitorJob()在单独的线程中运行,ptrace(PTRACE_CONT)就会失败。把它移到主线程(即调用了ptrace(PTRACE_ATTACH)的那个线程)之后,一切正常。