如何使用PTRACE获得多个线程的一致视图?

时间:2013-09-02 17:07:14

标签: c linux multithreading pthreads ptrace

在研究另一个问题(this question)时,我想到了一个可能的办法:使用ptrace,但我没能正确理解ptrace是如何与线程交互的。

假设我有一个给定的多线程主进程,并且我想附加到其中的特定线程(可能来自一个分叉的子进程)。

  1. 我可以附加到某个特定的线程吗? (各手册页在这一点上说法不一。)

  2. 如果是这样,那是否意味着单步执行只会逐步完成一个线程的指令?是否会停止所有进程的线程?

  3. 如果是这样,在我调用PTRACE_SYSCALL或PTRACE_SINGLESTEP时,所有其他线程是仍然保持停止,还是全部继续运行?有没有办法只让单独一个线程前进,同时保证其他线程保持停止?

  4. 基本上,我想通过强制所有线程停止来同步原始程序,然后通过单步执行一个跟踪线程来执行一小组单线程指令。

    到目前为止,我的个人尝试看起来有点像这样:

    pid_t target = syscall(SYS_gettid);   // get the calling thread's ID
    pid_t pid = fork();

    if (pid > 0)
    {
        waitpid(pid, NULL, 0);            // synchronise main process

        important_instruction();
    }
    else if (pid == 0)
    {
        // ptrace() takes the request first and the thread ID second.
        ptrace(PTRACE_ATTACH, target, NULL, NULL);    // does this work?

        // cancel parent's "waitpid" call, e.g. with a signal

        // single-step to execute "important_instruction()" above

        ptrace(PTRACE_DETACH, target, NULL, NULL);    // parent's threads resume?

        _Exit(0);
    }
    

    但是,我不确定(也找不到合适的参考资料)这样做在并发意义上是否正确,以及是否能保证important_instruction()只在所有其他线程都停止时才被执行。我也明白,当父进程收到来自别处的信号时,可能会出现竞态条件;我还听说应该改用PTRACE_SEIZE,但它似乎并不是在所有地方都可用。

    非常感谢任何澄清或参考!

4 个答案:

答案 0 :(得分:23)

我写了第二个测试用例。我不得不添加一个单独的答案,因为它太长而不适合包含示例输出的第一个答案。

首先,这是tracer.c

#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/prctl.h>
#include <sys/wait.h>
#include <sys/user.h>
#include <dirent.h>
#include <string.h>
#include <signal.h>
#include <errno.h>
#include <stdio.h>
#ifndef   SINGLESTEPS
#define   SINGLESTEPS 10
#endif

/* Collect the task (thread) IDs of process pid, in the manner of getline().
 *
 * *listptr and *sizeptr describe a caller-owned buffer and its capacity in
 * elements; a capacity of zero means "no buffer yet", in which case
 * *listptr is reset to NULL. The buffer is grown with realloc() as needed,
 * and both values are updated whenever it grows.
 *
 * Returns the number of TIDs stored (always positive) with errno cleared;
 * the list is then terminated by a (pid_t)0 entry.
 * Returns 0 with errno set on failure: EINVAL for bad arguments, ENOTSUP if
 * the directory name does not fit, ESRCH if the task directory cannot be
 * opened or holds no TIDs, ENOMEM if the buffer cannot grow, EIO if
 * closedir() fails, or the error readdir() reported. */
size_t get_tids(pid_t **const listptr, size_t *const sizeptr, const pid_t pid)
{
    char           path[64];
    DIR           *taskdir;
    struct dirent *entry;
    pid_t         *tids;
    size_t         capacity;
    size_t         count = 0;

    if (listptr == NULL || sizeptr == NULL || pid < (pid_t)1) {
        errno = EINVAL;
        return (size_t)0;
    }

    /* Reuse the caller's buffer only when it reports a non-zero capacity. */
    if (*sizeptr > 0) {
        tids = *listptr;
        capacity = *sizeptr;
    } else {
        *listptr = NULL;
        *sizeptr = 0;
        tids = NULL;
        capacity = 0;
    }

    if (snprintf(path, sizeof path, "/proc/%d/task/", (int)pid) >= (int)sizeof path) {
        errno = ENOTSUP;
        return (size_t)0;
    }

    taskdir = opendir(path);
    if (taskdir == NULL) {
        errno = ESRCH;
        return (size_t)0;
    }

    /* errno is cleared before every readdir() so that end-of-directory
     * (NULL with errno still 0) can be told apart from a read error. */
    while (errno = 0, (entry = readdir(taskdir)) != NULL) {
        int  number;
        char trailing;

        /* Only names that are entirely a positive decimal number count:
         * a trailing character makes sscanf() convert two items. */
        if (sscanf(entry->d_name, "%d%c", &number, &trailing) != 1 || number < 1)
            continue;

        /* Grow in chunks so realloc() is not called for every entry. */
        if (count >= capacity) {
            const size_t  bigger = (count | 127) + 128;
            pid_t *const  grown = realloc(tids, bigger * sizeof tids[0]);

            if (grown == NULL) {
                closedir(taskdir);
                errno = ENOMEM;
                return (size_t)0;
            }
            tids = grown;
            capacity = bigger;
            *listptr = tids;
            *sizeptr = capacity;
        }

        tids[count++] = (pid_t)number;
    }
    if (errno != 0) {
        /* closedir() may overwrite errno; report the readdir() failure. */
        const int readdir_errno = errno;

        closedir(taskdir);
        errno = readdir_errno;
        return (size_t)0;
    }
    if (closedir(taskdir) != 0) {
        errno = EIO;
        return (size_t)0;
    }

    if (count == 0) {
        errno = ESRCH;
        return (size_t)0;
    }

    /* One extra slot is needed for the terminating (pid_t)0. */
    if (count >= capacity) {
        const size_t  exact = count + 1;
        pid_t *const  grown = realloc(tids, exact * sizeof tids[0]);

        if (grown == NULL) {
            errno = ENOMEM;
            return (size_t)0;
        }
        tids = grown;
        capacity = exact;
        *listptr = tids;
        *sizeptr = capacity;
    }

    tids[count] = (pid_t)0;
    errno = 0;
    return count;
}


/* Block until child process pid reports a state change (waitpid() is asked
 * for stop and continue events as well), restarting the wait whenever a
 * signal interrupts it.
 * Returns 0 with errno cleared, after storing the wait status in
 * *statusptr when statusptr is non-NULL. Returns ESRCH, with errno set to
 * match, if waitpid() did not report pid. */
static int wait_process(const pid_t pid, int *const statusptr)
{
    int   wstatus;
    pid_t got;

    for (;;) {
        wstatus = 0;
        got = waitpid(pid, &wstatus, WUNTRACED | WCONTINUED);
        if (got != (pid_t)-1 || errno != EINTR)
            break;
    }

    if (got != pid) {
        errno = ESRCH;
        return ESRCH;
    }

    if (statusptr != NULL)
        *statusptr = wstatus;

    errno = 0;
    return 0;
}

/* Resume child process pid: send SIGCONT and wait for its next state
 * change, repeating for as long as the child still reports being stopped.
 * Returns 0 with errno cleared, after storing the final wait status in
 * *statusptr when statusptr is non-NULL. Returns ESRCH, with errno set to
 * match, if the signal cannot be sent or waitpid() did not report pid. */
static int continue_process(const pid_t pid, int *const statusptr)
{
    int wstatus;

    for (;;) {
        pid_t got;

        if (kill(pid, SIGCONT) == -1) {
            errno = ESRCH;
            return ESRCH;
        }

        /* Restart the wait whenever a signal interrupts it. */
        for (;;) {
            wstatus = 0;
            got = waitpid(pid, &wstatus, WUNTRACED | WCONTINUED);
            if (got != (pid_t)-1 || errno != EINTR)
                break;
        }

        if (got != pid) {
            errno = ESRCH;
            return ESRCH;
        }

        if (!WIFSTOPPED(wstatus))
            break;
    }

    if (statusptr != NULL)
        *statusptr = wstatus;

    errno = 0;
    return 0;
}

/* Print the instruction and stack pointers of traced task tid to out,
 * followed by note when note is a non-empty string.
 * The register request is repeated for as long as ptrace() fails with
 * ESRCH; any other failure makes the function return without printing.
 * Output is produced only when built for x86 (32- or 64-bit). */
void show_registers(FILE *const out, pid_t tid, const char *const note)
{
    struct user_regs_struct regs;

    while (ptrace(PTRACE_GETREGS, tid, &regs, &regs) == -1L) {
        if (errno != ESRCH)
            return;
    }

#if (defined(__x86_64__) || defined(__i386__)) && __WORDSIZE == 64
    if (note == NULL || *note == '\0')
        fprintf(out, "Task %d: RIP=0x%016lx, RSP=0x%016lx.\n", (int)tid, regs.rip, regs.rsp);
    else
        fprintf(out, "Task %d: RIP=0x%016lx, RSP=0x%016lx. %s\n", (int)tid, regs.rip, regs.rsp, note);
#elif (defined(__x86_64__) || defined(__i386__)) && __WORDSIZE == 32
    if (note == NULL || *note == '\0')
        fprintf(out, "Task %d: EIP=0x%08lx, ESP=0x%08lx.\n", (int)tid, regs.eip, regs.esp);
    else
        fprintf(out, "Task %d: EIP=0x%08lx, ESP=0x%08lx. %s\n", (int)tid, regs.eip, regs.esp, note);
#endif
}


/* Run COMMAND as a child process and, every time the child is stopped,
 * attach to all of its tasks, dump their registers, single-step the last
 * task SINGLESTEPS times (showing that the other tasks do not move), then
 * detach and let the child continue with SIGCONT.
 *
 * Returns 1 on usage or fork() errors; otherwise the last wait status
 * obtained for the child (0 if none was ever obtained). */
int main(int argc, char *argv[])
{
    pid_t *tid = 0;
    size_t tids = 0;
    size_t tids_max = 0;
    size_t t, s;
    long   r;

    pid_t child;
    /* Initialized because wait_process() leaves it untouched on failure,
     * yet it is returned at the end of main(). */
    int   status = 0;

    if (argc < 2 || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) {
        fprintf(stderr, "\n");
        fprintf(stderr, "Usage: %s [ -h | --help ]\n", argv[0]);
        fprintf(stderr, "       %s COMMAND [ ARGS ... ]\n", argv[0]);
        fprintf(stderr, "\n");
        fprintf(stderr, "This program executes COMMAND in a child process,\n");
        fprintf(stderr, "and waits for it to stop (via a SIGSTOP signal).\n");
        fprintf(stderr, "When that occurs, the register state of each thread\n");
        fprintf(stderr, "is dumped to standard output, then the child process\n");
        fprintf(stderr, "is sent a SIGCONT signal.\n");
        fprintf(stderr, "\n");
        return 1;
    }

    child = fork();
    if (child == (pid_t)-1) {
        fprintf(stderr, "fork() failed: %s.\n", strerror(errno));
        return 1;
    }

    if (!child) {
        /* Child: allow the parent to ptrace us, then become COMMAND. */
        prctl(PR_SET_DUMPABLE, (long)1);
        prctl(PR_SET_PTRACER, (long)getppid());
        fflush(stdout);
        fflush(stderr);
        execvp(argv[1], argv + 1);
        fprintf(stderr, "%s: %s.\n", argv[1], strerror(errno));
        return 127;
    }

    fprintf(stderr, "Tracer: Waiting for child (pid %d) events.\n\n", (int)child);
    fflush(stderr);

    while (1) {

        /* Wait for a child event. */
        if (wait_process(child, &status))
            break;

        /* Exited? */
        if (WIFEXITED(status) || WIFSIGNALED(status)) {
            errno = 0;
            break;
        }

        /* At this point, only stopped events are interesting. */
        if (!WIFSTOPPED(status))
            continue;

        /* Obtain task IDs. */
        tids = get_tids(&tid, &tids_max, child);
        if (!tids)
            break;

        printf("Process %d has %d tasks,", (int)child, (int)tids);
        fflush(stdout);

        /* Attach to all tasks. Errors that may be transient are retried;
         * on any other error, detach from the tasks attached so far. */
        for (t = 0; t < tids; t++) {
            do {
                r = ptrace(PTRACE_ATTACH, tid[t], (void *)0, (void *)0);
            } while (r == -1L && (errno == EBUSY || errno == EFAULT || errno == ESRCH));
            if (r == -1L) {
                const int saved_errno = errno;
                while (t-->0)
                    do {
                        r = ptrace(PTRACE_DETACH, tid[t], (void *)0, (void *)0);
                    } while (r == -1L && (errno == EBUSY || errno == EFAULT || errno == ESRCH));
                tids = 0;
                errno = saved_errno;
                break;
            }
        }
        if (!tids) {
            /* Attaching failed: let the child run again and report why. */
            const int saved_errno = errno;
            if (continue_process(child, &status))
                break;
            printf(" failed to attach (%s).\n", strerror(saved_errno));
            fflush(stdout);
            if (WIFCONTINUED(status))
                continue;
            errno = 0;
            break;
        }

        printf(" attached to all.\n\n");
        fflush(stdout);

        /* Dump the registers of each task. */
        for (t = 0; t < tids; t++)
            show_registers(stdout, tid[t], "");
        printf("\n");
        fflush(stdout);

        /* Single-step the last task only, dumping all tasks after each
         * step. The request is repeated while it fails with ESRCH. */
        for (s = 0; s < SINGLESTEPS; s++) {
            do {
                r = ptrace(PTRACE_SINGLESTEP, tid[tids-1], (void *)0, (void *)0);
            } while (r == -1L && errno == ESRCH);
            if (!r) {
                for (t = 0; t < tids - 1; t++)
                    show_registers(stdout, tid[t], "");
                show_registers(stdout, tid[tids-1], "Advanced by one step.");
                printf("\n");
                fflush(stdout);
            } else {
                fprintf(stderr, "Single-step failed: %s.\n", strerror(errno));
                fflush(stderr);
            }
        }

        /* Detach from all tasks. */
        for (t = 0; t < tids; t++)
            do {
                r = ptrace(PTRACE_DETACH, tid[t], (void *)0, (void *)0);
            } while (r == -1 && (errno == EBUSY || errno == EFAULT || errno == ESRCH));
        tids = 0;
        if (continue_process(child, &status))
            break;
        if (WIFCONTINUED(status)) {
            printf("Detached. Waiting for new stop events.\n\n");
            fflush(stdout);
            continue;
        }
        errno = 0;
        break;
    }
    if (errno)
        fprintf(stderr, "Tracer: Child lost (%s)\n", strerror(errno));
    else
    if (WIFEXITED(status))
        fprintf(stderr, "Tracer: Child exited (%d)\n", WEXITSTATUS(status));
    else
    if (WIFSIGNALED(status))
        fprintf(stderr, "Tracer: Child died from signal %d\n", WTERMSIG(status));
    else
        fprintf(stderr, "Tracer: Child vanished\n");
    fflush(stderr);

    /* The TID list was allocated by get_tids(). */
    free(tid);

    return status;
}

tracer.c执行指定的命令,等待命令接收SIGSTOP信号。 (tracer.c不会自行发送;您可以让tracee自行停止,也可以从外部发送信号。)

当命令停止后,tracer.c会用ptrace附加到它的每个线程,并对其中一个线程单步执行固定的步数(由编译时常量SINGLESTEPS决定),每一步之后都显示各线程的相关寄存器状态。

之后,它与命令分离,并向其发送SIGCONT信号,让它继续正常运行。

这是一个简单的测试程序,worker.c,我用于测试:

#include <pthread.h>
#include <signal.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>

#ifndef   THREADS
#define   THREADS  2
#endif

/* Stop flag: zero while the workers should keep running, otherwise the
 * value stored by catch_done(). */
volatile sig_atomic_t   done = 0;

/* Signal handler: records the number of the delivered signal in done. */
void catch_done(int signum)
{
    done = signum;
}

/* Make catch_done() the handler for signal signum, with no flags and an
 * empty mask of additionally blocked signals.
 * Returns 0 on success, or the errno value left by sigaction(). */
int install_done(const int signum)
{
    struct sigaction action;

    action.sa_handler = catch_done;
    action.sa_flags = 0;
    sigemptyset(&action.sa_mask);

    return sigaction(signum, &action, NULL) ? errno : 0;
}

/* Thread body: atomically increment the unsigned long that data points to
 * until done becomes non-zero, then return the counter's value cast to a
 * pointer. */
void *worker(void *data)
{
    volatile unsigned long *const shared = data;
    unsigned long                 final_value;

    for (;;) {
        if (done)
            break;
        __sync_add_and_fetch(shared, 1UL);
    }

    /* OR-ing with zero leaves the counter unchanged and yields its value. */
    final_value = __sync_or_and_fetch(shared, 0UL);
    return (void *)final_value;
}

/* Start THREADS worker threads that all increment one shared counter, let
 * the original thread do the same, and exit once a SIGHUP, SIGTERM or
 * SIGUSR1 has set the done flag and every thread has been joined.
 * Returns 0 on success, 1 if setup fails. */
int main(void)
{
    unsigned long   counter = 0UL;
    pthread_t       thread[THREADS];
    pthread_attr_t  attrs;
    size_t          i;

    if (install_done(SIGHUP) ||
        install_done(SIGTERM) ||
        install_done(SIGUSR1)) {
        fprintf(stderr, "Worker: Cannot install signal handlers: %s.\n", strerror(errno));
        return 1;
    }

    pthread_attr_init(&attrs);
    pthread_attr_setstacksize(&attrs, 65536);
    for (i = 0; i < THREADS; i++) {
        /* pthread_create() returns the error number; it does not set errno. */
        const int err = pthread_create(&thread[i], &attrs, worker, &counter);
        if (err) {
            done = 1;
            fprintf(stderr, "Worker: Cannot create thread: %s.\n", strerror(err));
            return 1;
        }
    }
    pthread_attr_destroy(&attrs);

    /* Let the original thread also do the worker dance. */
    worker(&counter);

    for (i = 0; i < THREADS; i++)
        pthread_join(thread[i], NULL);

    return 0;
}

例如,可以使用以下命令编译:
gcc -W -Wall -O3 -fomit-frame-pointer worker.c -pthread -o worker
gcc -W -Wall -O3 -fomit-frame-pointer tracer.c -o tracer

然后在单独的终端中或在后台运行,例如:
./tracer ./worker &

跟踪器会显示worker的PID:

Tracer: Waiting for child (pid 24275) events.

此时,子进程正常运行。当您向子进程发送SIGSTOP时,操作才开始。跟踪器检测到它,执行所需的跟踪,然后分离并让子进程继续正常运行:

kill -STOP 24275

Process 24275 has 3 tasks, attached to all.

Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428.
Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8.
Task 24277: RIP=0x0000000000400a5d, RSP=0x00007f399cfa6ee8.

Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428.
Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8.
Task 24277: RIP=0x0000000000400a5d, RSP=0x00007f399cfa6ee8. Advanced by one step.

Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428.
Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8.
Task 24277: RIP=0x0000000000400a63, RSP=0x00007f399cfa6ee8. Advanced by one step.

Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428.
Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8.
Task 24277: RIP=0x0000000000400a65, RSP=0x00007f399cfa6ee8. Advanced by one step.

Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428.
Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8.
Task 24277: RIP=0x0000000000400a58, RSP=0x00007f399cfa6ee8. Advanced by one step.

Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428.
Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8.
Task 24277: RIP=0x0000000000400a5d, RSP=0x00007f399cfa6ee8. Advanced by one step.

Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428.
Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8.
Task 24277: RIP=0x0000000000400a63, RSP=0x00007f399cfa6ee8. Advanced by one step.

Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428.
Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8.
Task 24277: RIP=0x0000000000400a65, RSP=0x00007f399cfa6ee8. Advanced by one step.

Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428.
Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8.
Task 24277: RIP=0x0000000000400a58, RSP=0x00007f399cfa6ee8. Advanced by one step.

Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428.
Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8.
Task 24277: RIP=0x0000000000400a5d, RSP=0x00007f399cfa6ee8. Advanced by one step.

Task 24275: RIP=0x0000000000400a5d, RSP=0x00007fff6895c428.
Task 24276: RIP=0x0000000000400a5d, RSP=0x00007f399cfb7ee8.
Task 24277: RIP=0x0000000000400a63, RSP=0x00007f399cfa6ee8. Advanced by one step.

Detached. Waiting for new stop events.

您可以根据需要重复上述操作任意多次。请注意,我选择SIGSTOP信号作为触发器,是因为这样一来,tracer.c也可以用作按需生成复杂的多线程核心转储的基础(多线程进程只需向自己发送SIGSTOP即可触发它)。

上面的例子中,各线程都在其中自旋的worker()函数的反汇编如下:

0x400a50: eb 0b                 jmp          0x400a5d
0x400a52: 66 0f 1f 44 00 00     nopw         0x0(%rax,%rax,1)
0x400a58: f0 48 83 07 01        lock addq    $0x1,(%rdi)          = fourth step
0x400a5d: 8b 05 00 00 00 00     mov          0x0(%rip),%eax       = first step
0x400a63: 85 c0                 test         %eax,%eax            = second step
0x400a65: 74 f1                 je           0x400a58             = third step
0x400a67: 48 8b 07              mov          (%rdi),%rax
0x400a6a: 48 89 c2              mov          %rax,%rdx
0x400a6d: f0 48 0f b1 07        lock cmpxchg %rax,(%rdi)
0x400a72: 75 f6                 jne          0x400a6a
0x400a74: 48 89 d0              mov          %rdx,%rax
0x400a77: c3                    retq

现在,这个测试程序只显示如何停止进程,附加到其所有线程,单步执行其中一个线程所需数量的指令,然后让所有线程继续正常;它尚未证明同样适用于让特定线程正常继续(通过PTRACE_CONT)。但是,我在下面描述的细节向我表明,同样的方法应该适用于PTRACE_CONT

我在编写上述测试程序时遇到的主要问题(或者说意外)是,必须使用如下形式的重试循环:
long r;

/* Re-issue the request for as long as it fails with one of the listed
 * error codes. */
do {
    r = ptrace(PTRACE_cmd, tid, ...);
} while (r == -1L && (errno == EBUSY || errno == EFAULT || errno == ESRCH));

这个重试循环主要是针对ESRCH的情况(其他错误码只是因为ptrace手册页中的描述才加上的)。

要知道,大多数ptrace命令只有在任务处于停止状态时才允许执行。然而,当任务仍在完成(例如)一条单步命令时,它并未处于停止状态。因此,使用上面的循环(也许再加上一毫秒的nanosleep或类似调用,以免浪费CPU)可以确保在我们尝试发出新命令之前,前一条ptrace命令已经完成(因而任务已停止)。

Kerrek SB,我相信您在测试程序中遇到的麻烦,至少有一部分是由这个问题引起的?对我个人来说,这是一个"D'oh!"的时刻:我意识到这当然是必要的,因为ptrace本质上是异步的,而不是同步的。

(这种异步性也是我上面提到的SIGCONT - PTRACE_CONT互动的原因。我相信使用上面显示的循环进行适当的处​​理,这种互动不再是问题 - 而且实际上是可以理解的。)


添加对此答案的评论:

Linux内核在task_struct结构中使用一组任务状态标志(有关定义,请参阅include/linux/sched.h)以跟踪每个任务的状态。 ptrace()中面向用户空间的一方在kernel/ptrace.c中定义。

调用PTRACE_SINGLESTEP或PTRACE_CONT时,大部分细节由kernel/ptrace.c:ptrace_continue()处理。它最后会调用wake_up_state(child, __TASK_TRACED)(即kernel/sched/core.c::try_to_wake_up(child, __TASK_TRACED, 0))。

当进程被SIGSTOP信号停止时,所有任务都会被停止,并最终处于"已停止,未被跟踪"的状态。

附加到每个任务(通过PTRACE_ATTACH或PTRACE_SEIZE,请参阅kernel/ptrace.c:ptrace_attach())修改任务状态。但是,ptrace状态位(请参阅include/linux/ptrace.h:PT_ constants)与任务可运行状态位分开(请参阅include/linux/sched.h:TASK_ constants)。

在附加到任务并向进程发送SIGCONT信号后,停止状态不会立即被修改(我相信),因为任务也被跟踪。执行PTRACE_SINGLESTEP或PTRACE_CONT会在kernel/sched/core.c::try_to_wake_up(child, __TASK_TRACED, 0)中结束,这会更新任务状态,并将任务移动到运行队列。

现在,我尚未查明的复杂部分是:下次调度该任务时,内核是如何更新任务状态的。我的测试表明,使用单步执行(这又是另一个任务状态标志)时,只有任务状态会被更新,同时单步标志被清除。PTRACE_CONT似乎没有那么可靠;我认为这是因为单步标志会"强制"任务状态发生改变。也许在继续信号的传递与状态变化之间存在"竞态条件"?

(进一步编辑:内核开发人员肯定希望调用wait(),例如参见this thread。)

换句话说,在注意到进程已停止之后(请注意,如果该进程不是子进程,并且尚未被附加,则可以使用/proc/PID/stat或/proc/PID/status来检查),我认为以下步骤是最稳健的:

pid_t  pid, p; /* Process owning the tasks */
pid_t *tid;    /* Task ID array (task IDs are plain pid_t values) */
size_t tids;   /* Tasks */
long   result;
int    status;
size_t i;

/* Attach to every task, retrying the errors treated as transient. */
for (i = 0; i < tids; i++) {
    while (1) {
        result = ptrace(PTRACE_ATTACH, tid[i], (void *)0, (void *)0);
        if (result == -1L && (errno == ESRCH || errno == EBUSY || errno == EFAULT || errno == EIO)) {
            /* To avoid burning up CPU for nothing: */
            sched_yield(); /* or nanosleep(), or usleep() */
            continue;
        }
        break;
    }
    if (result == -1L) {
        /*
         * Fatal error. First detach from tid[0..i-1], then exit.
         */
    }
}

/* Send SIGCONT to the process. */
if (kill(pid, SIGCONT)) {
    /*
     * Fatal error, see errno. Exit.
     */
}

/* Since we are attached to the process,
 * we can wait() on it. */
while (1) {
    errno = 0;
    status = 0;
    p = waitpid(pid, &status, WCONTINUED);
    if (p == (pid_t)-1) {
        if (errno == EINTR)
            continue;
        else
            break;
    } else
    if (p != pid) {
        errno = ESRCH;
        break;
    } else
    if (WIFCONTINUED(status)) {
        errno = 0;
        break;
    }
}
if (errno) {
    /*
     * Fatal error. First detach from tid[0..tids-1], then exit.
     */
}

/* Single-step each task to update the task states. */
for (i = 0; i < tids; i++) {
    while (1) {
        result = ptrace(PTRACE_SINGLESTEP, tid[i], (void *)0, (void *)0);
        if (result == -1L && errno == ESRCH) {
            /* To avoid burning up CPU for nothing: */
            sched_yield(); /* or nanosleep(), or usleep() */
            continue;
        }
        break;
    }
    if (result == -1L) {
        /*
         * Fatal error. Every task is attached at this point, so first
         * detach from tid[0..tids-1], then exit.
         */
    }
}

/* Obtain task register structures, to make sure the single-steps
 * have completed and their states have stabilized. */
for (i = 0; i < tids; i++) {
    struct user_regs_struct regs;

    while (1) {
        result = ptrace(PTRACE_GETREGS, tid[i], &regs, &regs);
        if (result == -1L && (errno == ESRCH || errno == EBUSY || errno == EFAULT || errno == EIO)) {
            /* To avoid burning up CPU for nothing: */
            sched_yield(); /* or nanosleep(), or usleep() */
            continue;
        }
        break;
    }
    if (result == -1L) {
        /*
         * Fatal error. Every task is attached at this point, so first
         * detach from tid[0..tids-1], then exit.
         */
    }
}

完成上述步骤后,所有任务都应已被附加并处于预期的状态,因此例如PTRACE_CONT无需任何额外技巧即可正常工作。

如果未来内核中的行为发生变化 - 我确实认为STOP / CONT信号和ptracing之间的相互作用可能会发生变化;至少向LKML开发人员提出有关此行为的问题是有道理的! - ,上述程序仍然可以正常运行。 (谨慎一点,通过使用PTRACE_SINGLESTEP循环几次,也可能是一个好主意。)

与PTRACE_CONT的区别在于,如果行为在将来发生变化,则初始PTRACE_CONT可能实际上继续该过程,导致其后面的ptrace()失败。使用PTRACE_SINGLESTEP,进程停止,允许进一步的ptrace()调用成功。

有问题吗?

答案 1 :(得分:6)

  

我可以附加到某个特定的线程吗?

是的,至少在当前的内核上。

  

这是否意味着单步执行仅通过一个线程的指令?它是否会停止所有进程的线程?

是的。它不会停止其他线程,只会停止被附加的那个线程。

  

有没有办法只在一个线程中前进,但保证其他线程保持停止状态?

是。将SIGSTOP发送到进程(使用waitpid(PID,,WUNTRACED)等待进程停止),然后PTRACE_ATTACH发送到进程中的每个线程。发送SIGCONT(使用waitpid(PID,,WCONTINUED)等待进程继续)。

由于所有线程在连接时都已停止,并且附加会停止线程,因此所有线程在传递SIGCONT信号后都会停止。您可以按照自己喜欢的顺序单步执行线程。


我觉得这很有趣,足以引发一个测试用例。 (好吧,实际上我怀疑没有人会接受我的话,所以我决定最好显示你可以自己复制的证据。)

我的系统的行为似乎与Linux man-pages项目中man 2 ptrace的描述一致,而Kerrisk似乎很擅长让这些手册页与内核行为保持同步。一般来说,就Linux内核而言,相比其他来源,我更偏好kernel.org上的资料。

要点:

  • 附加到进程本身(TID == PID)仅停止原始线程,而不是所有线程。

  • 附加到特定线程(使用来自/proc/PID/task/的TID)会停止该线程。 (换句话说,TID == PID的线程并不特殊。)

  • 向进程发送SIGSTOP会停止所有线程,但ptrace()仍然可以正常工作。

  • 如果您向进程发送了SIGSTOP,请不要在分离之前调用ptrace(PTRACE_CONT, TID)。PTRACE_CONT似乎会干扰SIGCONT信号。

    您可以先发送SIGSTOP,再执行PTRACE_ATTACH,然后发送SIGCONT,不会出现任何问题;线程将保持停止(由于ptrace)。换句话说,PTRACE_ATTACH和PTRACE_DETACH可以与SIGSTOP和SIGCONT很好地配合使用,我没有观察到任何副作用。

  • SIGSTOP和SIGCONT会影响整个进程,即使您尝试使用tgkill()(或pthread_kill())将信号发送给某个特定线程。

  • 要停止并继续某个特定线程,请对它使用PTRACE_ATTACH(以及PTRACE_DETACH);要停止并继续进程的所有线程,请分别向进程发送SIGSTOP和SIGCONT信号。

就个人而言,我相信这证实了我在另一个问题中提出的方法。

以下是您可以编译并运行以便自己测试的丑陋测试代码traces.c

#define  GNU_SOURCE
#include <stdlib.h>
#include <unistd.h>
#include <sys/wait.h>
#include <sys/ptrace.h>
#include <sys/syscall.h>
#include <dirent.h>
#include <pthread.h>
#include <signal.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>

#ifndef   THREADS
#define   THREADS  3
#endif

/* Send signal sig to thread tid of thread group tgid through the raw
 * tgkill system call.
 * Returns 0 on success, or -1 with errno set on failure. */
static int tgkill(int tgid, int tid, int sig)
{
    /* syscall() already follows the C library convention: on failure it
     * returns -1 and stores the error number in errno. errno is therefore
     * left untouched here; negating the return value would always turn
     * the error into 1 (EPERM). */
    if (syscall(SYS_tgkill, tgid, tid, sig) < 0)
        return -1;

    return 0;
}

/* One counter per worker thread, plus a final one incremented by the
 * child's original thread in child_main(). */
volatile unsigned long counter[THREADS + 1] = { 0UL };

/* run: set to 1 by child_main() once all threads exist; workers spin
 * until then.
 * done: zero until handle_done() stores a signal number in it. */
volatile sig_atomic_t run = 0;
volatile sig_atomic_t done = 0;

/* Signal handler: records the number of the delivered signal in done. */
void handle_done(int signum)
{
    done = signum;
}

/* Make handle_done() the handler for signal signum, with no flags and an
 * empty mask of additionally blocked signals.
 * Returns 0 on success, or the errno value left by sigaction(). */
int install_done(int signum)
{
    struct sigaction action;

    action.sa_handler = handle_done;
    action.sa_flags = 0;
    sigemptyset(&action.sa_mask);

    return sigaction(signum, &action, NULL) ? errno : 0;
}

/* Thread body: spin until run becomes non-zero, then keep incrementing the
 * unsigned long that data points to until done becomes non-zero.
 * Returns the counter's value cast to a pointer. */
void *worker(void *data)
{
    volatile unsigned long *const slot = data;

    /* Busy-wait for the start flag. */
    for (;;) {
        if (run)
            break;
    }

    for (;;) {
        if (done)
            break;
        (*slot)++;
    }

    return (void *)(*slot);
}

/* List the task (thread) IDs of process pid, as found in /proc/PID/task/.
 *
 * Returns a newly allocated array terminated by (pid_t)0, with errno
 * cleared; the caller releases it with free(). The number of TIDs (not
 * counting the terminator) is stored in *countptr when countptr is
 * non-NULL.
 * Returns NULL on failure with errno set: EINVAL if pid is less than 2,
 * ENAMETOOLONG if the directory name does not fit, the error left by
 * opendir() if the directory cannot be opened, EIO if reading or closing it
 * fails, ENOMEM if allocation fails, ENOENT if no TIDs were found.
 * *countptr is not modified on failure. */
pid_t *gettids(const pid_t pid, size_t *const countptr)
{
    char           path[128];
    DIR           *taskdir;
    struct dirent *entry;
    pid_t         *tids = NULL;
    pid_t         *resized;
    size_t         capacity = 0;
    size_t         count = 0;
    int            failure;

    if ((int)pid < 2) {
        errno = EINVAL;
        return NULL;
    }

    if (snprintf(path, sizeof path, "/proc/%d/task/", (int)pid) >= (int)sizeof path) {
        errno = ENAMETOOLONG;
        return NULL;
    }

    taskdir = opendir(path);
    if (taskdir == NULL)
        return NULL;

    /* errno is cleared before every readdir() so that end-of-directory
     * (NULL with errno still 0) can be told apart from a read error. */
    while (errno = 0, (entry = readdir(taskdir)) != NULL) {
        int  number;
        char trailing;

        /* Only names that are entirely a decimal number of at least 2
         * count: a trailing character makes sscanf() convert two items. */
        if (sscanf(entry->d_name, "%d%c", &number, &trailing) != 1 || number < 2)
            continue;

        /* Grow in chunks so realloc() is not called for every entry. */
        if (count >= capacity) {
            capacity = (count | 127) + 129;
            resized = realloc(tids, capacity * sizeof tids[0]);
            if (resized == NULL) {
                failure = ENOMEM;
                goto fail;
            }
            tids = resized;
        }

        tids[count++] = (pid_t)number;
    }
    if (errno != 0) {
        failure = EIO;
        goto fail;
    }

    {
        const int close_failed = closedir(taskdir);

        /* The directory stream is gone either way; do not close it again. */
        taskdir = NULL;
        if (close_failed) {
            failure = EIO;
            goto fail;
        }
    }

    if (count < 1) {
        failure = ENOENT;
        goto fail;
    }

    /* Trim the array to the TIDs plus the terminating (pid_t)0. */
    capacity = count + 1;
    resized = realloc(tids, capacity * sizeof tids[0]);
    if (resized == NULL) {
        failure = ENOMEM;
        goto fail;
    }
    tids = resized;
    tids[count] = (pid_t)0;

    if (countptr != NULL)
        *countptr = count;

    errno = 0;
    return tids;

fail:
    free(tids);
    if (taskdir != NULL)
        closedir(taskdir);
    errno = failure;
    return NULL;
}

/* Body of the traced child process: start THREADS worker threads, release
 * them through the run flag, tell the parent with SIGUSR1 that everything
 * is running, and count in the final counter until SIGUSR1 arrives. Then
 * join the workers and print all counters.
 * Returns 0 on success, 1 if setup fails. */
int child_main(void)
{
    pthread_t   id[THREADS];
    int         i;

    if (install_done(SIGUSR1)) {
        fprintf(stderr, "Cannot set SIGUSR1 signal handler.\n");
        return 1;
    }

    for (i = 0; i < THREADS; i++) {
        /* pthread_create() returns the error number; it does not set errno. */
        const int err = pthread_create(&id[i], NULL, worker, (void *)&counter[i]);
        if (err) {
            fprintf(stderr, "Cannot create thread %d of %d: %s.\n", i + 1, THREADS, strerror(err));
            return 1;
        }
    }

    run = 1;

    /* Tell the parent that all threads are up. */
    kill(getppid(), SIGUSR1);

    while (!done)
        counter[THREADS]++;

    for (i = 0; i < THREADS; i++)
        pthread_join(id[i], NULL);

    printf("Final counters:\n");
    for (i = 0; i < THREADS; i++)
        printf("\tThread %d: %lu\n", i + 1, counter[i]);
    printf("\tMain thread: %lu\n", counter[THREADS]);

    return 0;
}

/* Print every counter of the child process, read through traced task t. */
static void peek_counters(const pid_t t)
{
    int i;

    for (i = 0; i <= THREADS; i++) {
        long v;

        /* errno is cleared first because -1 is also a valid peeked value;
         * the read is repeated while ptrace() reports one of the errors
         * treated as transient. */
        do {
            errno = 0;
            v = ptrace(PTRACE_PEEKDATA, t, &counter[i], NULL);
        } while (v == -1L && (errno == EIO || errno == EFAULT || errno == ESRCH));
        fprintf(stderr, "\tcounter[%d] = %lu\n", i, (unsigned long)v);
    }
}

/* Attach to task t, print the child's counters twice with a quarter-second
 * pause in between, pause again, and detach.
 * Returns 0 on success. If attaching fails, reports the error, sends
 * SIGUSR1 to child so that it finishes, and returns -1. */
static int inspect_task(const pid_t child, const pid_t t)
{
    if (ptrace(PTRACE_ATTACH, t, (void *)0L, (void *)0L)) {
        fprintf(stderr, "Cannot attach to TID %d: %s.\n", (int)t, strerror(errno));
        kill(child, SIGUSR1);
        return -1;
    }

    fprintf(stderr, "Attached to TID %d.\n\n", (int)t);

    fprintf(stderr, "Peeking the counters in the child process:\n");
    peek_counters(t);
    fprintf(stderr, "Waiting a short moment ... ");
    fflush(stderr);

    usleep(250000);

    fprintf(stderr, "and another peek:\n");
    peek_counters(t);
    fprintf(stderr, "\n");
    fflush(stderr);

    usleep(250000);

    ptrace(PTRACE_DETACH, t, (void *)0L, (void *)0L);
    return 0;
}

/* Fork a multithreaded child, then show which of its counters stop
 * advancing (a) while attached to each task in turn, and (b) while attached
 * to one task after sending SIGSTOP/SIGCONT to the whole process and then
 * to a single task with tgkill().
 * Returns 0 on success, 1 on failure. */
int main(void)
{
    pid_t   *tid = NULL;
    size_t   tids = 0;
    int      k;
    pid_t    child, p;

    if (install_done(SIGUSR1)) {
        fprintf(stderr, "Cannot set SIGUSR1 signal handler.\n");
        return 1;
    }

    child = fork();
    if (!child)
        return child_main();

    if (child == (pid_t)-1) {
        fprintf(stderr, "Cannot fork.\n");
        return 1;
    }

    /* child_main() sends SIGUSR1 once all of its threads are running. */
    while (!done)
        usleep(1000);

    tid = gettids(child, &tids);
    if (!tid) {
        fprintf(stderr, "gettids(): %s.\n", strerror(errno));
        kill(child, SIGUSR1);
        return 1;
    }

    fprintf(stderr, "Child process %d has %d tasks.\n", (int)child, (int)tids);
    fflush(stderr);

    /* Part 1: attach to each task in turn. */
    for (k = 0; k < (int)tids; k++) {
        if (inspect_task(child, tid[k])) {
            free(tid);
            return 1;
        }
    }

    /* Part 2: send a stop/continue signal, then inspect through one task. */
    for (k = 0; k < 4; k++) {
        const pid_t t = tid[tids / 2];

        switch (k) {
        case 0:
            fprintf(stderr, "Sending SIGSTOP to child process ... ");
            fflush(stderr);
            kill(child, SIGSTOP);
            break;
        case 1:
            fprintf(stderr, "Sending SIGCONT to child process ... ");
            fflush(stderr);
            kill(child, SIGCONT);
            break;
        case 2:
            fprintf(stderr, "Sending SIGSTOP to TID %d ... ", (int)tid[0]);
            fflush(stderr);
            tgkill(child, tid[0], SIGSTOP);
            break;
        case 3:
            fprintf(stderr, "Sending SIGCONT to TID %d ... ", (int)tid[0]);
            fflush(stderr);
            tgkill(child, tid[0], SIGCONT);
            break;
        default:
            break;
        }
        usleep(250000);
        fprintf(stderr, "done.\n");
        fflush(stderr);

        if (inspect_task(child, t)) {
            free(tid);
            return 1;
        }
    }

    /* Ask the child to finish, then reap it. */
    kill(child, SIGUSR1);

    do {
        p = waitpid(child, NULL, 0);
        if (p == -1 && errno != EINTR)
            break;
    } while (p != child);

    free(tid);
    return 0;
}

例如,可以使用以下命令编译并运行:
gcc -DTHREADS=3 -W -Wall -O3 traces.c -pthread -o traces
./traces

输出是子进程各计数器的转储(每个计数器由一个单独的线程递增,其中原始线程使用最后一个计数器)。请比较短暂等待前后的计数器值。例如:

Child process 18514 has 4 tasks.
Attached to TID 18514.

Peeking the counters in the child process:
    counter[0] = 0
    counter[1] = 0
    counter[2] = 0
    counter[3] = 0
Waiting a short moment ... and another peek:
    counter[0] = 18771865
    counter[1] = 6435067
    counter[2] = 54247679
    counter[3] = 0

如上所示,只有使用最终计数器的初始线程(其TID == PID)才会停止。对于其他三个线程也是如此,它们按顺序使用前三个计数器:

Attached to TID 18515.

Peeking the counters in the child process:
    counter[0] = 25385151
    counter[1] = 13459822
    counter[2] = 103763861
    counter[3] = 560872
Waiting a short moment ... and another peek:
    counter[0] = 25385151
    counter[1] = 69116275
    counter[2] = 120500164
    counter[3] = 9027691

Attached to TID 18516.

Peeking the counters in the child process:
    counter[0] = 25397582
    counter[1] = 105905400
    counter[2] = 155895025
    counter[3] = 17306682
Waiting a short moment ... and another peek:
    counter[0] = 32358651
    counter[1] = 105905400
    counter[2] = 199601078
    counter[3] = 25023231

Attached to TID 18517.

Peeking the counters in the child process:
    counter[0] = 40600813
    counter[1] = 111675002
    counter[2] = 235428637
    counter[3] = 32298929
Waiting a short moment ... and another peek:
    counter[0] = 48727731
    counter[1] = 143870702
    counter[2] = 235428637
    counter[3] = 39966259

接下来的两个案例会检查SIGCONT / SIGSTOP wrt。整个过程:

Sending SIGSTOP to child process ... done.
Attached to TID 18516.

Peeking the counters in the child process:
    counter[0] = 56887263
    counter[1] = 170646440
    counter[2] = 235452621
    counter[3] = 48077803
Waiting a short moment ... and another peek:
    counter[0] = 56887263
    counter[1] = 170646440
    counter[2] = 235452621
    counter[3] = 48077803

Sending SIGCONT to child process ... done.
Attached to TID 18516.

Peeking the counters in the child process:
    counter[0] = 64536344
    counter[1] = 182359343
    counter[2] = 253660731
    counter[3] = 56422231
Waiting a short moment ... and another peek:
    counter[0] = 72029244
    counter[1] = 182359343
    counter[2] = 288014365
    counter[3] = 63797618

如您所见,发送SIGSTOP会停止所有线程,但不会妨碍ptrace()。同样,在SIGCONT之后,各线程会继续正常运行。

最后两个案例检查:在附加到另一个线程的同时,使用tgkill()将SIGSTOP / SIGCONT发送给某个特定线程(与第一个计数器对应的线程)会产生什么效果:

Sending SIGSTOP to TID 18514 ... done.
Attached to TID 18516.

Peeking the counters in the child process:
    counter[0] = 77012930
    counter[1] = 183059526
    counter[2] = 344043770
    counter[3] = 71120227
Waiting a short moment ... and another peek:
    counter[0] = 77012930
    counter[1] = 183059526
    counter[2] = 344043770
    counter[3] = 71120227

Sending SIGCONT to TID 18514 ... done.
Attached to TID 18516.

Peeking the counters in the child process:
    counter[0] = 88082419
    counter[1] = 194059048
    counter[2] = 359342314
    counter[3] = 84887463
Waiting a short moment ... and another peek:
    counter[0] = 100420161
    counter[1] = 194059048
    counter[2] = 392540525
    counter[3] = 111770366

不幸的是(但也正如预期的那样),如您所见,这种处置(停止/运行)是针对整个进程的,而不是针对特定线程的。这意味着,要停止某些特定线程而让其他线程正常运行,您需要分别对每个希望停止的线程执行PTRACE_ATTACH。

为证明上述所有陈述,您可能需要添加测试用例;我最终得到了相当多的代码副本,都进行了轻微编辑,以测试所有代码,我不确定我选择了最完整的代码。如果您发现遗漏,我很乐意扩展测试计划。

有问题吗?

答案 2 :(得分:2)

过程中的每个线程都是单独跟踪的(并且每个线程都可以通过不同的跟踪过程进行跟踪,或者不进行跟踪)。当您调用ptrace attach时,您始终只附加到单个线程。只有那个线程会被停止 - 其他线程将继续按原样运行。

ptrace()手册页的最新版本非常清楚:

  

附加以及后续的命令都是针对单个线程的:在多线程进程中,每个线程都可以单独附加到一个(可能各不相同的)跟踪器上,也可以不被附加,因而不被调试。因此,"tracee"始终表示"(一个)线程",而绝不是"一个(可能是多线程的)进程"。ptrace命令总是通过如下形式的调用发送给某个特定的tracee:
ptrace(PTRACE_foo, pid, ...)
     

其中pid是相应Linux线程的线程ID。

     

(请注意,在此页面中,“多线程进程”表示线程   由使用clone(2)创建的线程组成的组   CLONE_THREAD标志。)

单步执行仅影响您指向它的线程。如果其他线程正在运行,则它们继续运行,如果它们处于跟踪停止状态,则它们将停留在跟踪停止状态。 (这意味着如果您单步执行的线程尝试获取由另一个非运行线程持有的互斥锁或类似同步资源,则它将无法获取该互斥锁。)

如果要在单步执行一个线程时停止进程的所有线程,则需要附加到所有线程。还有一个复杂的问题是,如果在您尝试连接过程时进程正在运行,则可以在枚举时创建新线程。

答案 3 :(得分:-2)

  

是否会停止所有进程的线程?

是 它跟踪进程,此进程的所有线程都停止。 想象一下,你怎么能在IDE中看到不同的线程。

来自手册:

  

ptrace()系统调用提供了一种方法,通过该方法,一个进程(“跟踪器”)可以观察并控制另一个进程的执行(“tracee”)

要附加的示例代码:

printf("Attaching to process %d\n",Tpid);
/* Keep the ptrace() result so that the report below prints it. */
if ((res = ptrace(PTRACE_ATTACH, Tpid, 0, 0)) != 0) {
    printf("Attach result %d\n",res);
}

所以是的,你附加到的是一个线程;是的,它会停止该进程的所有线程。

if ((res = ptrace(PTRACE_SINGLESTEP, Tpid, 0, signo)) < 0) {
perror("Ptrace singlestep error");
exit(1);
}
res = wait(&stat);

也许可以在这里看到:http://www.secretmango.com/jimb/Whitepapers/ptrace/ptrace.html