Question

我有一个应用程序，我很确定它因为忘记对某些线程调用 pthread_join 而“泄漏”了线程。因此，这些线程的栈不会被回收，随着时间的推移，该进程会消耗大量的虚拟地址空间。

有没有办法在软件中找到创建这些线程的位置，或者至少在退出之前找出这些线程正在做什么？

我的应用程序很大，并创建了很多正确连接的线程。所以捕获所有pthread操作是不切实际的。我需要更精确的东西。

我写出了一个独立的最小复现程序，它重现了我认为正在发生的情况。

#include <pthread.h>
#include <unistd.h>

// Thread entry point for the reproduction: does no work and exits at once.
// The argument is ignored. Always returns NULL; falling off the end of a
// function with a non-void return type is undefined behaviour in C++.
void* worker (void* unusued)
{
    // Do nothing
    return NULL;
}

// Reproduction driver: starts 2000 threads running worker() and never joins
// or detaches any of them, then idles so the process can be inspected.
int main()
{
    const int kThreadCount = 2000;
    pthread_t handle;   // overwritten on every iteration; no handle is kept

    int started = 0;
    while (started < kThreadCount)
    {
        pthread_create(&handle, NULL, &worker, NULL);
        ++started;
    }

    // Stay alive long enough to look at the process from the outside.
    sleep(1000);
    return 0;
}

运行后，'top' 显示该进程消耗了 16GB 的虚拟地址空间，

但是 'ps' 和 'gdb' 都只显示一个线程。

我拥有应用程序中所有代码的源码，所以我可以添加任何所需的代码或其他工具。

换句话说，上述应用程序的运行实例如何才能发现自己已经“丢失”了 2000 个线程，以及如何查出这些线程执行的是 worker() 函数？

Answer 1

好问题。一种可行的办法是使用 libpthread 的插入库（interposer）。请参阅这篇文章。

让我们把你的测试程序改得更有意思一点：让它只“泄漏”少数几个线程，而对大部分线程都调用 pthread_join：

#include <pthread.h>
#include <stdint.h>
#include <unistd.h>

// Thread entry point for the test program: does no work and exits at once.
// The argument is ignored. Always returns NULL; falling off the end of a
// function with a non-void return type is undefined behaviour in C++.
void* worker(void* unusued)
{
  // Do nothing
  return NULL;
}

// Test driver: starts ten threads running worker(), passing the loop index
// as the thread argument, and joins all of them except numbers 4 and 7.
// Returns 1 if a thread cannot be created, 0 otherwise.
int main()
{
  pthread_t thread_id;

  for (int i = 0; i < 10; i++) {
    // Widen the index to intptr_t before converting it to a pointer; casting
    // an int directly to void* changes size on LP64 targets and triggers
    // -Wint-to-pointer-cast. The pointer value passed is unchanged.
    void* const arg = reinterpret_cast<void*>(static_cast<intptr_t>(i));

    // On failure thread_id still holds the previous (possibly already
    // joined) handle, so it must not be passed to pthread_join.
    if (pthread_create(&thread_id, NULL, &worker, arg) != 0) return 1;

    // Threads 4 and 7 are deliberately left unjoined.
    if (i != 4 && i != 7) pthread_join(thread_id, NULL);
  }
  sleep(1000);
  return 0;
}

现在让我们为 pthread_create 和 pthread_join 构建一个插入库（interposer）：

#include <assert.h>
#include <dlfcn.h>
#include <pthread.h>
#include <map>

// Guards every access to thr_map. Initialised with the POSIX static
// initialiser: an all-zero pthread_mutex_t is not guaranteed to be a valid
// mutex.
static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
// What is remembered per thread: (start routine, argument).
typedef std::pair<void *, void *> elem_t;
// Thread id -> the routine/argument that thread was created with.
typedef std::map<pthread_t, elem_t> map_t;
// Threads that have been created but not yet joined.
static map_t thr_map;

// Interposed pthread_create.
//
// Forwards to the next pthread_create in symbol-resolution order (looked up
// once via dlsym(RTLD_NEXT, ...)) and, when that call succeeds, records the
// new thread's start routine and argument in thr_map under its thread id.
//
// Parameters and return value are those of the real pthread_create; its
// return code is passed through unchanged.
extern "C"
int pthread_create(pthread_t *tid, const pthread_attr_t *attr,
                   void *(*start_routine)(void*), void *arg)
{
  using create_fn = decltype(pthread_create);
  static create_fn *const real
    = reinterpret_cast<create_fn *>(dlsym(RTLD_NEXT, "pthread_create"));
  // Without the underlying implementation there is nothing to forward to;
  // fail loudly instead of calling through a null pointer.
  assert(real != nullptr);

  // Hold the lock across creation *and* registration. *tid may be visible to
  // other threads as soon as the real call writes it, so a concurrent
  // pthread_join on the new thread could otherwise reach the map before the
  // entry exists.
  pthread_mutex_lock(&mtx);
  const int rc = (*real)(tid, attr, start_routine, arg);
  if (rc == 0) {
    thr_map[*tid] = std::make_pair(reinterpret_cast<void *>(start_routine),
                                   arg);
  }
  pthread_mutex_unlock(&mtx);
  return rc;
}

extern "C"
int pthread_join(pthread_t tid, void **arg)
{
  static __decltype(pthread_join) *real
    = reinterpret_cast<__decltype(pthread_join) *>(dlsym(RTLD_NEXT,
                                                         "pthread_join"));
  int rc = (*real)(tid, arg);
  if (rc == 0) {
    pthread_mutex_lock(&mtx);
    const auto it = thr_map.find(tid);
    assert(it != thr_map.end());
    thr_map.erase(it);
    pthread_mutex_unlock(&mtx);
  }
  return rc;
}

构建它：g++ -g -fPIC -shared -o thr.so thr.cc -ldl -std=c++11并使用它：

LD_PRELOAD=./thr.so ./a.out &
[1] 37057

gdb -q -p 37057

Attaching to process 37057
Reading symbols from /tmp/a.out...done.
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/usr/lib64/libthread_db.so.1".
0x00007f95831a2f3d in nanosleep () at ../sysdeps/unix/syscall-template.S:81
81  ../sysdeps/unix/syscall-template.S: No such file or directory.

(gdb) set print pretty
(gdb) p thr_map
$1 = std::map with 2 elements = {
  [140280106567424] = {
    first = 0x40069d <worker(void*)>,
    second = 0x7
  },
  [140280114960128] = {
    first = 0x40069d <worker(void*)>,
    second = 0x4
  }
}

Voilà：现在你就知道哪些线程没有被 join、它们执行的是哪个例程，以及传给它们的参数是什么。

更新

我的应用程序是静态链接的

在这种情况下，可以使用链接器选项 --wrap=pthread_create 和 --wrap=pthread_join。相关文档见 GNU ld 手册中对 --wrap 选项的说明。

查找僵尸线程的来源

1 个答案: