Question

我的程序在等待子进程（gzip）完成并花费很长时间这样做时遇到了问题。

在它开始等待之前，它将输入流关闭到gzip，因此这应该会触发它很快终止。我检查了系统，gzip没有消耗任何CPU或等待IO（写入磁盘）。

非常奇怪的是它停止等待的时间......

程序我们在内部使用pthreads。它并排处理4个pthreads。每个线程处理许多工作单元，并且对于每个工作单元，它启动一个新的gzip进程（使用fork()和execve()）来编写结果。当gzip没有终止时，线程会挂起，但当其他线程关闭它们的实例时它会突然终止。

为清楚起见，我设置了一个管道：my program(pthread) --> gzip --> file.gz

我想这可以部分解释为CPU负载。但是当进程被分开几分钟并且由于这个锁定问题整个系统最终只使用4核4时，这似乎不太可能。

启动gzip的代码如下。调用execPipeProcess使得孩子直接写入文件，但是从我的程序中读取。那就是：

execPipeProcess(&process, "gzip", -1, gzFileFd)

有什么建议吗？

typedef struct {
    int processID;
    const char * command;
    int stdin;
    int stdout;
} ChildProcess;


void closeAndWait(ChildProcess * process) {
    if (process->stdin >= 0) {
                stdLog("Closing post process stdin");
                if (close(process->stdin)) {
                exitError(-1,errno, "Failed to close stdin for %s",  process->command);
                }
        }
    if (process->stdout >= 0) {
                stdLog("Closing post process stdin");
                if (close(process->stdout)) {
            exitError(-1,errno, "Failed to close stdout for %s", process->command);
                }
        }

    int status;
        stdLog("waiting on post process %d", process->processID);
    if (waitpid(process->processID, &status, 0) == -1) {
        exitError(-1, errno, "Could not wait for %s", process->command);
    }
        stdLog("post process finished");

    if (!WIFEXITED(status)) exitError(-1, 0, "Command did not exit properly %s", process->command);
    if (WEXITSTATUS(status)) exitError(-1, 0, "Command %s returned %d not 0", process->command, WEXITSTATUS(status));
    process->processID = 0;
}



void execPipeProcess(ChildProcess * process, const char* szCommand, int in, int out) {
    // Expand any args
    wordexp_t words;
    if (wordexp (szCommand, &words, 0)) exitError(-1, 0, "Could not expand command %s\n", szCommand);


    // Runs the command
    char nChar;
    int nResult;

    if (in < 0) {
        int aStdinPipe[2];
        if (pipe(aStdinPipe) < 0) {
            exitError(-1, errno, "allocating pipe for child input redirect failed");
        }
        process->stdin = aStdinPipe[PIPE_WRITE];
        in = aStdinPipe[PIPE_READ];
    }
    else {
        process->stdin = -1;
    }
    if (out < 0) {
        int aStdoutPipe[2];
        if (pipe(aStdoutPipe) < 0) {
            exitError(-1, errno, "allocating pipe for child input redirect failed");
        }
        process->stdout = aStdoutPipe[PIPE_READ];
        out = aStdoutPipe[PIPE_WRITE];
    }
    else {
        process->stdout = -1;
    }

    process->processID = fork();
    if (0 == process->processID) {
        // child continues here

        // these are for use by parent only
        if (process->stdin >= 0) close(process->stdin);
        if (process->stdout >= 0) close(process->stdout);

        // redirect stdin
        if (STDIN_FILENO != in) {
            if (dup2(in, STDIN_FILENO) == -1) {
              exitError(-1, errno, "redirecting stdin failed");
            }
            close(in);
        }

        // redirect stdout
        if (STDOUT_FILENO != out) {
            if (dup2(out, STDOUT_FILENO) == -1) {
              exitError(-1, errno, "redirecting stdout failed");
            }
            close(out);
        }

        // we're done with these; they've been duplicated to STDIN and STDOUT

        // run child process image
        // replace this with any exec* function find easier to use ("man exec")
        nResult = execvp(words.we_wordv[0], words.we_wordv);

        // if we get here at all, an error occurred, but we are in the child
        // process, so just exit
        exitError(-1, errno, "could not run %s", szCommand);
  } else if (process->processID > 0) {
        wordfree(&words);
        // parent continues here

        // close unused file descriptors, these are for child only
        close(in);
        close(out);
        process->command = szCommand;
    } else {
        exitError(-1,errno, "Failed to fork");
    }
}

Answer 1

子进程继承了打开的文件描述符。

每个后续的gzip子进程不仅继承用于与该特定实例通信的管道文件描述符，还继承用于连接到先前子进程实例的管道的文件描述符。

这意味着当主进程执行关闭时，stdin管道仍然打开，因为在一些子进程中有相同管道的其他文件描述符。一旦那些终止管道终于关闭。

快速解决方法是通过设置close-on-exec标志来防止子进程继承用于主进程的管道文件描述符。

由于涉及多个线程，因此应该序列化子进程以防止子进程继承用于另一个子进程的管道fds。

Answer 2

您没有给我们足够的信息以确定，因为答案取决于您如何使用所提供的功能。但是，您的closeAndWait()函数看起来有点可疑。可能有理由认为有问题的子进程在到达stdin的末尾时会退出，但应该对它编写的数据发生什么，甚至可能仍然写入其stdout }？您的子进程可能会挂起，因为它们的标准输出被阻止，并且它们识别它的速度很慢。

我认为这反映了一个设计问题。如果你正在捕捉儿童过程＆＃39;输出，因为你似乎至少支持做，然后在你关闭孩子输入流的父母结束后，你希望父母继续阅读孩子的输出到它的结束，并执行它打算对它进行的任何处理。否则你可能会失去一些（对于表现gzip的孩子而言，这意味着数据已损坏）。如果你将两个流关闭成为终止孩子的过程的一部分，你就不能这样做。

相反，您应该首先关闭孩子的stdin的父母的一端，继续处理其输出直到您到达目的地，然后才尝试收集孩子。如果您愿意，您可以关闭孩子收集该孩子的过程中父母的输出流的一部分。或者，如果你确实想要丢弃孩子的任何剩余输出，那么你应该在关闭输入和关闭输出之间排出其输出流。

为什么关闭管道需要这么长时间来终止子进程？

2 个答案: