Pipelining with multiprocessing.Pipe and subprocesses

Time: 2016-05-04 20:34:14

Tags: python pipe subprocess multiprocessing

I am trying to port a bash pipeline into Python and use multiple processes to speed it up. So I have several subprocesses spawned, with multiprocessing.Pipe connections between them (sample code below).
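For reference, the whole thing is roughly equivalent to the bash one-liner "cat infile | tee >(wc -l 1>&2) | gzip -c | tee >(wc -l 1>&2) | tee infile.gz | sha512sum - > infile.gz.sha512", split into three stages connected by multiprocessing pipes. Every stage uses the same handshake: send_bytes() pushes chunks downstream, and a sentinel message marks end-of-stream. In isolation that pattern looks like this (a stripped-down sketch; the names here are illustrative, not from my real code):

import multiprocessing as MP

SENTINEL = b"EOF"  # sentinel message marking end-of-stream

def producer(conn):
    for chunk in (b"a" * 10, b"b" * 20):
        conn.send_bytes(chunk)       # each send is one discrete message
    conn.send_bytes(SENTINEL)
    conn.close()

def consumer(conn):
    while True:
        msg = conn.recv_bytes()      # blocks until one whole message arrives
        if msg == SENTINEL:
            break
        print("got %d bytes" % len(msg))
    conn.close()

if __name__ == "__main__":
    recv_end, send_end = MP.Pipe(False)  # duplex=False: recv_end reads, send_end writes
    c = MP.Process(target=consumer, args=(recv_end,))
    p = MP.Process(target=producer, args=(send_end,))
    c.start(); p.start()
    p.join(); c.join()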

However, the pipeline always stalls somewhere near the end, and the output file is missing its last chunk. It looks like a buffer deadlock or a flushing problem, but I can't figure it out. I believe I'm missing something obvious, so please point it out :) Any help or comments would be greatly appreciated. Thanks.
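For the "missing tail" part, my understanding of the EOF semantics I'm relying on: a subprocess reading its stdin only sees EOF once every copy of the write end has been closed, and gzip in particular won't emit its final block until then. A small sketch of what I mean (tiny input, so write-then-read can't deadlock here):

import subprocess as SP

proc = SP.Popen(["gzip", "-c"], stdin=SP.PIPE, stdout=SP.PIPE)
proc.stdin.write(b"some data\n")
proc.stdin.flush()
proc.stdin.close()               # gzip flushes its last block only on stdin EOF
out = proc.stdout.read()         # read to EOF; safe because the input is tiny
proc.stdout.close()
proc.wait()
print("%d compressed bytes" % len(out))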

#!/usr/bin/env python
from __future__ import print_function

import os
import sys
import multiprocessing as MP
import subprocess as SP

def usage():
    Usage = "Need /path/to/infile"
    log2err(Usage)
    sys.exit(1)

def log2err(*s):
    print(*s, file=sys.stderr)

def pipeWrite(proc, outpipe):
    # Output only: drain proc's stdout into the multiprocessing pipe,
    # then send the kill token to signal end-of-stream.
    global killToken
    while True:
        s = proc.stdout.read(bufferSize)
        if len(s) == 0:
            break  # EOF on the subprocess's stdout
        #log2err('Write %s: %s' % (os.getpid(), len(s)))
        outpipe.send_bytes(s)

    log2err("Write PID %s: sending kill" % os.getpid())
    outpipe.send_bytes(killToken)
    outpipe.close()

def pipeRead(proc, inpipe):
    # Input only: drain the multiprocessing pipe into proc's stdin
    # until the kill token arrives.
    global killToken
    while True:
        s = inpipe.recv_bytes(bufferSize)  # bufferSize caps the accepted message length
        #log2err('Read %s: %s' % (os.getpid(), len(s)))
        if s == killToken:
            log2err("Read PID %s: received kill" % os.getpid())
            break
        proc.stdin.write(s)
        proc.stdin.flush()
    # final cleanup: close stdin so the subprocess sees EOF
    proc.stdin.flush()
    proc.stdin.close()
    inpipe.close()

def testRead(infile, outpipe):
    # First stage: stream the file out, with a line count on stderr as a progress check.
    stmt = "cat %s | tee >(wc -l 1>&2)" % infile
    proc = SP.Popen(stmt, shell=True, stdout=SP.PIPE, executable='/bin/bash', bufsize=bufferSize)
    pipeWrite(proc, outpipe)
    proc.stdout.close()
    outpipe.close()
    log2err('testRead is DONE')

def testGzip(inpipe, outpipe):
    # Middle stage: compress, with a line count on stderr as a progress check.
    stmt = "gzip -c - | tee >(wc -l 1>&2)"
    proc = SP.Popen(stmt, shell=True, stdout=SP.PIPE, stdin=SP.PIPE, executable='/bin/bash', bufsize=bufferSize)
    PR = MP.Process(target=pipeRead, args=(proc, inpipe))    # upstream pipe -> gzip stdin
    PW = MP.Process(target=pipeWrite, args=(proc, outpipe))  # gzip stdout -> downstream pipe
    PW.start()
    PR.start()
    PR.join()
    proc.stdin.flush()
    proc.stdin.close()
    proc.stdout.flush()
    proc.stdout.close()
    log2err("testGzip PID:%s with Read:%s and Write:%s" % (os.getpid(), PR.pid, PW.pid))
    PW.join()
    log2err('testGzip is DONE')

def testOutput(infile, inpipe):
    # Final stage: write the .gz to disk and its sha512 checksum alongside it.
    stmt = "tee %s.gz | sha512sum - > %s.gz.sha512" % (infile, infile)
    proc = SP.Popen(stmt, shell=True, stdin=SP.PIPE, executable='/bin/bash', bufsize=bufferSize)
    pipeRead(proc, inpipe)
    inpipe.close()
    log2err('testOutput is DONE')

if __name__ == "__main__":
    try:
        infile = sys.argv[1]
        if infile in ('-h', '--help'):
            usage()
    except IndexError:
        usage()

    # Globals used by the worker functions; the child processes inherit them via fork.
    bufferSize = 256*256  # 64 KiB chunks
    killToken = b"I am done with all of these processes"  # end-of-stream sentinel

    # ingest stage (a stand-in for the real pipeline's curl download and checksum stream)
    ingestRecv, ingestSend = MP.Pipe(False)
    ingestProc = MP.Process(target=testRead, args=(infile, ingestSend))
    # worker stage (gzip here; the real pipeline encrypts)
    workRecv, workSend = MP.Pipe(False)
    workProc = MP.Process(target=testGzip, args=(ingestRecv, workSend))
    # output to file and sha checksum stream
    outputProc = MP.Process(target=testOutput, args=(infile, workRecv))

    ingestProc.start()
    log2err("ingestProc PID:%s" % ingestProc.pid)

    workProc.start()
    log2err("workProc PID:%s" % workProc.pid)

    outputProc.start()
    log2err("outputProc PID:%s" % outputProc.pid)

    ingestProc.join()
    log2err("ingestProc: joined")

    workProc.join()
    log2err("workProc: joined")

    outputProc.join()
    log2err("outputProc: joined")

0 Answers:

No answers