I have the following code:
#!/usr/bin/env python
def do_job(row):
    # COMPUTING INTENSIVE OPERATION
    sleep(1)
    row.append(int(row[0])**2)
    # WRITING TO FILE - ATOMICITY ENSURED
    semaphore.acquire()
    print "Inside semaphore before writing to file: (%s,%s,%s)" % (row[0], row[1], row[2])
    csvWriter.writerow(row)
    print "Inside semaphore after writing to file"
    semaphore.release()
    # RETURNING VALUE
    return row
def parallel_csv_processing(inputFile, header=["Default", "header", "please", "change"], separator=",", skipRows = 0, cpuCount = 1):
    # OPEN FH FOR READING INPUT FILE
    inputFH = open(inputFile, "rb")
    csvReader = csv.reader(inputFH, delimiter=separator)
    # SKIP HEADERS
    for skip in xrange(skipRows):
        csvReader.next()
    # WRITE HEADER TO OUTPUT FILE
    csvWriter.writerow(header)
    # COMPUTING INTENSIVE OPERATIONS
    try:
        p = Pool(processes = cpuCount)
        # results = p.map(do_job, csvReader, chunksize = 10)
        results = p.map_async(do_job, csvReader, chunksize = 10)
    except KeyboardInterrupt:
        p.close()
        p.terminate()
        p.join()
    # WAIT FOR RESULTS
    # results.get()
    p.close()
    p.join()
    # CLOSE FH FOR READING INPUT
    inputFH.close()
if __name__ == '__main__':
    import csv
    from time import sleep
    from multiprocessing import Pool
    from multiprocessing import cpu_count
    from multiprocessing import current_process
    from multiprocessing import Semaphore
    from pprint import pprint as pp
    import calendar
    import time
    SCRIPT_START_TIME = calendar.timegm(time.gmtime())
    inputFile = "input.csv"
    outputFile = "output.csv"
    semaphore = Semaphore(1)
    # OPEN FH FOR WRITING OUTPUT FILE
    outputFH = open(outputFile, "wt")
    csvWriter = csv.writer(outputFH, lineterminator='\n')
    csvWriter.writerow(["before","calling","multiprocessing"])
    parallel_csv_processing(inputFile, cpuCount = cpu_count())
    csvWriter.writerow(["after","calling","multiprocessing"])
    # CLOSE FH FOR WRITING OUTPUT
    outputFH.close()
    SCRIPT_STOP_TIME = calendar.timegm(time.gmtime())
    SCRIPT_DURATION = SCRIPT_STOP_TIME - SCRIPT_START_TIME
    print "Script duration: %s seconds" % SCRIPT_DURATION
Running it in a terminal produces the following output:
Inside semaphore before writing to file: (0,0,0)
Inside semaphore after writing to file
Inside semaphore before writing to file: (1,3,1)
Inside semaphore after writing to file
Inside semaphore before writing to file: (2,6,4)
Inside semaphore after writing to file
Inside semaphore before writing to file: (3,9,9)
Inside semaphore after writing to file
Inside semaphore before writing to file: (4,12,16)
Inside semaphore after writing to file
Inside semaphore before writing to file: (5,15,25)
Inside semaphore after writing to file
Inside semaphore before writing to file: (6,18,36)
Inside semaphore after writing to file
Inside semaphore before writing to file: (7,21,49)
Inside semaphore after writing to file
Inside semaphore before writing to file: (8,24,64)
Inside semaphore after writing to file
Inside semaphore before writing to file: (9,27,81)
Inside semaphore after writing to file
Script duration: 10 seconds
The contents of input.csv are:
0,0
1,3
2,6
3,9
4,12
5,15
6,18
7,21
8,24
9,27
The output.csv that gets created contains:
before,calling,multiprocessing
Default,header,please,change
after,calling,multiprocessing
Why is nothing from parallel_csv_processing resp. do_job written to output.csv?
Answer 0 (score: 1)
Your processes are failing silently with an exception. Specifically, in the spawned processes the script has no value for csvWriter, because each of them runs in a separate Python interpreter and has not run main() - that is deliberate; you don't want the child processes running main. The do_job() function only has access to the values you explicitly pass to it in the map_async() call, and you cannot pass csvWriter. Even if you could, it is unlikely to work, because you can't be sure file handles are shared between main and the processes created by multiprocessing.
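To illustrate that point, here is a minimal, self-contained sketch (not your code; square_row and factor are hypothetical names) showing that a worker only sees what map()/map_async() hands it, so any extra data has to travel inside the iterable itself:

from multiprocessing import Pool

def square_row(args):
    # everything the worker needs arrives as its single argument
    row, factor = args
    return row + [int(row[0]) ** 2 * factor]

if __name__ == '__main__':
    rows = [["0", "0"], ["1", "3"], ["2", "6"]]
    factor = 1
    p = Pool(processes=2)
    # pack the extra value into each item instead of relying on a global
    results = p.map(square_row, [(row, factor) for row in rows])
    p.close()
    p.join()
    print results   # [['0', '0', 0], ['1', '3', 1], ['2', '6', 4]]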
Put a try/except around the code in do_job and you will see the exception:
def do_job(row):
    try:
        # COMPUTING INTENSIVE OPERATION
        sleep(1)
        row.append(int(row[0])**2)
        # WRITING TO FILE - ATOMICITY ENSURED
        semaphore.acquire()
        print "Inside semaphore before writing to file: (%s,%s,%s)" % (row[0], row[1], row[2])
        csvWriter.writerow(row)
        print "Inside semaphore after writing to file"
        semaphore.release()
        # RETURNING VALUE
        return row
    except:
        print "exception"
Obviously, in real code the exception should be handled properly, but if you run this you will now see an exception printed every time do_job is called.
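For instance, one slightly better pattern (just a sketch along the lines of the code above, not a fix for the underlying problem) is to print the full traceback and re-raise, so the worker's output shows what actually went wrong:

import traceback

def do_job(row):
    try:
        # COMPUTING INTENSIVE OPERATION
        sleep(1)
        row.append(int(row[0])**2)
        # WRITING TO FILE - ATOMICITY ENSURED
        semaphore.acquire()
        csvWriter.writerow(row)
        semaphore.release()
        return row
    except Exception:
        # print the full traceback instead of a bare "exception",
        # then re-raise so a later map_async(...).get() also reports it
        traceback.print_exc()
        raise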
See the multiprocessing documentation for more guidance, under the heading "16.6.1.4. Sharing state between processes" in the Python 2.7 standard library documentation.
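As one possible way out of the problem itself (a sketch based on the code in the question, not a drop-in replacement), let the workers only compute and return each row, and do all of the CSV writing in the parent process, where csvWriter actually exists:

import csv
from time import sleep
from multiprocessing import Pool, cpu_count

def do_job(row):
    # COMPUTING INTENSIVE OPERATION - no file access in the worker
    sleep(1)
    row.append(int(row[0])**2)
    return row

if __name__ == '__main__':
    with open("input.csv", "rb") as inputFH, open("output.csv", "wt") as outputFH:
        csvReader = csv.reader(inputFH, delimiter=",")
        csvWriter = csv.writer(outputFH, lineterminator='\n')
        csvWriter.writerow(["Default", "header", "please", "change"])
        p = Pool(processes=cpu_count())
        try:
            # map() blocks until every row is processed and returns the rows in order
            for result in p.map(do_job, csvReader, chunksize=10):
                csvWriter.writerow(result)  # only the parent process touches the file
        finally:
            p.close()
            p.join()

With that split there is also no need for the semaphore, because a single process owns the output file.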