I am running into a problem with Python multiprocessing.
I have a .csv file (~10 GB). I want to read the data from it, transform it, and then save it into a Python shelve file. The transformation is slow, so I am looking at multiprocessing.
I created two multiprocessing queues: the first holds the rows read from the CSV, and the second holds the data after transformation. Both queues work fine at first; the problem is that, after a few hours, the process that shelves data from the second queue just stops.
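To show what I mean by "stops": the writer exits through the timeout path of get(). Here is a tiny standalone demo (separate from my real code) of the behaviour I suspect is involved: get(True, timeout) raises queue.Empty when nothing arrives in time, and my bare except treats that the same as "no more data".

import multiprocessing as mp
import queue  # a timed-out get() raises queue.Empty (Queue.Empty on Python 2)

if __name__ == '__main__':
    q = mp.Manager().Queue()
    try:
        q.get(True, 2)  # block for at most 2 seconds on an empty queue
    except queue.Empty:
        print('get() timed out and raised queue.Empty')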
Has anyone run into the same problem? Thanks for any suggestions!
My code is here:
import logging
import multiprocessing as mp
import shelve
import time

logger = logging.getLogger(__name__)  # logging itself is configured elsewhere in my real script


def read_from_csv(csv_path, csv_queue):
    count = 0
    fh = open(csv_path, 'r')
    for line in fh:
        try:
            text, ID = line.split('|')
            text = text.split()
            ID = ID.strip()
            data = (ID, text)
            csv_queue.put(data, True)
            count += 1
            n = csv_queue.qsize()
            if n > 0 and n % 10000 == 0:
                logger.info('Queue size is {}'.format(n))
            # Let the function sleep for a while to keep the queue from growing too long
            if n >= 100000:
                logger.info('CSV queue sleeps for 100 sec')
                time.sleep(100)
            if count % 10000 == 0:
                logger.info('Read {} lines'.format(count))
        except:  # skip malformed rows (e.g. lines without exactly one '|')
            pass
    fh.close()
    logger.info('Done feeding rows into the Queue')


def entities_extraction(csv_queue, shelve_queue):
    count = 0
    while not csv_queue.empty():
        try:
            item = csv_queue.get(True, 300)
        except:  # also catches Empty from a timed-out get()
            logger.info('Not able to fetch data from queue. Quitting')
            break
        # some function here
        # result = ......
        shelve_queue.put(result, True)
        count += 1
        n = shelve_queue.qsize()
        if count % 10000 == 0:
            logger.info('Process {} lines'.format(count))
        # Let the function sleep for a while to keep the queue from growing too long
        if n > 100000:
            logger.info('Shelve queue sleeps for 100 sec')
            time.sleep(100)


def write_to_shelve(shelve_file, csv_queue, shelve_queue):
    sh = shelve.open(shelve_file, writeback=True)
    count = 0
    while True:
        if count % 10000 == 0:
            logger.info("Written {} records".format(count))
        n = shelve_queue.qsize()
        if n > 0 and n % 10000 == 0:
            logger.info('Shelve queue size is {}'.format(n))
        try:
            item = shelve_queue.get(True, 600)
            id, token = item
            sh[id] = token
            count += 1
        except:  # also catches Empty from a timed-out get()
            logger.info("Not able to fetch data from queue. Quitting")
            break
    sh.close()


if __name__ == "__main__":
    t0 = time.time()
    csv_path =
    shelve_path =
    csv_queue = mp.Manager().Queue()
    shelve_queue = mp.Manager().Queue()
    process_to_launch = 7
    pool = mp.Pool(process_to_launch)
    # apply_async so the reader, the extractors and the writer run concurrently
    # (a plain pool.apply would block on each call)
    pool.apply_async(read_from_csv, (csv_path, csv_queue,))
    jobs = [pool.apply_async(entities_extraction, (csv_queue, shelve_queue,)) for i in range(5)]
    result = pool.apply_async(write_to_shelve, (shelve_path, csv_queue, shelve_queue,))
    print(time.time() - t0)
    pool.close()
    pool.join()
Part of the log output:
[2017-03-24 20:42:54,562 - shelve_mp.py - ForkPoolWorker-10] - 83 - INFO - Written 2900000 records
[2017-03-24 20:42:54,779 - shelve_mp.py - ForkPoolWorker-3] - 43 - INFO - Read 3070000 lines
[2017-03-24 20:42:55,919 - shelve_mp.py - ForkPoolWorker-3] - 38 - INFO - Queue size is 90000
[2017-03-24 20:42:59,047 - shelve_mp.py - ForkPoolWorker-3] - 43 - INFO - Read 3080000 lines
[2017-03-24 20:43:02,846 - shelve_mp.py - ForkPoolWorker-3] - 38 - INFO - Queue size is 100000
[2017-03-24 20:43:02,847 - shelve_mp.py - ForkPoolWorker-3] - 40 - INFO - CSV queue sleeps for 100 sec
[2017-03-24 20:43:08,426 - shelve_mp.py - ForkPoolWorker-10] - 87 - INFO - Shelve queue size is 80000
### The writing function stopped here
0.04154396057128906
[2017-03-24 20:44:30,812 - shelve_mp.py - ForkPoolWorker-5] - 68 - INFO - Process 500000 lines
[2017-03-24 20:44:32,745 - shelve_mp.py - ForkPoolWorker-4] - 68 - INFO - Process 500000 lines
[2017-03-24 20:44:34,104 - shelve_mp.py - ForkPoolWorker-8] - 68 - INFO - Process 500000 lines
[2017-03-24 20:44:43,091 - shelve_mp.py - ForkPoolWorker-3] - 38 - INFO - Queue size is 80000
[2017-03-24 20:44:44,026 - shelve_mp.py - ForkPoolWorker-9] - 70 - INFO - Shelve queue sleeps for 100 sec
[2017-03-24 20:44:44,028 - shelve_mp.py - ForkPoolWorker-8] - 70 - INFO - Shelve queue sleeps for 100 sec
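For what it is worth, one pattern I have been considering instead of relying on get() timeouts is to shut the consumers down with sentinel values. This is only a minimal, untested sketch; SENTINEL, producer and consumer are placeholder names, not from my real script:

import multiprocessing as mp

SENTINEL = None  # placeholder end-of-stream marker

def producer(q, n_consumers):
    for i in range(10):
        q.put(i)
    for _ in range(n_consumers):
        q.put(SENTINEL)  # one sentinel per consumer

def consumer(q):
    while True:
        item = q.get()  # block without a timeout
        if item is SENTINEL:
            break  # clean shutdown instead of guessing from a timeout
        # ... transform item here ...

if __name__ == '__main__':
    q = mp.Manager().Queue()
    workers = [mp.Process(target=consumer, args=(q,)) for _ in range(2)]
    for w in workers:
        w.start()
    producer(q, 2)
    for w in workers:
        w.join()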